Skip to content

Commit

Permalink
Bump versions again to get LangChain extractors
Browse files Browse the repository at this point in the history
  • Loading branch information
cbornet committed Jul 22, 2024
1 parent 7fc176c commit 5c3daee
Show file tree
Hide file tree
Showing 15 changed files with 80 additions and 572 deletions.
6 changes: 3 additions & 3 deletions libs/e2e-tests/pyproject.llamaindex.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@ llama-index-multi-modal-llms-gemini = { git = "https://github.com/run-llama/llam

llama-parse = { git = "https://github.com/run-llama/llama_parse.git", branch = "main" }

langchain = "0.2.7"
langchain-core = "0.2.12"
langchain-community = "0.2.7"
langchain = "0.2.10"
langchain-core = "0.2.22"
langchain-community = "0.2.9"
langchain-astradb = "0.3.3"
langchain-openai = "0.1.8"
langchain-google-genai = { version = "1.0.6" }
Expand Down
6 changes: 3 additions & 3 deletions libs/langchain/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ ragstack-ai-colbert = { version = "1.0.5", optional = true }
ragstack-ai-knowledge-store = { version = "0.1.0", optional = true }

# langchain
langchain = "0.2.7"
langchain-core = "0.2.12"
langchain-community = "0.2.7"
langchain = "0.2.10"
langchain-core = "0.2.22"
langchain-community = "0.2.9"
langchain-astradb = "0.3.3"
langchain-openai = "0.1.8"
langchain-google-genai = { version = "1.0.6", optional = true }
Expand Down
1 change: 0 additions & 1 deletion libs/langchain/ragstack_langchain/graph_store/cassandra.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
CassandraGraphVectorStore as CassandraGraphStore,
)


__all__ = [
"CassandraGraphStore",
]
Original file line number Diff line number Diff line change
@@ -1,57 +1,9 @@
from typing import Any, Dict, Iterable, List, Optional, Set

from langchain_core.graph_vectorstores import Link

from ragstack_langchain.graph_store.extractors.link_extractor import LinkExtractor

# TypeAlias is not available in Python 3.9, so we can't use it or the newer `type` statement.
GLiNERInput = str


class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]):
    """Link extractor that turns GLiNER entity predictions into links."""

    def __init__(
        self,
        labels: List[str],
        *,
        kind: str = "entity",
        model: str = "urchade/gliner_mediumv2.1",
        extract_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Load the GLiNER model and remember the extraction settings.

        Args:
            labels: List of kinds of entities to extract.
            kind: Kind of links to produce with this extractor. The entity
                label is appended to it, separated by a colon.
            model: GLiNER model to use.
            extract_kwargs: Keyword arguments to pass to GLiNER's
                `batch_predict_entities`.

        Raises:
            ImportError: If an ImportError occurs while importing `gliner`
                or loading the model.
        """
        try:
            from gliner import GLiNER

            self._model = GLiNER.from_pretrained(model)

        except ImportError:
            raise ImportError(
                "gliner is required for GLiNERLinkExtractor. "
                "Please install it with `pip install gliner`."
            ) from None

        self._labels = labels
        self._kind = kind
        self._extract_kwargs = extract_kwargs if extract_kwargs else {}

    def extract_one(self, input: GLiNERInput) -> Set[Link]:  # noqa: A002
        """Extract the links for a single input via the batch code path."""
        batch_results = self.extract_many([input])
        return next(batch_results)

    def extract_many(
        self,
        inputs: Iterable[GLiNERInput],
    ) -> Iterable[Set[Link]]:
        """Yield one set of bidirectional links per input, in input order."""
        # Non-string inputs are read through their `page_content` attribute.
        texts = []
        for item in inputs:
            texts.append(item if isinstance(item, str) else item.page_content)

        predictions = self._model.batch_predict_entities(
            texts, self._labels, **self._extract_kwargs
        )
        for found in predictions:
            links = set()
            for entity in found:
                links.add(
                    Link.bidir(
                        kind=f"{self._kind}:{entity['label']}",
                        tag=entity["text"],
                    )
                )
            yield links
from langchain_community.graph_vectorstores.extractors import (
GLiNERInput,
GLiNERLinkExtractor,
)

__all__ = [
"GLiNERInput",
"GLiNERLinkExtractor",
]
Original file line number Diff line number Diff line change
@@ -1,61 +1,9 @@
from typing import Callable, List, Set

from langchain_core.documents import Document
from langchain_core.graph_vectorstores import Link

from .link_extractor import LinkExtractor
from .link_extractor_adapter import LinkExtractorAdapter

# TypeAlias is not available in Python 3.9, so we can't use it or the newer `type` statement.
HierarchyInput = List[str]


class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]):
    """Link extractor that derives links from a section's hierarchy path."""

    def __init__(
        self,
        kind: str = "hierarchy",
        up_links: bool = True,
        down_links: bool = False,
        sibling_links: bool = False,
    ):
        """Extract links from a document hierarchy.

        Args:
            kind: Kind of links to produce with this extractor.
            up_links: Link from a section to its parent.
            down_links: Link from a section to its children.
            sibling_links: Link from a section to other sections with the
                same parent.
        """
        self._kind = kind
        self._up_links = up_links
        self._down_links = down_links
        self._sibling_links = sibling_links

    def as_document_extractor(
        self, hierarchy: Callable[[Document], HierarchyInput]
    ) -> LinkExtractor[Document]:
        """Wrap this extractor so it accepts documents.

        Args:
            hierarchy: Function mapping a document to its hierarchy path.
        """
        return LinkExtractorAdapter(underlying=self, transform=hierarchy)

    def extract_one(
        self,
        input: HierarchyInput,  # noqa: A002
    ) -> Set[Link]:
        """Build the links for one hierarchy path (a list of path segments)."""
        kind = self._kind
        own_path = "/".join(input)

        found = set()
        # Tags keyed on this node's own path.
        if self._up_links:
            found.add(Link.incoming(kind=kind, tag=f"up:{own_path}"))
        if self._down_links:
            found.add(Link.outgoing(kind=kind, tag=f"down:{own_path}"))

        # An empty path has no parent path, so nothing more to add.
        if len(input) < 1:
            return found

        # Tags keyed on the parent path. Up/down links are only emitted when
        # the path has more than one segment; sibling links are emitted for
        # any non-empty path.
        parent = "/".join(input[0:-1])
        has_parent = len(input) > 1
        if has_parent and self._up_links:
            found.add(Link.outgoing(kind=kind, tag=f"up:{parent}"))
        if has_parent and self._down_links:
            found.add(Link.incoming(kind=kind, tag=f"down:{parent}"))
        if self._sibling_links:
            found.add(Link.bidir(kind=kind, tag=f"sib:{parent}"))

        return found
from langchain_community.graph_vectorstores.extractors import (
HierarchyInput,
HierarchyLinkExtractor,
)

__all__ = [
"HierarchyInput",
"HierarchyLinkExtractor",
]
Original file line number Diff line number Diff line change
@@ -1,119 +1,9 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING, Set, Union
from urllib.parse import urldefrag, urljoin, urlparse

from langchain_core.documents import Document
from langchain_core.graph_vectorstores import Link

from .link_extractor import LinkExtractor
from .link_extractor_adapter import LinkExtractorAdapter

if TYPE_CHECKING:
from bs4 import BeautifulSoup


def _parse_url(link, page_url, drop_fragments: bool = True):
href = link.get("href")
if href is None:
return None
url = urlparse(href)
if url.scheme not in ["http", "https", ""]:
return None

# Join the HREF with the page_url to convert relative paths to absolute.
url = urljoin(page_url, href)

# Fragments would be useful if we chunked a page based on section.
# Then, each chunk would have a different URL based on the fragment.
# Since we aren't doing that yet, they just "break" links. So, drop
# the fragment.
if drop_fragments:
return urldefrag(url).url
return url


def _parse_hrefs(
    soup: "BeautifulSoup", url: str, drop_fragments: bool = True
) -> Set[str]:
    """Collect the distinct link targets of all `a` tags in a parsed page.

    Args:
        soup: Parsed HTML to search for `a` tags.
        url: URL of the page; used to resolve relative hrefs and to filter
            out self links.
        drop_fragments: Passed through to `_parse_url`.

    Returns:
        The set of resolved URLs, excluding anchors that failed to parse
        and links pointing back to `url` itself.
    """
    targets = set()
    for anchor in soup.find_all("a"):
        target = _parse_url(anchor, page_url=url, drop_fragments=drop_fragments)
        # Skip 'a' tags that failed to parse (no href, unsupported scheme,
        # etc.) as well as self links.
        if target is None or target == url:
            continue
        targets.add(target)
    return targets


@dataclass
class HtmlInput:
    """Input for `HtmlLinkExtractor`: a page's HTML together with its URL."""

    # HTML to scan, either as raw markup or as an already-parsed
    # BeautifulSoup object.
    content: Union[str, "BeautifulSoup"]
    # URL of the page; relative hrefs found in `content` are resolved
    # against it.
    base_url: str


class HtmlLinkExtractor(LinkExtractor[HtmlInput]):
    """Link extractor that produces links from the hyperlinks in HTML."""

    def __init__(self, *, kind: str = "hyperlink", drop_fragments: bool = True):
        """Extract hyperlinks from HTML content.

        Expects the input to be an HTML string or a `BeautifulSoup` object.

        Args:
            kind: The kind of edge to extract. Defaults to "hyperlink".
            drop_fragments: Whether fragments in URLs and links should be
                dropped. Defaults to `True`.

        Raises:
            ImportError: If `bs4` cannot be imported.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as e:
            raise ImportError(
                "BeautifulSoup4 is required for HtmlLinkExtractor. "
                "Please install it with `pip install beautifulsoup4`."
            ) from e

        self._kind = kind
        self.drop_fragments = drop_fragments

    def as_document_extractor(
        self, url_metadata_key: str = "source"
    ) -> LinkExtractor[Document]:
        """Return a LinkExtractor that applies to documents.

        NOTE: Since the HtmlLinkExtractor parses HTML, if you use it with
        other similar link extractors it may be more efficient to call the
        link extractors directly on the parsed BeautifulSoup object.

        Args:
            url_metadata_key: The name of the field in document metadata
                with the URL of the document.
        """

        def _to_html_input(doc):
            # The page content is the HTML; the URL comes from the metadata.
            return HtmlInput(doc.page_content, doc.metadata[url_metadata_key])

        return LinkExtractorAdapter(underlying=self, transform=_to_html_input)

    def extract_one(
        self,
        input: HtmlInput,  # noqa: A002
    ) -> Set[Link]:
        """Return outgoing links for each href plus one incoming link for the page."""
        soup = input.content
        if isinstance(soup, str):
            # Raw markup: parse it here. Pre-parsed input is used as is.
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(soup, "html.parser")

        page_url = input.base_url
        if self.drop_fragments:
            page_url = urldefrag(page_url).url

        outgoing = {
            Link.outgoing(kind=self._kind, tag=href)
            for href in _parse_hrefs(soup, page_url, self.drop_fragments)
        }
        # The page itself is the target of links tagged with its own URL.
        incoming = {Link.incoming(kind=self._kind, tag=page_url)}
        return outgoing | incoming
from langchain_community.graph_vectorstores.extractors import (
HtmlInput,
HtmlLinkExtractor,
)

__all__ = [
"HtmlInput",
"HtmlLinkExtractor",
]
Original file line number Diff line number Diff line change
@@ -1,63 +1,9 @@
from typing import Any, Dict, Iterable, Optional, Set, Union

from langchain_core.documents import Document
from langchain_core.graph_vectorstores import Link

from ragstack_langchain.graph_store.extractors.link_extractor import LinkExtractor

# TypeAlias is not available in Python 3.9, so we can't use it or the newer `type` statement.
KeybertInput = Union[str, Document]


class KeybertLinkExtractor(LinkExtractor[KeybertInput]):
    """Link extractor that turns Keybert keywords into bidirectional links."""

    def __init__(
        self,
        *,
        kind: str = "kw",
        embedding_model: str = "all-MiniLM-L6-v2",
        extract_keywords_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Extract keywords using Keybert.

        Args:
            kind: Kind of links to produce with this extractor.
            embedding_model: Name of the embedding model to use with Keybert.
            extract_keywords_kwargs: Keyword arguments to pass to Keybert's
                `extract_keywords` method.

        Raises:
            ImportError: If an ImportError occurs while importing `keybert`
                or constructing the model.
        """
        try:
            import keybert

            self._kw_model = keybert.KeyBERT(model=embedding_model)
        except ImportError:
            raise ImportError(
                "keybert is required for KeybertLinkExtractor. "
                "Please install it with `pip install keybert`."
            ) from None

        self._kind = kind
        self._extract_keywords_kwargs = extract_keywords_kwargs or {}

    def extract_one(self, input: KeybertInput) -> Set[Link]:  # noqa: A002
        """Extract keyword links from one string or document."""
        keywords = self._kw_model.extract_keywords(
            input if isinstance(input, str) else input.page_content,
            **self._extract_keywords_kwargs,
        )
        # Each keyword entry is indexed at 0 to obtain the keyword text.
        return {Link.bidir(kind=self._kind, tag=kw[0]) for kw in keywords}

    def extract_many(
        self,
        inputs: Iterable[KeybertInput],
    ) -> Iterable[Set[Link]]:
        """Yield one set of keyword links per input, in input order.

        Any iterable is accepted, including generators; an empty iterable
        yields nothing.
        """
        # The signature promises only an Iterable, which need not support
        # len() or indexing, so materialize it once before inspecting it.
        items = list(inputs)
        if len(items) == 1:
            # Even though we pass a list, if it contains one item, keybert will
            # flatten it. This means it's easier to just call the special case
            # for one item.
            yield self.extract_one(items[0])
        elif len(items) > 1:
            strs = [i if isinstance(i, str) else i.page_content for i in items]
            extracted = self._kw_model.extract_keywords(
                strs, **self._extract_keywords_kwargs
            )
            for keywords in extracted:
                yield {Link.bidir(kind=self._kind, tag=kw[0]) for kw in keywords}
from langchain_community.graph_vectorstores.extractors import (
KeybertInput,
KeybertLinkExtractor,
)

__all__ = [
"KeybertInput",
"KeybertLinkExtractor",
]
Loading

0 comments on commit 5c3daee

Please sign in to comment.