Commit
Bump versions again to get LangChain extractors
Showing 15 changed files with 80 additions and 572 deletions.
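The pattern repeated across the extractor files below is a vendored-code removal: each local implementation is replaced by a re-export from langchain_community, so existing ragstack import paths keep resolving to the upstream classes. A minimal sketch of what that means for callers (hypothetical check, not part of the commit; assumes compatible versions of both packages are installed):

    # After this change, both import paths should resolve to the same class object.
    from langchain_community.graph_vectorstores.extractors import KeybertLinkExtractor
    from ragstack_langchain.graph_store.extractors.keybert_link_extractor import (
        KeybertLinkExtractor as RagstackKeybertLinkExtractor,
    )

    assert KeybertLinkExtractor is RagstackKeybertLinkExtractor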
@@ -2,7 +2,6 @@
     CassandraGraphVectorStore as CassandraGraphStore,
 )
 
-
 __all__ = [
     "CassandraGraphStore",
 ]
libs/langchain/ragstack_langchain/graph_store/extractors/gliner_link_extractor.py
66 changes: 9 additions & 57 deletions
@@ -1,57 +1,9 @@
-from typing import Any, Dict, Iterable, List, Optional, Set
-
-from langchain_core.graph_vectorstores import Link
-
-from ragstack_langchain.graph_store.extractors.link_extractor import LinkExtractor
-
-# TypeAlias is not available in Python 3.9, we can't use that or the newer `type`.
-GLiNERInput = str
-
-
-class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]):
-    def __init__(
-        self,
-        labels: List[str],
-        *,
-        kind: str = "entity",
-        model: str = "urchade/gliner_mediumv2.1",
-        extract_kwargs: Optional[Dict[str, Any]] = None,
-    ):
-        """Extract keywords using GLiNER.
-        Args:
-            kind: Kind of links to produce with this extractor.
-            labels: List of kinds of entities to extract.
-            model: GLiNER model to use.
-            extract_kwargs: Keyword arguments to pass to GLiNER.
-        """
-        try:
-            from gliner import GLiNER
-
-            self._model = GLiNER.from_pretrained(model)
-
-        except ImportError:
-            raise ImportError(
-                "gliner is required for GLiNERLinkExtractor. "
-                "Please install it with `pip install gliner`."
-            ) from None
-
-        self._labels = labels
-        self._kind = kind
-        self._extract_kwargs = extract_kwargs or {}
-
-    def extract_one(self, input: GLiNERInput) -> Set[Link]:  # noqa: A002
-        return next(self.extract_many([input]))
-
-    def extract_many(
-        self,
-        inputs: Iterable[GLiNERInput],
-    ) -> Iterable[Set[Link]]:
-        strs = [i if isinstance(i, str) else i.page_content for i in inputs]
-        for entities in self._model.batch_predict_entities(
-            strs, self._labels, **self._extract_kwargs
-        ):
-            yield {
-                Link.bidir(kind=f"{self._kind}:{e['label']}", tag=e["text"])
-                for e in entities
-            }
+from langchain_community.graph_vectorstores.extractors import (
+    GLiNERInput,
+    GLiNERLinkExtractor,
+)
+
+__all__ = [
+    "GLiNERInput",
+    "GLiNERLinkExtractor",
+]
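Usage is unchanged apart from the import path. A minimal sketch, assuming the re-exported class keeps the constructor shown in the removed code; requires `pip install gliner`, and the model weights download on first use:

    from langchain_community.graph_vectorstores.extractors import GLiNERLinkExtractor

    # The labels list names the entity kinds to detect; these values are illustrative.
    extractor = GLiNERLinkExtractor(labels=["person", "organization"])
    links = extractor.extract_one("Marie Curie worked at the University of Paris.")
    # Each detected entity becomes a bidirectional Link, e.g.
    # kind="entity:person", tag="Marie Curie".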
libs/langchain/ragstack_langchain/graph_store/extractors/hierarchy_link_extractor.py
70 changes: 9 additions & 61 deletions
@@ -1,61 +1,9 @@
-from typing import Callable, List, Set
-
-from langchain_core.documents import Document
-from langchain_core.graph_vectorstores import Link
-
-from .link_extractor import LinkExtractor
-from .link_extractor_adapter import LinkExtractorAdapter
-
-# TypeAlias is not available in Python 3.9, we can't use that or the newer `type`.
-HierarchyInput = List[str]
-
-
-class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]):
-    def __init__(
-        self,
-        kind: str = "hierarchy",
-        up_links: bool = True,
-        down_links: bool = False,
-        sibling_links: bool = False,
-    ):
-        """Extract links from a document hierarchy.
-        Args:
-            kind: Kind of links to produce with this extractor.
-            up_links: Link from a section to its parent.
-            down_links: Link from a section to its children.
-            sibling_links: Link from a section to other sections with the same parent.
-        """
-        self._kind = kind
-        self._up_links = up_links
-        self._down_links = down_links
-        self._sibling_links = sibling_links
-
-    def as_document_extractor(
-        self, hierarchy: Callable[[Document], HierarchyInput]
-    ) -> LinkExtractor[Document]:
-        return LinkExtractorAdapter(underlying=self, transform=hierarchy)
-
-    def extract_one(
-        self,
-        input: HierarchyInput,  # noqa: A002
-    ) -> Set[Link]:
-        this_path = "/".join(input)
-        parent_path = None
-
-        links = set()
-        if self._up_links:
-            links.add(Link.incoming(kind=self._kind, tag=f"up:{this_path}"))
-        if self._down_links:
-            links.add(Link.outgoing(kind=self._kind, tag=f"down:{this_path}"))
-
-        if len(input) >= 1:
-            parent_path = "/".join(input[0:-1])
-            if self._up_links and len(input) > 1:
-                links.add(Link.outgoing(kind=self._kind, tag=f"up:{parent_path}"))
-            if self._down_links and len(input) > 1:
-                links.add(Link.incoming(kind=self._kind, tag=f"down:{parent_path}"))
-            if self._sibling_links:
-                links.add(Link.bidir(kind=self._kind, tag=f"sib:{parent_path}"))
-
-        return links
+from langchain_community.graph_vectorstores.extractors import (
+    HierarchyInput,
+    HierarchyLinkExtractor,
+)
+
+__all__ = [
+    "HierarchyInput",
+    "HierarchyLinkExtractor",
+]
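A minimal sketch of the hierarchy extractor via its new home, assuming the semantics shown in the removed code, where the input is the path of section titles from the root down to the current section:

    from langchain_community.graph_vectorstores.extractors import HierarchyLinkExtractor

    extractor = HierarchyLinkExtractor()  # up_links=True by default
    links = extractor.extract_one(["root", "guide", "install"])
    # Yields an incoming link tagged "up:root/guide/install" and an outgoing
    # link tagged "up:root/guide", so each section points at its parent.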
libs/langchain/ragstack_langchain/graph_store/extractors/html_link_extractor.py
128 changes: 9 additions & 119 deletions
@@ -1,119 +1,9 @@
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Set, Union
-from urllib.parse import urldefrag, urljoin, urlparse
-
-from langchain_core.documents import Document
-from langchain_core.graph_vectorstores import Link
-
-from .link_extractor import LinkExtractor
-from .link_extractor_adapter import LinkExtractorAdapter
-
-if TYPE_CHECKING:
-    from bs4 import BeautifulSoup
-
-
-def _parse_url(link, page_url, drop_fragments: bool = True):
-    href = link.get("href")
-    if href is None:
-        return None
-    url = urlparse(href)
-    if url.scheme not in ["http", "https", ""]:
-        return None
-
-    # Join the HREF with the page_url to convert relative paths to absolute.
-    url = urljoin(page_url, href)
-
-    # Fragments would be useful if we chunked a page based on section.
-    # Then, each chunk would have a different URL based on the fragment.
-    # Since we aren't doing that yet, they just "break" links. So, drop
-    # the fragment.
-    if drop_fragments:
-        return urldefrag(url).url
-    return url
-
-
-def _parse_hrefs(
-    soup: "BeautifulSoup", url: str, drop_fragments: bool = True
-) -> Set[str]:
-    links = soup.find_all("a")
-    links = {
-        _parse_url(link, page_url=url, drop_fragments=drop_fragments) for link in links
-    }
-
-    # Remove entries for any 'a' tag that failed to parse (didn't have href,
-    # or invalid domain, etc.)
-    links.discard(None)
-
-    # Remove self links.
-    links.discard(url)
-
-    return links
-
-
-@dataclass
-class HtmlInput:
-    content: Union[str, "BeautifulSoup"]
-    base_url: str
-
-
-class HtmlLinkExtractor(LinkExtractor[HtmlInput]):
-    def __init__(self, *, kind: str = "hyperlink", drop_fragments: bool = True):
-        """Extract hyperlinks from HTML content.
-        Expects the input to be an HTML string or a `BeautifulSoup` object.
-        Args:
-            kind: The kind of edge to extract. Defaults to "hyperlink".
-            drop_fragments: Whether fragments in URLs and links should be
-                dropped. Defaults to `True`.
-        """
-        try:
-            import bs4  # noqa:F401
-        except ImportError as e:
-            raise ImportError(
-                "BeautifulSoup4 is required for HtmlLinkExtractor. "
-                "Please install it with `pip install beautifulsoup4`."
-            ) from e
-
-        self._kind = kind
-        self.drop_fragments = drop_fragments
-
-    def as_document_extractor(
-        self, url_metadata_key: str = "source"
-    ) -> LinkExtractor[Document]:
-        """Return a LinkExtractor that applies to documents.
-        NOTE: Since the HtmlLinkExtractor parses HTML, if you use it with other
-        similar link extractors it may be more efficient to call the link
-        extractors directly on the parsed BeautifulSoup object.
-        Args:
-            url_metadata_key: The name of the field in document metadata with
-                the URL of the document.
-        """
-        return LinkExtractorAdapter(
-            underlying=self,
-            transform=lambda doc: HtmlInput(
-                doc.page_content, doc.metadata[url_metadata_key]
-            ),
-        )
-
-    def extract_one(
-        self,
-        input: HtmlInput,  # noqa: A002
-    ) -> Set[Link]:
-        content = input.content
-        if isinstance(content, str):
-            from bs4 import BeautifulSoup
-
-            content = BeautifulSoup(content, "html.parser")
-
-        base_url = input.base_url
-        if self.drop_fragments:
-            base_url = urldefrag(base_url).url
-
-        hrefs = _parse_hrefs(content, base_url, self.drop_fragments)
-
-        links = {Link.outgoing(kind=self._kind, tag=url) for url in hrefs}
-        links.add(Link.incoming(kind=self._kind, tag=base_url))
-        return links
+from langchain_community.graph_vectorstores.extractors import (
+    HtmlInput,
+    HtmlLinkExtractor,
+)
+
+__all__ = [
+    "HtmlInput",
+    "HtmlLinkExtractor",
+]
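A minimal sketch, assuming the re-export preserves HtmlInput and the behavior shown in the removed code; requires `pip install beautifulsoup4`:

    from langchain_community.graph_vectorstores.extractors import (
        HtmlInput,
        HtmlLinkExtractor,
    )

    html = '<a href="/docs">Docs</a> <a href="#top">Top</a>'
    extractor = HtmlLinkExtractor()  # drop_fragments=True by default
    links = extractor.extract_one(HtmlInput(html, "https://example.com/"))
    # Yields an outgoing link tagged "https://example.com/docs"; the "#top"
    # fragment resolves to the page itself and is discarded as a self link.
    # An incoming link tagged "https://example.com/" represents the page.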
libs/langchain/ragstack_langchain/graph_store/extractors/keybert_link_extractor.py
72 changes: 9 additions & 63 deletions
@@ -1,63 +1,9 @@
-from typing import Any, Dict, Iterable, Optional, Set, Union
-
-from langchain_core.documents import Document
-from langchain_core.graph_vectorstores import Link
-
-from ragstack_langchain.graph_store.extractors.link_extractor import LinkExtractor
-
-# TypeAlias is not available in Python 3.9, we can't use that or the newer `type`.
-KeybertInput = Union[str, Document]
-
-
-class KeybertLinkExtractor(LinkExtractor[KeybertInput]):
-    def __init__(
-        self,
-        *,
-        kind: str = "kw",
-        embedding_model: str = "all-MiniLM-L6-v2",
-        extract_keywords_kwargs: Optional[Dict[str, Any]] = None,
-    ):
-        """Extract keywords using KeyBERT.
-        Args:
-            kind: Kind of links to produce with this extractor.
-            embedding_model: Name of the embedding model to use with KeyBERT.
-            extract_keywords_kwargs: Keyword arguments to pass to KeyBERT's
-                `extract_keywords` method.
-        """
-        try:
-            import keybert
-
-            self._kw_model = keybert.KeyBERT(model=embedding_model)
-        except ImportError:
-            raise ImportError(
-                "keybert is required for KeybertLinkExtractor. "
-                "Please install it with `pip install keybert`."
-            ) from None
-
-        self._kind = kind
-        self._extract_keywords_kwargs = extract_keywords_kwargs or {}
-
-    def extract_one(self, input: KeybertInput) -> Set[Link]:  # noqa: A002
-        keywords = self._kw_model.extract_keywords(
-            input if isinstance(input, str) else input.page_content,
-            **self._extract_keywords_kwargs,
-        )
-        return {Link.bidir(kind=self._kind, tag=kw[0]) for kw in keywords}
-
-    def extract_many(
-        self,
-        inputs: Iterable[KeybertInput],
-    ) -> Iterable[Set[Link]]:
-        if len(inputs) == 1:
-            # Even though we pass a list, if it contains one item, KeyBERT will
-            # flatten it. This means it's easier to just call the special case
-            # for one item.
-            yield self.extract_one(inputs[0])
-        elif len(inputs) > 1:
-            strs = [i if isinstance(i, str) else i.page_content for i in inputs]
-            extracted = self._kw_model.extract_keywords(
-                strs, **self._extract_keywords_kwargs
-            )
-            for keywords in extracted:
-                yield {Link.bidir(kind=self._kind, tag=kw[0]) for kw in keywords}
+from langchain_community.graph_vectorstores.extractors import (
+    KeybertInput,
+    KeybertLinkExtractor,
+)
+
+__all__ = [
+    "KeybertInput",
+    "KeybertLinkExtractor",
+]
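A minimal sketch via the new import path, assuming the constructor shown in the removed code; requires `pip install keybert`, which loads the default sentence-transformers model:

    from langchain_community.graph_vectorstores.extractors import KeybertLinkExtractor

    extractor = KeybertLinkExtractor()  # loads "all-MiniLM-L6-v2"
    links = extractor.extract_one(
        "Graph vector stores connect chunks that share extracted keywords."
    )
    # Produces one bidirectional Link of kind "kw" per keyword KeyBERT returns.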