Fix remarques
pprados committed Jan 10, 2025
1 parent 20f5a41 commit 6765dbf
Showing 10 changed files with 213 additions and 152 deletions.
1 change: 1 addition & 0 deletions libs/community/extended_testing_deps.txt
@@ -66,6 +66,7 @@ praw>=7.7.1,<8
premai>=0.3.25,<0.4
psychicapi>=0.8.0,<0.9
pydantic>=2.7.4,<3
+pytesseract>=0.3.13
py-trello>=0.19.0,<0.20
pyjwt>=2.8.0,<3
pymupdf>=1.22.3,<2
@@ -17,6 +17,11 @@
from langchain_community.document_loaders.parsers.html import (
    BS4HTMLParser,
)
+from langchain_community.document_loaders.parsers.images import (
+    MultimodalBlobParser,
+    RapidOCRBlobParser,
+    TesseractBlobParser,
+)
from langchain_community.document_loaders.parsers.language import (
    LanguageParser,
)
@@ -30,11 +35,6 @@
from langchain_community.document_loaders.parsers.vsdx import (
    VsdxParser,
)
-from langchain_community.document_loaders.parsers.images import (
-    MultimodalBlobParser,
-    RapidOCRBlobParser,
-    TesseractBlobParser,
-)


_module_lookup = {
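For reference, a minimal usage sketch of the parsers re-exported above. This is a sketch only, assuming this branch is installed along with the optional rapidocr-onnxruntime and pytesseract dependencies; the image path is a placeholder:

from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import (
    RapidOCRBlobParser,
    TesseractBlobParser,
)

# Wrap a local image file as a Blob (placeholder path).
blob = Blob.from_path("./example-image.png")

# OCR the image with RapidOCR; lazy_parse yields one Document per blob.
docs = list(RapidOCRBlobParser(format="text").lazy_parse(blob))
print(docs[0].page_content)

# Same image with Tesseract, wrapping the text as an HTML <img alt="..."> tag.
docs = list(TesseractBlobParser(format="html", langs=["eng"]).lazy_parse(blob))
print(docs[0].page_content)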
@@ -3,87 +3,97 @@
import io
import logging
from abc import abstractmethod
+from typing import TYPE_CHECKING, Iterator, Literal

-from PIL import Image
-from typing import Iterator, Literal

-from langchain_community.document_loaders.base import BaseBlobParser
-from langchain_community.document_loaders.blob_loaders import Blob
+import numpy as np
from langchain_core.documents import Document
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import HumanMessage

+if TYPE_CHECKING:
+    from PIL.Image import Image

+from langchain_community.document_loaders.base import BaseBlobParser
+from langchain_community.document_loaders.blob_loaders import Blob

logger = logging.getLogger(__name__)


class ImageBlobParser(BaseBlobParser):
    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
    ):
        self.format = format

    @abstractmethod
-    def _analyze_image(self, img: Image) -> str:
+    def _analyze_image(self, img: "Image") -> str:
        pass

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
-        with blob.as_bytes_io() as buf:
-            img = Image.open(buf)
-            content = self._analyze_image(img)
-            if content:
-                if self.format == "markdown":
-                    content = content.replace("]", r"\\]")
-                    content = f"![{content}](.)"
-                elif self.format == "html":
-                    content = f'<img alt="{html.escape(content, quote=True)}" />'
-            logger.debug("Image text: %s", content.replace("\n", "\\n"))
-            yield Document(
-                page_content=content,
-                metadata={"source": blob.source},
+        try:
+            from PIL import Image as Img
+
+            with blob.as_bytes_io() as buf:
+                img = Img.open(buf)
+                content = self._analyze_image(img)
+                if content:
+                    if self.format == "markdown":
+                        content = content.replace("]", r"\\]")
+                        content = f"![{content}](.)"
+                    elif self.format == "html":
+                        content = f'<img alt="{html.escape(content, quote=True)}" />'
+                logger.debug("Image text: %s", content.replace("\n", "\\n"))
+                yield Document(
+                    page_content=content,
+                    metadata={"source": blob.source},
+                )
+        except ImportError:
+            raise ImportError(
+                "`Pillow` package not found, please install it with "
+                "`pip install Pillow`"
+            )


class RapidOCRBlobParser(ImageBlobParser):
    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
    ):
        super().__init__(format=format)
        self.ocr = None

-    def _analyze_image(self, img: Image) -> str:
+    def _analyze_image(self, img: "Image") -> str:
        if not self.ocr:
            try:
                from rapidocr_onnxruntime import RapidOCR

+                self.ocr = RapidOCR()
            except ImportError:
                raise ImportError(
                    "`rapidocr-onnxruntime` package not found, please install it with "
                    "`pip install rapidocr-onnxruntime`"
                )
-            self.ocr = RapidOCR()
-        ocr_result, _ = self.ocr(img)
+        ocr_result, _ = self.ocr(np.array(img))  # type: ignore
        content = ""
        if ocr_result:
            content = ("\n".join([text[1] for text in ocr_result])).strip()
        return content


class TesseractBlobParser(ImageBlobParser):
    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
        langs: list[str] = ["eng"],
    ):
        super().__init__(format=format)
        self.langs = langs

-    def _analyze_image(self, img: Image) -> str:
+    def _analyze_image(self, img: "Image") -> str:
        try:
            import pytesseract
        except ImportError:
@@ -99,24 +109,23 @@ def _analyze_image(self, img: Image) -> str:
"images for retrieval. "
"These summaries will be embedded and used to retrieve the raw image. "
"Give a concise summary of the image that is well optimized for retrieval "
"and extract all the text from the image.")
"and extract all the text from the image."
)


class MultimodalBlobParser(ImageBlobParser):
    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
        model: BaseChatModel,
        prompt: str = _prompt_images_to_description,
    ):
        super().__init__(format=format)
        self.model = model
        self.prompt = prompt

-    def _analyze_image(self, img: Image) -> str:
+    def _analyze_image(self, img: "Image") -> str:
        image_bytes = io.BytesIO()
        img.save(image_bytes, format="PNG")
        img_base64 = base64.b64encode(image_bytes.getvalue()).decode("utf-8")
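A hedged sketch of how MultimodalBlobParser could be combined with a vision-capable chat model; the ChatOpenAI model name and the image path are illustrative assumptions, not part of this commit:

from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.images import MultimodalBlobParser
from langchain_openai import ChatOpenAI  # any multimodal BaseChatModel should work

parser = MultimodalBlobParser(
    model=ChatOpenAI(model="gpt-4o-mini"),  # placeholder model choice
    format="markdown",  # wrap the generated description as ![...](.)
)

blob = Blob.from_path("./diagram.png")  # placeholder path
for doc in parser.lazy_parse(blob):
    print(doc.metadata["source"], doc.page_content)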
90 changes: 63 additions & 27 deletions libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -2,31 +2,33 @@

from __future__ import annotations

import html
import io
import logging
import threading
import warnings
from datetime import datetime
-from urllib.parse import urlparse

-import numpy as np
from typing import (
    TYPE_CHECKING,
    Any,
    Iterable,
    Iterator,
    Literal,
    Mapping,
    Optional,
    Sequence,
    Union,
)
+from urllib.parse import urlparse

+import numpy as np
+from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
-from langchain_community.document_loaders.parsers.images import ImageBlobParser, \
-    RapidOCRBlobParser
-from langchain_core.documents import Document
+from langchain_community.document_loaders.parsers.images import (
+    ImageBlobParser,
+    RapidOCRBlobParser,
+)

if TYPE_CHECKING:
    import pdfminer
@@ -53,16 +55,49 @@
"JBIG2Decode",
]


def extract_from_images_with_rapidocr(
    images: Sequence[Union[Iterable[np.ndarray], bytes]],
) -> str:
    """Extract text from images with RapidOCR.
    Args:
        images: Images to extract text from.
    Returns:
        Text extracted from images.
    Raises:
        ImportError: If `rapidocr-onnxruntime` package is not installed.
    """
    try:
        from rapidocr_onnxruntime import RapidOCR
    except ImportError:
        raise ImportError(
            "`rapidocr-onnxruntime` package not found, please install it with "
            "`pip install rapidocr-onnxruntime`"
        )
    ocr = RapidOCR()
    text = ""
    for img in images:
        result, _ = ocr(img)
        if result:
            result = [text[1] for text in result]
            text += "\n".join(result)
    return text


logger = logging.getLogger(__name__)

_FORMAT_IMAGE_STR = "\n\n{image_text}\n\n"
_JOIN_IMAGES = "\n"
_JOIN_TABLES = "\n"
_DEFAULT_PAGE_DELIMITOR = "\n\f"

_STD_METADATA_KEYS={"source", "total_pages", "creationdate", "creator", "producer"}
_STD_METADATA_KEYS = {"source", "total_pages", "creationdate", "creator", "producer"}


-def _validate_metadata(metadata: dict[str, Any]) -> dict[str,Any]:
+def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
"""Validates the presence of at least the following keys:
- source
- page (if mode='page')
@@ -73,7 +108,7 @@ def _validate_metadata(metadata: dict[str, Any]) -> dict[str,Any]:
"""
if not _STD_METADATA_KEYS.issubset(metadata.keys()):
raise ValueError("The PDF parser must valorize the standard metadata.")
if not isinstance(metadata.get("page",0), int):
if not isinstance(metadata.get("page", 0), int):
raise ValueError("The PDF metadata page must be a integer.")
return metadata

@@ -116,7 +151,10 @@ def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
return new_metadata


_PARAGRAPH_DELIMITOR = ["\n\n\n", "\n\n"] # To insert images or table in the middle of the page.
_PARAGRAPH_DELIMITOR = [
"\n\n\n",
"\n\n",
] # To insert images or table in the middle of the page.


def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
@@ -132,7 +170,7 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
"""

def _recurs_merge_text_and_extras(
extras: list[str], text_from_page: str, recurs: bool
extras: list[str], text_from_page: str, recurs: bool
) -> Optional[str]:
if extras:
for delim in _PARAGRAPH_DELIMITOR:
@@ -151,8 +189,9 @@ def _recurs_merge_text_and_extras(
str_extras = "\n\n".join(filter(lambda x: x, extras))
if str_extras:
all_extras = delim + str_extras
-all_text = text_from_page[:pos] + all_extras + text_from_page[
-    pos:]
+all_text = (
+    text_from_page[:pos] + all_extras + text_from_page[pos:]
+)
break
else:
all_text = None
@@ -171,7 +210,6 @@ def _recurs_merge_text_and_extras(
return all_text



class ImagesPdfParser(BaseBlobParser):
"""Abstract interface for blob parsers with images_to_text."""

@@ -218,8 +256,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
)

def _extract_text_from_page(page: pypdf.PageObject) -> str:
"""Extract text from image given the version of pypdf.
"""
"""Extract text from image given the version of pypdf."""
if pypdf.__version__.startswith("3"):
return page.extract_text()
else:
@@ -561,11 +598,11 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
for page in doc:
all_text = self._get_page_content(doc, page, blob).strip()
if self.mode == "page":

yield Document(
page_content=all_text,
metadata=_validate_metadata(doc_metadata |
{"page": page.number}),
metadata=_validate_metadata(
doc_metadata | {"page": page.number}
),
)
else:
full_content.append(all_text)
@@ -658,17 +695,16 @@ def _extract_images_from_page(
if self.images_parser:
    xref = img[0]
    pix = pymupdf.Pixmap(doc, xref)
-    image=np.frombuffer(pix.samples, dtype=np.uint8).reshape(
-        pix.height, pix.width, -1
-    )
+    image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
+        pix.height, pix.width, -1
+    )
    image_bytes = io.BytesIO()
    Image.fromarray(image).save(image_bytes, format="PNG")
-    blob=Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
+    blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
    images.append(next(self.images_parser.lazy_parse(blob)).page_content)
return _FORMAT_IMAGE_STR.format(
-    image_text=_JOIN_IMAGES.join(filter(None,images))
-)
-
+    image_text=_JOIN_IMAGES.join(filter(None, images))
+)

def _extract_tables_from_page(self, page: pymupdf.Page) -> str:
"""Extract tables from a PDF page.
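Finally, a sketch of how the updated PDF parser might hand embedded images to one of the new image parsers. The mode and images_parser keyword arguments are assumptions inferred from the attributes read in this diff (self.mode, self.images_parser); check the full pdf.py for the exact constructor signature. The PDF path is a placeholder:

from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.images import TesseractBlobParser
from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser

# Assumed keyword arguments, based on the attributes this diff uses.
parser = PyMuPDFParser(
    mode="page",
    images_parser=TesseractBlobParser(format="markdown"),
)

blob = Blob.from_path("./report.pdf")  # placeholder path
for doc in parser.lazy_parse(blob):  # one Document per page in "page" mode
    print(doc.metadata.get("page"), doc.page_content[:100])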