Fix remarques
pprados committed Jan 10, 2025
1 parent 20f5a41 commit 6765dbf
Showing 10 changed files with 213 additions and 152 deletions.
1 change: 1 addition & 0 deletions libs/community/extended_testing_deps.txt
@@ -66,6 +66,7 @@ praw>=7.7.1,<8
premai>=0.3.25,<0.4
psychicapi>=0.8.0,<0.9
pydantic>=2.7.4,<3
+pytesseract>=0.3.13
py-trello>=0.19.0,<0.20
pyjwt>=2.8.0,<3
pymupdf>=1.22.3,<2
@@ -17,6 +17,11 @@
from langchain_community.document_loaders.parsers.html import (
    BS4HTMLParser,
)
+from langchain_community.document_loaders.parsers.images import (
+    MultimodalBlobParser,
+    RapidOCRBlobParser,
+    TesseractBlobParser,
+)
from langchain_community.document_loaders.parsers.language import (
    LanguageParser,
)
@@ -30,11 +35,6 @@
from langchain_community.document_loaders.parsers.vsdx import (
    VsdxParser,
)
-from langchain_community.document_loaders.parsers.images import (
-    MultimodalBlobParser,
-    RapidOCRBlobParser,
-    TesseractBlobParser,
-)


_module_lookup = {
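For reference, a minimal usage sketch of the parsers re-exported above. This is a sketch only, assuming this branch is installed along with the optional rapidocr-onnxruntime and pytesseract dependencies; the image path is a placeholder:

from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import (
    RapidOCRBlobParser,
    TesseractBlobParser,
)

# Wrap a local image file as a Blob (placeholder path).
blob = Blob.from_path("./example-image.png")

# OCR the image with RapidOCR; lazy_parse yields one Document per blob.
docs = list(RapidOCRBlobParser(format="text").lazy_parse(blob))
print(docs[0].page_content)

# Same image with Tesseract, wrapping the text as an HTML <img alt="..."> tag.
docs = list(TesseractBlobParser(format="html", langs=["eng"]).lazy_parse(blob))
print(docs[0].page_content)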
@@ -3,87 +3,97 @@
import io
import logging
from abc import abstractmethod
+from typing import TYPE_CHECKING, Iterator, Literal

-from PIL import Image
-from typing import Iterator, Literal

-from langchain_community.document_loaders.base import BaseBlobParser
-from langchain_community.document_loaders.blob_loaders import Blob
+import numpy as np
from langchain_core.documents import Document
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import HumanMessage

+if TYPE_CHECKING:
+    from PIL.Image import Image

+from langchain_community.document_loaders.base import BaseBlobParser
+from langchain_community.document_loaders.blob_loaders import Blob

logger = logging.getLogger(__name__)


class ImageBlobParser(BaseBlobParser):
    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
    ):
        self.format = format

    @abstractmethod
-    def _analyze_image(self, img: Image) -> str:
+    def _analyze_image(self, img: "Image") -> str:
        pass

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
-        with blob.as_bytes_io() as buf:
-            img = Image.open(buf)
-            content = self._analyze_image(img)
-            if content:
-                if self.format == "markdown":
-                    content = content.replace("]", r"\\]")
-                    content = f"![{content}](.)"
-                elif self.format == "html":
-                    content = f'<img alt="{html.escape(content, quote=True)}" />'
-            logger.debug("Image text: %s", content.replace("\n", "\\n"))
-            yield Document(
-                page_content=content,
-                metadata={"source": blob.source},
+        try:
+            from PIL import Image as Img
+
+            with blob.as_bytes_io() as buf:
+                img = Img.open(buf)
+                content = self._analyze_image(img)
+                if content:
+                    if self.format == "markdown":
+                        content = content.replace("]", r"\\]")
+                        content = f"![{content}](.)"
+                    elif self.format == "html":
+                        content = f'<img alt="{html.escape(content, quote=True)}" />'
+                logger.debug("Image text: %s", content.replace("\n", "\\n"))
+                yield Document(
+                    page_content=content,
+                    metadata={"source": blob.source},
+                )
+        except ImportError:
+            raise ImportError(
+                "`Pillow` package not found, please install it with "
+                "`pip install Pillow`"
+            )


class RapidOCRBlobParser(ImageBlobParser):
    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
    ):
        super().__init__(format=format)
        self.ocr = None

-    def _analyze_image(self, img: Image) -> str:
+    def _analyze_image(self, img: "Image") -> str:
        if not self.ocr:
            try:
                from rapidocr_onnxruntime import RapidOCR

+                self.ocr = RapidOCR()
            except ImportError:
                raise ImportError(
                    "`rapidocr-onnxruntime` package not found, please install it with "
                    "`pip install rapidocr-onnxruntime`"
                )
-            self.ocr = RapidOCR()
-        ocr_result, _ = self.ocr(img)
+        ocr_result, _ = self.ocr(np.array(img))  # type: ignore
        content = ""
        if ocr_result:
            content = ("\n".join([text[1] for text in ocr_result])).strip()
        return content


class TesseractBlobParser(ImageBlobParser):
    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
        langs: list[str] = ["eng"],
    ):
        super().__init__(format=format)
        self.langs = langs

-    def _analyze_image(self, img: Image) -> str:
+    def _analyze_image(self, img: "Image") -> str:
        try:
            import pytesseract
        except ImportError:
@@ -99,24 +109,23 @@ def _analyze_image(self, img: Image) -> str:
"images for retrieval. "
"These summaries will be embedded and used to retrieve the raw image. "
"Give a concise summary of the image that is well optimized for retrieval "
"and extract all the text from the image.")
"and extract all the text from the image."
)


class MultimodalBlobParser(ImageBlobParser):
    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
        model: BaseChatModel,
        prompt: str = _prompt_images_to_description,
    ):
        super().__init__(format=format)
        self.model = model
        self.prompt = prompt

-    def _analyze_image(self, img: Image) -> str:
+    def _analyze_image(self, img: "Image") -> str:
        image_bytes = io.BytesIO()
        img.save(image_bytes, format="PNG")
        img_base64 = base64.b64encode(image_bytes.getvalue()).decode("utf-8")
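A hedged sketch of how MultimodalBlobParser could be combined with a vision-capable chat model; the ChatOpenAI model name and the image path are illustrative assumptions, not part of this commit:

from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.images import MultimodalBlobParser
from langchain_openai import ChatOpenAI  # any multimodal BaseChatModel should work

parser = MultimodalBlobParser(
    model=ChatOpenAI(model="gpt-4o-mini"),  # placeholder model choice
    format="markdown",  # wrap the generated description as ![...](.)
)

blob = Blob.from_path("./diagram.png")  # placeholder path
for doc in parser.lazy_parse(blob):
    print(doc.metadata["source"], doc.page_content)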
90 changes: 63 additions & 27 deletions libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -2,31 +2,33 @@

from __future__ import annotations

import html
import io
import logging
import threading
import warnings
from datetime import datetime
-from urllib.parse import urlparse

-import numpy as np
from typing import (
    TYPE_CHECKING,
    Any,
    Iterable,
    Iterator,
    Literal,
    Mapping,
    Optional,
    Sequence,
    Union,
)
+from urllib.parse import urlparse

+import numpy as np
+from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
-from langchain_community.document_loaders.parsers.images import ImageBlobParser, \
-    RapidOCRBlobParser
-from langchain_core.documents import Document
+from langchain_community.document_loaders.parsers.images import (
+    ImageBlobParser,
+    RapidOCRBlobParser,
+)

if TYPE_CHECKING:
    import pdfminer
@@ -53,16 +55,49 @@
"JBIG2Decode",
]


def extract_from_images_with_rapidocr(
    images: Sequence[Union[Iterable[np.ndarray], bytes]],
) -> str:
    """Extract text from images with RapidOCR.
    Args:
        images: Images to extract text from.
    Returns:
        Text extracted from images.
    Raises:
        ImportError: If `rapidocr-onnxruntime` package is not installed.
    """
    try:
        from rapidocr_onnxruntime import RapidOCR
    except ImportError:
        raise ImportError(
            "`rapidocr-onnxruntime` package not found, please install it with "
            "`pip install rapidocr-onnxruntime`"
        )
    ocr = RapidOCR()
    text = ""
    for img in images:
        result, _ = ocr(img)
        if result:
            result = [text[1] for text in result]
            text += "\n".join(result)
    return text


logger = logging.getLogger(__name__)

_FORMAT_IMAGE_STR = "\n\n{image_text}\n\n"
_JOIN_IMAGES = "\n"
_JOIN_TABLES = "\n"
_DEFAULT_PAGE_DELIMITOR = "\n\f"

_STD_METADATA_KEYS={"source", "total_pages", "creationdate", "creator", "producer"}
_STD_METADATA_KEYS = {"source", "total_pages", "creationdate", "creator", "producer"}


-def _validate_metadata(metadata: dict[str, Any]) -> dict[str,Any]:
+def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
"""Validates the presence of at least the following keys:
- source
- page (if mode='page')
@@ -73,7 +108,7 @@ def _validate_metadata(metadata: dict[str, Any]) -> dict[str,Any]:
"""
if not _STD_METADATA_KEYS.issubset(metadata.keys()):
raise ValueError("The PDF parser must valorize the standard metadata.")
if not isinstance(metadata.get("page",0), int):
if not isinstance(metadata.get("page", 0), int):
raise ValueError("The PDF metadata page must be a integer.")
return metadata

@@ -116,7 +151,10 @@ def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
return new_metadata


_PARAGRAPH_DELIMITOR = ["\n\n\n", "\n\n"] # To insert images or table in the middle of the page.
_PARAGRAPH_DELIMITOR = [
"\n\n\n",
"\n\n",
] # To insert images or table in the middle of the page.


def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
@@ -132,7 +170,7 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
"""

def _recurs_merge_text_and_extras(
extras: list[str], text_from_page: str, recurs: bool
extras: list[str], text_from_page: str, recurs: bool
) -> Optional[str]:
if extras:
for delim in _PARAGRAPH_DELIMITOR:
@@ -151,8 +189,9 @@ def _recurs_merge_text_and_extras(
str_extras = "\n\n".join(filter(lambda x: x, extras))
if str_extras:
all_extras = delim + str_extras
-all_text = text_from_page[:pos] + all_extras + text_from_page[
-    pos:]
+all_text = (
+    text_from_page[:pos] + all_extras + text_from_page[pos:]
+)
break
else:
all_text = None
@@ -171,7 +210,6 @@ def _recurs_merge_text_and_extras(
return all_text



class ImagesPdfParser(BaseBlobParser):
"""Abstract interface for blob parsers with images_to_text."""

@@ -218,8 +256,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
)

def _extract_text_from_page(page: pypdf.PageObject) -> str:
"""Extract text from image given the version of pypdf.
"""
"""Extract text from image given the version of pypdf."""
if pypdf.__version__.startswith("3"):
return page.extract_text()
else:
@@ -561,11 +598,11 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
for page in doc:
all_text = self._get_page_content(doc, page, blob).strip()
if self.mode == "page":

yield Document(
page_content=all_text,
metadata=_validate_metadata(doc_metadata |
{"page": page.number}),
metadata=_validate_metadata(
doc_metadata | {"page": page.number}
),
)
else:
full_content.append(all_text)
@@ -658,17 +695,16 @@ def _extract_images_from_page(
if self.images_parser:
    xref = img[0]
    pix = pymupdf.Pixmap(doc, xref)
-    image=np.frombuffer(pix.samples, dtype=np.uint8).reshape(
-        pix.height, pix.width, -1
-    )
+    image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
+        pix.height, pix.width, -1
+    )
    image_bytes = io.BytesIO()
    Image.fromarray(image).save(image_bytes, format="PNG")
-    blob=Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
+    blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
    images.append(next(self.images_parser.lazy_parse(blob)).page_content)
return _FORMAT_IMAGE_STR.format(
-    image_text=_JOIN_IMAGES.join(filter(None,images))
-)
-
+    image_text=_JOIN_IMAGES.join(filter(None, images))
+)

def _extract_tables_from_page(self, page: pymupdf.Page) -> str:
"""Extract tables from a PDF page.
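Finally, a sketch of how the updated PDF parser might hand embedded images to one of the new image parsers. The mode and images_parser keyword arguments are assumptions inferred from the attributes read in this diff (self.mode, self.images_parser); check the full pdf.py for the exact constructor signature. The PDF path is a placeholder:

from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.images import TesseractBlobParser
from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser

# Assumed keyword arguments, based on the attributes this diff uses.
parser = PyMuPDFParser(
    mode="page",
    images_parser=TesseractBlobParser(format="markdown"),
)

blob = Blob.from_path("./report.pdf")  # placeholder path
for doc in parser.lazy_parse(blob):  # one Document per page in "page" mode
    print(doc.metadata.get("page"), doc.page_content[:100])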