Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactoring PDF loaders: 02 PyMuPDF #29063

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,165 changes: 1,121 additions & 44 deletions docs/docs/integrations/document_loaders/pymupdf.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions libs/community/extended_testing_deps.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ praw>=7.7.1,<8
premai>=0.3.25,<0.4
psychicapi>=0.8.0,<0.9
pydantic>=2.7.4,<3
pytesseract>=0.3.13
py-trello>=0.19.0,<0.20
pyjwt>=2.8.0,<3
pymupdf>=1.22.3,<2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
from langchain_community.document_loaders.parsers.html import (
BS4HTMLParser,
)
from langchain_community.document_loaders.parsers.images import (
MultimodalBlobParser,
RapidOCRBlobParser,
TesseractBlobParser,
)
from langchain_community.document_loaders.parsers.language import (
LanguageParser,
)
Expand All @@ -38,12 +43,15 @@
"DocAIParser": "langchain_community.document_loaders.parsers.docai",
"GrobidParser": "langchain_community.document_loaders.parsers.grobid",
"LanguageParser": "langchain_community.document_loaders.parsers.language",
"MultimodalBlobParser": "langchain_community.document_loaders.parsers.images",
"OpenAIWhisperParser": "langchain_community.document_loaders.parsers.audio",
"PDFMinerParser": "langchain_community.document_loaders.parsers.pdf",
"PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
"PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
"PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
"PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
"RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
"TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
"VsdxParser": "langchain_community.document_loaders.parsers.vsdx",
}

Expand All @@ -61,11 +69,14 @@ def __getattr__(name: str) -> Any:
"DocAIParser",
"GrobidParser",
"LanguageParser",
"MultimodalBlobParser",
"OpenAIWhisperParser",
"PDFMinerParser",
"PDFPlumberParser",
"PyMuPDFParser",
"PyPDFParser",
"PyPDFium2Parser",
"RapidOCRBlobParser",
"TesseractBlobParser",
"VsdxParser",
]
149 changes: 149 additions & 0 deletions libs/community/langchain_community/document_loaders/parsers/images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
import base64
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this makes sense -- would you be willing to do a documentation pass for the API reference for this file? You could push the entire code through ChatGPT and ask for Google-style docstrings. It'll probably do a reasonable job.

import html
import io
import logging
from abc import abstractmethod
from typing import TYPE_CHECKING, Iterator, Literal

import numpy as np
from langchain_core.documents import Document
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import HumanMessage

if TYPE_CHECKING:
from PIL.Image import Image

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob

logger = logging.getLogger(__name__)


class ImageBlobParser(BaseBlobParser):
    """Abstract base parser that turns an image blob into a ``Document``.

    Subclasses implement ``_analyze_image`` to extract the textual content
    of the image (OCR engine, multimodal LLM, ...). The extracted text is
    optionally wrapped as a markdown image tag or an HTML ``<img>`` tag
    before being emitted as the document's page content.
    """

    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
    ):
        """Initialize the parser.

        Args:
            format: How to emit the extracted text: plain ``"text"``, a
                ``"markdown"`` image tag with the text as alt text, or an
                ``"html"`` ``<img>`` tag with the text as the ``alt``
                attribute.
        """
        self.format = format

    @abstractmethod
    def _analyze_image(self, img: "Image") -> str:
        """Extract and return the textual content of ``img``."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse ``blob`` into a single ``Document``.

        Args:
            blob: The image blob to analyze.

        Yields:
            One ``Document`` whose ``page_content`` is the extracted text,
            formatted according to ``self.format``.

        Raises:
            ImportError: If the ``Pillow`` package is not installed.
        """
        try:
            # Keep the try scoped to this import only, so ImportErrors
            # raised by subclasses (e.g. a missing OCR backend) are not
            # swallowed and misreported as a missing Pillow install.
            from PIL import Image as Img
        except ImportError as exc:
            raise ImportError(
                "`Pillow` package not found, please install it with "
                "`pip install Pillow`"
            ) from exc

        with blob.as_bytes_io() as buf:
            img = Img.open(buf)
            content = self._analyze_image(img)
            if content:
                if self.format == "markdown":
                    # Escape closing brackets with a single backslash so the
                    # alt text cannot break out of the markdown image syntax
                    # (the original r"\\]" inserted a double backslash).
                    content = content.replace("]", "\\]")
                    content = f"![{content}](.)"
                elif self.format == "html":
                    content = f'<img alt="{html.escape(content, quote=True)}" />'
                logger.debug("Image text: %s", content.replace("\n", "\\n"))
            yield Document(
                page_content=content,
                metadata={"source": blob.source},
            )


class RapidOCRBlobParser(ImageBlobParser):
    """Image parser that extracts text with the RapidOCR engine."""

    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
    ):
        """Initialize the parser.

        Args:
            format: Output wrapping for the extracted text
                (``"text"``, ``"markdown"`` or ``"html"``).
        """
        super().__init__(format=format)
        # The OCR engine is created lazily on first use.
        self.ocr = None

    def _analyze_image(self, img: "Image") -> str:
        """Run RapidOCR on ``img`` and return the recognized text.

        Raises:
            ImportError: If ``rapidocr-onnxruntime`` is not installed.
        """
        if not self.ocr:
            try:
                from rapidocr_onnxruntime import RapidOCR

                self.ocr = RapidOCR()
            except ImportError:
                raise ImportError(
                    "`rapidocr-onnxruntime` package not found, please install it with "
                    "`pip install rapidocr-onnxruntime`"
                )
        recognized, _ = self.ocr(np.array(img))  # type: ignore
        if not recognized:
            return ""
        # Each OCR result row is (box, text, score); keep the text column.
        return "\n".join(row[1] for row in recognized).strip()


class TesseractBlobParser(ImageBlobParser):
    """Image parser that extracts text with the Tesseract OCR engine."""

    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
        langs: tuple[str, ...] = ("eng",),
    ):
        """Initialize the parser.

        Args:
            format: Output wrapping for the extracted text
                (``"text"``, ``"markdown"`` or ``"html"``).
            langs: Tesseract language codes to use, joined with ``+`` when
                invoking the engine. Any iterable of strings is accepted;
                defaults to English.
        """
        super().__init__(format=format)
        # Copy into a fresh list: the immutable tuple default avoids the
        # shared-mutable-default pitfall, and copying means a caller-supplied
        # iterable is neither aliased nor exhausted.
        self.langs = list(langs)

    def _analyze_image(self, img: "Image") -> str:
        """Run Tesseract OCR on ``img`` and return the recognized text.

        Raises:
            ImportError: If ``pytesseract`` is not installed.
        """
        try:
            import pytesseract
        except ImportError as exc:
            raise ImportError(
                "`pytesseract` package not found, please install it with "
                "`pip install pytesseract`"
            ) from exc
        return pytesseract.image_to_string(img, lang="+".join(self.langs)).strip()


# Default instruction for multimodal image description; the summary is
# embedded and later used to retrieve the raw image.
_prompt_images_to_description = (
    "You are an assistant tasked with summarizing images for retrieval. "
    "These summaries will be embedded and used to retrieve the raw image. "
    "Give a concise summary of the image that is well optimized for "
    "retrieval and extract all the text from the image."
)


class MultimodalBlobParser(ImageBlobParser):
    """Image parser that describes images with a multimodal chat model."""

    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
        model: BaseChatModel,
        prompt: str = _prompt_images_to_description,
    ):
        """Initialize the parser.

        Args:
            format: Output wrapping for the extracted text
                (``"text"``, ``"markdown"`` or ``"html"``).
            model: Multimodal chat model used to describe the image.
            prompt: Instruction sent to the model alongside the image.
        """
        super().__init__(format=format)
        self.model = model
        self.prompt = prompt

    def _analyze_image(self, img: "Image") -> str:
        """Ask the chat model to describe ``img`` and return its answer.

        The image is serialized to PNG and sent as a base64 data URL.

        Raises:
            ValueError: If the model returns non-string content.
        """
        image_bytes = io.BytesIO()
        img.save(image_bytes, format="PNG")
        img_base64 = base64.b64encode(image_bytes.getvalue()).decode("utf-8")
        msg = self.model.invoke(
            [
                HumanMessage(
                    content=[
                        {"type": "text", "text": self.prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                # The image is encoded as PNG above, so the
                                # data URL must declare image/png (the code
                                # previously declared image/jpeg).
                                "url": f"data:image/png;base64,{img_base64}"
                            },
                        },
                    ]
                )
            ]
        )
        result = msg.content
        if not isinstance(result, str):
            # Explicit raise instead of `assert`, which is stripped under -O.
            raise ValueError(
                f"Expected string content from the model, got {type(result)!r}."
            )
        return result
Loading
Loading