-
Notifications
You must be signed in to change notification settings - Fork 15.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Refactoring PDF loaders: 02 PyMuPDF #29063
base: master
Are you sure you want to change the base?
Changes from all commits
21759e2
4607354
668dc9c
7a5b5c5
6340ded
4845781
3beda82
743a83e
b623750
20f5a41
91234f0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,149 @@ | ||||||
import base64
import html
import io
import logging
from abc import abstractmethod
from typing import TYPE_CHECKING, Iterable, Iterator, Literal
|
||||||
import numpy as np | ||||||
from langchain_core.documents import Document | ||||||
from langchain_core.language_models import BaseChatModel | ||||||
from langchain_core.messages import HumanMessage | ||||||
|
||||||
if TYPE_CHECKING: | ||||||
from PIL.Image import Image | ||||||
|
||||||
from langchain_community.document_loaders.base import BaseBlobParser | ||||||
from langchain_community.document_loaders.blob_loaders import Blob | ||||||
|
||||||
logger = logging.getLogger(__name__) | ||||||
|
||||||
|
||||||
# NOTE(review): reviewers asked whether this class should be renamed to
# ``BaseImageBlobParser`` (or made private) so it is clear that it is
# abstract, and remarked that ``format`` is a surprising part of the API.
# Both are left unchanged here so existing callers keep working.
class ImageBlobParser(BaseBlobParser):
    """Abstract parser that converts an image blob into a single ``Document``.

    Subclasses supply the image-to-text step by implementing
    ``_analyze_image``; this class opens the image with Pillow and wraps the
    resulting text according to ``format``.
    """

    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
    ):
        """Initialize the parser.

        Args:
            format: How non-empty text is wrapped in the document.
                ``"text"`` leaves it untouched, ``"markdown"`` emits
                ``![<text>](.)`` and ``"html"`` emits
                ``<img alt="<text>" />`` with the text HTML-escaped.
        """
        self.format = format

    @abstractmethod
    def _analyze_image(self, img: "Image") -> str:
        """Return the text extracted from, or describing, ``img``.

        Args:
            img: The image opened by ``lazy_parse``.

        Returns:
            The text for the image. An empty string is passed through
            without any markdown/HTML wrapping.
        """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse an image blob into one document.

        Args:
            blob: The blob holding the encoded image bytes.

        Yields:
            One ``Document`` whose ``page_content`` is the (optionally
            wrapped) text and whose metadata is the blob metadata plus a
            ``"source"`` entry taken from ``blob.source``.

        Raises:
            ImportError: If Pillow is not installed.
        """
        # Only the Pillow import is guarded, so an ImportError raised by a
        # subclass's ``_analyze_image`` (for its own optional dependency)
        # surfaces with its own message instead of being reported as Pillow.
        try:
            from PIL import Image as Img
        except ImportError as err:
            raise ImportError(
                "`Pillow` package not found, please install it with "
                "`pip install Pillow`"
            ) from err

        with blob.as_bytes_io() as buf:
            img = Img.open(buf)
            content = self._analyze_image(img)
            if content:
                if self.format == "markdown":
                    # A single backslash escapes "]" so it cannot close the
                    # image alt text early; two backslashes would escape the
                    # backslash itself and leave "]" active.
                    content = content.replace("]", r"\]")
                    content = f"![{content}](.)"
                elif self.format == "html":
                    content = f'<img alt="{html.escape(content, quote=True)}" />'
            logger.debug("Image text: %s", content.replace("\n", "\\n"))
            yield Document(
                page_content=content,
                # Keep whatever metadata the blob carries; "source" is written
                # last so it always reflects ``blob.source``.
                metadata={**(blob.metadata or {}), "source": blob.source},
            )
|
||||||
|
||||||
class RapidOCRBlobParser(ImageBlobParser):
    """Image parser that extracts text with ``rapidocr-onnxruntime``."""

    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
    ):
        """Initialize the parser.

        Args:
            format: Output wrapping, forwarded to ``ImageBlobParser``.
        """
        super().__init__(format=format)
        # The OCR engine is created on first use, not at construction time.
        self.ocr = None

    def _analyze_image(self, img: "Image") -> str:
        """Run OCR on ``img`` and return the recognized lines.

        Args:
            img: The image to analyze; it is converted to a NumPy array
                before being handed to the OCR engine.

        Returns:
            The per-entry texts joined with newlines and stripped, or an
            empty string when the engine returns no result.

        Raises:
            ImportError: If ``rapidocr-onnxruntime`` is not installed.
        """
        if not self.ocr:
            try:
                from rapidocr_onnxruntime import RapidOCR

                self.ocr = RapidOCR()
            except ImportError:
                raise ImportError(
                    "`rapidocr-onnxruntime` package not found, please install it with "
                    "`pip install rapidocr-onnxruntime`"
                )

        detections, _ = self.ocr(np.array(img))  # type: ignore
        if not detections:
            return ""
        # Index 1 of each entry is used as its text -- presumably the
        # recognized string; confirm against the RapidOCR result layout.
        return "\n".join(entry[1] for entry in detections).strip()
|
||||||
|
||||||
class TesseractBlobParser(ImageBlobParser):
    """Image parser that extracts text with Tesseract via ``pytesseract``."""

    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
        langs: Iterable[str] = ("eng",),
    ):
        """Initialize the parser.

        Args:
            format: Output wrapping, forwarded to ``ImageBlobParser``.
            langs: Language codes handed to Tesseract; they are joined with
                ``"+"`` when OCR runs. Lists are still accepted.
        """
        super().__init__(format=format)
        # Copy into a fresh list: the default is immutable and no instance
        # shares (or can mutate) another instance's or the caller's sequence.
        self.langs = list(langs)

    def _analyze_image(self, img: "Image") -> str:
        """Run Tesseract on ``img`` and return the stripped text.

        Args:
            img: The image to analyze.

        Returns:
            The output of ``pytesseract.image_to_string`` with surrounding
            whitespace removed.

        Raises:
            ImportError: If ``pytesseract`` is not installed.
        """
        try:
            import pytesseract
        except ImportError:
            raise ImportError(
                "`pytesseract` package not found, please install it with "
                "`pip install pytesseract`"
            )
        return pytesseract.image_to_string(img, lang="+".join(self.langs)).strip()
|
||||||
|
||||||
# Default instruction sent together with an image to a multimodal chat model.
# NOTE(review): a reviewer left a style nit here about following Google
# conventions more closely (presumably the constant's naming -- the concrete
# suggestion is not visible); the name is kept so existing references work.
_prompt_images_to_description = (
    "You are an assistant tasked with summarizing images for retrieval. "
    "These summaries will be embedded and used to retrieve the raw image. "
    "Give a concise summary of the image that is well optimized for "
    "retrieval and extract all the text from the image."
)
|
||||||
|
||||||
# NOTE(review): a reviewer asked for a name that communicates that the work
# is done by a multimodal LLM; the class name is kept unchanged here so
# existing imports keep working.
class MultimodalBlobParser(ImageBlobParser):
    """Image parser that asks a multimodal chat model to describe the image."""

    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
        model: BaseChatModel,
        prompt: str = _prompt_images_to_description,
    ):
        """Initialize the parser.

        Args:
            format: Output wrapping, forwarded to ``ImageBlobParser``.
            model: Chat model invoked with the prompt and the image.
            prompt: Text instruction sent alongside the image.
        """
        super().__init__(format=format)
        self.model = model
        self.prompt = prompt

    def _analyze_image(self, img: "Image") -> str:
        """Send ``img`` to the chat model and return its textual answer.

        Args:
            img: The image to describe; it is re-encoded as PNG and embedded
                in the request as a base64 data URL.

        Returns:
            The ``content`` of the model's reply.

        Raises:
            TypeError: If the reply content is not a plain string.
        """
        image_bytes = io.BytesIO()
        img.save(image_bytes, format="PNG")
        img_base64 = base64.b64encode(image_bytes.getvalue()).decode("utf-8")
        msg = self.model.invoke(
            [
                HumanMessage(
                    content=[
                        {"type": "text", "text": self.prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                # The declared media type must match the
                                # PNG encoding produced above.
                                "url": f"data:image/png;base64,{img_base64}"
                            },
                        },
                    ]
                )
            ]
        )
        result = msg.content
        # Explicit check instead of ``assert``: assertions are stripped under
        # ``python -O`` and would let non-string content through.
        if not isinstance(result, str):
            raise TypeError(
                "Expected the model to return text content, got "
                f"{type(result).__name__}"
            )
        return result
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this makes sense -- would you be willing to do a documentation pass for the API reference for this file? You could push the entire code through ChatGPT and ask for Google-style docstrings. It'll probably do a reasonable