Skip to content

Commit

Permalink
Merge pull request #44 from enoch3712/40-split-process-aggregate-approach-beyond-model-context
Browse files Browse the repository at this point in the history

40 split process aggregate approach beyond model context
Loading branch information
enoch3712 authored Oct 30, 2024
2 parents 7b1e9c4 + 3fe3643 commit 1203996
Show file tree
Hide file tree
Showing 15 changed files with 682 additions and 204 deletions.
8 changes: 6 additions & 2 deletions extract_thinker/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .models.classification_strategy import ClassificationStrategy
from .extractor import Extractor
from .llm import LLM
from .document_loader.document_loader import DocumentLoader
Expand All @@ -8,9 +9,10 @@
from .document_loader.document_loader_pypdf import DocumentLoaderPyPdf
from .document_loader.document_loader_pdfplumber import DocumentLoaderPdfPlumber
from .models import classification, classification_response
from .process import Process, ClassificationStrategy
from .process import Process
from .splitter import Splitter
from .image_splitter import ImageSplitter
from .text_splitter import TextSplitter
from .models.classification import Classification
from .models.contract import Contract

Expand All @@ -31,6 +33,8 @@
'ClassificationStrategy',
'Splitter',
'ImageSplitter',
'TextSplitter',
'Classification',
'Contract'
'Contract',
'SplittingStrategy',
]
31 changes: 11 additions & 20 deletions extract_thinker/document_loader/document_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,27 +114,18 @@ def _convert_stream_to_images(self, file_stream: io.BytesIO, scale: float) -> Di
return self._convert_pdf_to_images(pdfium.PdfDocument(file_stream), scale)

def _convert_pdf_to_images(self, pdf_file, scale: float) -> Dict[int, bytes]:
page_indices = [i for i in range(len(pdf_file))]

with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {i: executor.submit(self.render_page, pdf_file, i, scale) for i in page_indices}

final_images = {}
for i, future in futures.items():
final_images[i] = future.result()

return final_images

@staticmethod
def render_page(pdf_file: pdfium.PdfDocument, page_index: int, scale: float) -> Dict[int, bytes]:
# Get all pages at once
renderer = pdf_file.render(
pdfium.PdfBitmap.to_pil,
page_indices=[page_index],
page_indices=list(range(len(pdf_file))),
scale=scale,
)
image_list = list(renderer)
image = image_list[0]
image_byte_array = BytesIO()
image.save(image_byte_array, format="jpeg", optimize=True)
image_byte_array = image_byte_array.getvalue()
return {page_index: image_byte_array}

# Convert all images to bytes and store in dictionary
final_images = {}
for page_index, image in enumerate(renderer):
image_byte_array = BytesIO()
image.save(image_byte_array, format="jpeg", optimize=True)
final_images[page_index] = image_byte_array.getvalue()

return final_images
34 changes: 28 additions & 6 deletions extract_thinker/document_loader/document_loader_tesseract.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from io import BytesIO
from operator import attrgetter
import os

import threading
from typing import Any, List, Union
from PIL import Image
Expand Down Expand Up @@ -74,16 +75,27 @@ def process_pdf(self, stream: BytesIO) -> str:
try:
# Reset stream position
stream.seek(0)
# Can you give me a file: Union[str, io.BytesIO]
file = BytesIO(stream.read())
images = self.convert_to_images(file)

# Add debug logging
if not images:
raise Exception("No images were extracted from PDF")

extracted_text = []

for page_number, image_bytes in images.items():
image = BytesIO(image_bytes[0])
text = self.process_image(image)
# Check if image_bytes is not empty and has the expected structure
# if not image_bytes or not isinstance(image_bytes, (list, tuple)):
# print(f"Skipping page {page_number}: Invalid image data")
# continue

# image = BytesIO(image_bytes[0])
text = self.process_image(image_bytes)
extracted_text.append(text)

if not extracted_text:
raise Exception("No text was extracted from any pages")

# Combine text from all pages
self.content = "\n".join(extracted_text)
return self.content
Expand All @@ -93,7 +105,9 @@ def process_pdf(self, stream: BytesIO) -> str:
def process_image(self, image: BytesIO) -> str:
for attempt in range(3):
try:
raw_text = str(pytesseract.image_to_string(Image.open(image)))
# Convert bytes to PIL Image
pil_image = Image.open(image)
raw_text = str(pytesseract.image_to_string(pil_image))
if raw_text:
return raw_text
except Exception as e:
Expand All @@ -113,6 +127,7 @@ def worker(self, input_queue: Queue, output_queue: Queue):
output_queue.put((image, str(e)))
input_queue.task_done()

@cachedmethod(cache=attrgetter('cache'), key=lambda self, stream: hashkey(id(stream)))
def load_content_from_stream_list(self, stream: BytesIO) -> List[Any]:
images = self.convert_to_images(stream)
input_queue = Queue()
Expand Down Expand Up @@ -140,8 +155,12 @@ def load_content_from_stream_list(self, stream: BytesIO) -> List[Any]:
image, content = output_queue.get()
contents.append({"image": image, "content": content})

# put the first page at the end of the list
contents.append(contents.pop(0))

return contents

@cachedmethod(cache=attrgetter('cache'), key=lambda self, input: hashkey(id(input)))
def load_content_from_file_list(self, input: List[Union[str, BytesIO]]) -> List[Any]:
images = self.convert_to_images(input)
input_queue = Queue()
Expand Down Expand Up @@ -169,4 +188,7 @@ def load_content_from_file_list(self, input: List[Union[str, BytesIO]]) -> List[
image, content = output_queue.get()
contents.append({"image": Image.open(image), "content": content})

return contents
# put the first page at the end of the list
contents.append(contents.pop(0))

return contents
Loading

0 comments on commit 1203996

Please sign in to comment.