Skip to content

Commit

Permalink
Merge pull request #44 from enoch3712/40-split-process-aggregate-approach-beyond-model-context
Browse files Browse the repository at this point in the history

40 split process aggregate approach beyond model context
Loading branch information
enoch3712 authored Oct 30, 2024
2 parents 7b1e9c4 + 3fe3643 commit 1203996
Show file tree
Hide file tree
Showing 15 changed files with 682 additions and 204 deletions.
8 changes: 6 additions & 2 deletions extract_thinker/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .models.classification_strategy import ClassificationStrategy
from .extractor import Extractor
from .llm import LLM
from .document_loader.document_loader import DocumentLoader
Expand All @@ -8,9 +9,10 @@
from .document_loader.document_loader_pypdf import DocumentLoaderPyPdf
from .document_loader.document_loader_pdfplumber import DocumentLoaderPdfPlumber
from .models import classification, classification_response
from .process import Process, ClassificationStrategy
from .process import Process
from .splitter import Splitter
from .image_splitter import ImageSplitter
from .text_splitter import TextSplitter
from .models.classification import Classification
from .models.contract import Contract

Expand All @@ -31,6 +33,8 @@
'ClassificationStrategy',
'Splitter',
'ImageSplitter',
'TextSplitter',
'Classification',
'Contract'
'Contract',
'SplittingStrategy',
]
31 changes: 11 additions & 20 deletions extract_thinker/document_loader/document_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,27 +114,18 @@ def _convert_stream_to_images(self, file_stream: io.BytesIO, scale: float) -> Di
return self._convert_pdf_to_images(pdfium.PdfDocument(file_stream), scale)

def _convert_pdf_to_images(self, pdf_file, scale: float) -> Dict[int, bytes]:
page_indices = [i for i in range(len(pdf_file))]

with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {i: executor.submit(self.render_page, pdf_file, i, scale) for i in page_indices}

final_images = {}
for i, future in futures.items():
final_images[i] = future.result()

return final_images

@staticmethod
def render_page(pdf_file: pdfium.PdfDocument, page_index: int, scale: float) -> Dict[int, bytes]:
# Get all pages at once
renderer = pdf_file.render(
pdfium.PdfBitmap.to_pil,
page_indices=[page_index],
page_indices=list(range(len(pdf_file))),
scale=scale,
)
image_list = list(renderer)
image = image_list[0]
image_byte_array = BytesIO()
image.save(image_byte_array, format="jpeg", optimize=True)
image_byte_array = image_byte_array.getvalue()
return {page_index: image_byte_array}

# Convert all images to bytes and store in dictionary
final_images = {}
for page_index, image in enumerate(renderer):
image_byte_array = BytesIO()
image.save(image_byte_array, format="jpeg", optimize=True)
final_images[page_index] = image_byte_array.getvalue()

return final_images
34 changes: 28 additions & 6 deletions extract_thinker/document_loader/document_loader_tesseract.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from io import BytesIO
from operator import attrgetter
import os

import threading
from typing import Any, List, Union
from PIL import Image
Expand Down Expand Up @@ -74,16 +75,27 @@ def process_pdf(self, stream: BytesIO) -> str:
try:
# Reset stream position
stream.seek(0)
# Can you give me a file: Union[str, io.BytesIO]
file = BytesIO(stream.read())
images = self.convert_to_images(file)

# Add debug logging
if not images:
raise Exception("No images were extracted from PDF")

extracted_text = []

for page_number, image_bytes in images.items():
image = BytesIO(image_bytes[0])
text = self.process_image(image)
# Check if image_bytes is not empty and has the expected structure
# if not image_bytes or not isinstance(image_bytes, (list, tuple)):
# print(f"Skipping page {page_number}: Invalid image data")
# continue

# image = BytesIO(image_bytes[0])
text = self.process_image(image_bytes)
extracted_text.append(text)

if not extracted_text:
raise Exception("No text was extracted from any pages")

# Combine text from all pages
self.content = "\n".join(extracted_text)
return self.content
Expand All @@ -93,7 +105,9 @@ def process_pdf(self, stream: BytesIO) -> str:
def process_image(self, image: BytesIO) -> str:
for attempt in range(3):
try:
raw_text = str(pytesseract.image_to_string(Image.open(image)))
# Convert bytes to PIL Image
pil_image = Image.open(image)
raw_text = str(pytesseract.image_to_string(pil_image))
if raw_text:
return raw_text
except Exception as e:
Expand All @@ -113,6 +127,7 @@ def worker(self, input_queue: Queue, output_queue: Queue):
output_queue.put((image, str(e)))
input_queue.task_done()

@cachedmethod(cache=attrgetter('cache'), key=lambda self, stream: hashkey(id(stream)))
def load_content_from_stream_list(self, stream: BytesIO) -> List[Any]:
images = self.convert_to_images(stream)
input_queue = Queue()
Expand Down Expand Up @@ -140,8 +155,12 @@ def load_content_from_stream_list(self, stream: BytesIO) -> List[Any]:
image, content = output_queue.get()
contents.append({"image": image, "content": content})

# put the first page at the end of the list
contents.append(contents.pop(0))

return contents

@cachedmethod(cache=attrgetter('cache'), key=lambda self, input: hashkey(id(input)))
def load_content_from_file_list(self, input: List[Union[str, BytesIO]]) -> List[Any]:
images = self.convert_to_images(input)
input_queue = Queue()
Expand Down Expand Up @@ -169,4 +188,7 @@ def load_content_from_file_list(self, input: List[Union[str, BytesIO]]) -> List[
image, content = output_queue.get()
contents.append({"image": Image.open(image), "content": content})

return contents
# put the first page at the end of the list
contents.append(contents.pop(0))

return contents
Loading

0 comments on commit 1203996

Please sign in to comment.