Merge pull request #13 from enoch3712/5-add-azure-document-intelligen…

…ce-as-a-documentloader 5 add azure document intelligence as a documentloader
enoch3712 · Jun 6, 2024 · e21bb82 · e21bb82
2 parents 31dd376 + 1d6bcd6
commit e21bb82
Show file tree

Hide file tree

Showing 9 changed files with 703 additions and 440 deletions.
diff --git a/.github/workflows/workflow.yml b/.github/workflows/workflow.yml
@@ -16,7 +16,7 @@ jobs:
     - name: Set up Python
       uses: actions/setup-python@v3
       with:
-        python-version: '3.8'
+        python-version: '3.9'
 
     - name: Install dependencies
       run: |

diff --git a/extract_thinker/__init__.py b/extract_thinker/__init__.py
@@ -3,6 +3,7 @@
 from .document_loader.cached_document_loader import CachedDocumentLoader
 from .document_loader.document_loader_tesseract import DocumentLoaderTesseract
 from .document_loader.document_loader_spreadsheet import DocumentLoaderSpreadSheet
+from .document_loader.document_loader_azure_document_intelligence import DocumentLoaderAzureForm
 from .document_loader.document_loader_pypdf import DocumentLoaderPyPdf
 from .document_loader.document_loader_text import DocumentLoaderText
 from .models import classification, classification_response
@@ -18,6 +19,7 @@
     'DocumentLoader',
     'CachedDocumentLoader',
     'DocumentLoaderTesseract',
+    'DocumentLoaderAzureForm',
     'DocumentLoaderPyPdf',
     'DocumentLoaderText',
     'classification',

diff --git a/extract_thinker/document_loader/azure_form_recognizer_loader.py b/extract_thinker/document_loader/azure_form_recognizer_loader.py
diff --git a/extract_thinker/document_loader/document_loader_azure_document_intelligence.py b/extract_thinker/document_loader/document_loader_azure_document_intelligence.py
@@ -0,0 +1,108 @@
+from io import BytesIO
+from operator import attrgetter
+from typing import Any, List, Union
+from azure.core.credentials import AzureKeyCredential
+from azure.ai.formrecognizer import AnalyzeResult, DocumentPage, DocumentTable, Point
+from azure.ai.formrecognizer import DocumentAnalysisClient
+from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader
+from cachetools import cachedmethod
+from cachetools.keys import hashkey
+
+
+class DocumentLoaderAzureForm(CachedDocumentLoader):
+    def __init__(self, subscription_key: str, endpoint: str, content: Any = None, cache_ttl: int = 300):
+        super().__init__(content, cache_ttl)
+        self.subscription_key = subscription_key
+        self.endpoint = endpoint
+        self.credential = AzureKeyCredential(self.subscription_key)
+        self.client = DocumentAnalysisClient(endpoint=self.endpoint, credential=self.credential)
+
+    @cachedmethod(cache=attrgetter('cache'), key=lambda self, file_path: hashkey(file_path))
+    def load_content_from_file(self, file_path: str) -> Union[str, object]:
+        try:
+            with open(file_path, "rb") as document:
+                poller = self.client.begin_analyze_document("prebuilt-layout", document)
+                result = poller.result()
+                return self.process_result(result)
+        except Exception as e:
+            raise Exception(f"Error processing file: {e}") from e
+
+    @cachedmethod(cache=attrgetter('cache'), key=lambda self, stream: hashkey(id(stream)))
+    def load_content_from_stream(self, stream: Union[BytesIO, str]) -> Union[str, object]:
+        try:
+            poller = self.client.begin_analyze_document("prebuilt-layout", stream)
+            result = poller.result()
+            return self.process_result(result)
+        except Exception as e:
+            raise Exception(f"Error processing stream: {e}") from e
+
+    def process_result(self, result: AnalyzeResult) -> List[dict]:
+        extract_results = []
+        tables = self.build_tables(result.tables)
+        for page in result.pages:
+            paragraphs = [p.content for p in page.lines]
+            tables = self.build_tables(result.tables)
+            # words_with_locations = self.process_words(page)
+            # Remove lines that are present in tables
+            paragraphs = self.remove_lines_present_in_tables(paragraphs, tables)
+            output = {
+                #"content": result.content,
+                "paragraphs": paragraphs,
+                #"words": words_with_locations,
+                "tables": tables.get(page.page_number, [])
+            }
+            extract_results.append(output)
+        return {"pages": extract_results}
+
+    def remove_lines_present_in_tables(self, paragraphs: List[str], tables: dict[int, List[List[str]]]) -> List[str]:
+        for table in tables.values():
+            for row in table:
+                for cell in row:
+                    if cell in paragraphs:
+                        paragraphs.remove(cell)
+        return paragraphs
+
+    def page_to_string(self, page: DocumentPage) -> str:
+        page_string = ""
+        for word in page.words:
+            for point in word.polygon:
+                page_string += f"({point.x}, {point.y}): {word.content}\n"
+        return page_string
+
+    def process_words(self, page: DocumentPage) -> List[dict]:
+        words_with_locations = []
+
+        for word in page.words:
+            word_info = {
+                "content": word.content,
+                "bounding_box": {
+                    "points": word.polygon
+                },
+                "page_number": page.page_number
+            }
+            words_with_locations.append(word_info)
+
+        return words_with_locations
+
+    def build_tables(self, tables: List[DocumentTable]) -> dict[int, List[List[str]]]:
+        table_data = {}
+        for table in tables:
+            rows = []
+            for row_idx in range(table.row_count):
+                row = []
+                for cell in table.cells:
+                    if cell.row_index == row_idx:
+                        row.append(cell.content)
+                rows.append(row)
+            # Use the page number as the key for the dictionary
+            table_data[table.bounding_regions[0].page_number] = rows
+        return table_data
+
+    def build_points(self, bounding_box: List[Point]) -> List[dict]:
+        return [{"x": point.x, "y": point.y} for point in bounding_box]
+
+    def load_content_from_stream_list(self, stream: BytesIO) -> List[Any]:
+        pass
+
+    def load_content_from_file_list(self, file_path: str) -> List[Any]:
+        pass