Merge pull request #2 from enoch3712/extractThinker

Extract thinker 0.0.1
enoch3712 · May 14, 2024 · 57c0e8c · 57c0e8c
2 parents cb5d01d + 741b9aa
commit 57c0e8c
Show file tree

Hide file tree

Showing 64 changed files with 3,707 additions and 44 deletions.
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,2 @@
+[flake8]
+ignore = E501
diff --git a/.github/workflows/workflow.yml b/.github/workflows/workflow.yml
@@ -0,0 +1,37 @@
+name: Python package workflow
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  build-and-test:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python
+      uses: actions/setup-python@v3
+      with:
+        python-version: '3.8'
+
+    - name: Install dependencies
+      run: |
+        pip install poetry
+        poetry install
+
+    - name: Run tests
+      run: poetry run pytest
+
+    - name: Build package
+      run: poetry build
+
+    - name: Publish to PyPI
+      if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+      uses: pypa/gh-action-pypi-publish@v1.4.2
+      with:
+        user: __token__
+        password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/.gitignore b/.gitignore
@@ -162,3 +162,6 @@ Scripts/unstructured-ingest-script.py
 Scripts/unstructured-ingest.exe
 Scripts/uvicorn.exe
 Scripts/vba_extract.py
+
+# VSCode settings
+.vscode/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,23 @@
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.1.7 # Ruff version
+    hooks:
+      - id: ruff # Run the linter.
+        name: Run Linter Check (Ruff)
+        args: [ --fix ]
+        files: ^(extract_thinker|tests|examples)/
+      - id: ruff-format       # Run the formatter.
+        name: Run Formatter (Ruff)
+  - repo: local
+    hooks:
+      - id: ci_type_mypy
+        name: Run Type Check (Mypy)
+        entry: >
+            bash -c 'set -o pipefail;
+            export CUSTOM_PACKAGES="extractthinker/_types/_alias.py extractthinker/cli/cli.py extractthinker/cli/files.py extractthinker/cli/usage.py extractthinker/exceptions.py" &&
+            export CUSTOM_FLAGS="--python-version=3.9 --color-output --no-pretty --follow-imports=skip" &&
+            curl -sSL https://raw.githubusercontent.com/gao-hongnan/omniverse/2fd5de1b8103e955cd5f022ab016b72fa901fa8f/scripts/devops/continuous-integration/type_mypy.sh |
+            bash'
+        language: system
+        types: [python]
+        pass_filenames: false
diff --git a/.ruff.toml b/.ruff.toml
@@ -0,0 +1,64 @@
+# Exclude a variety of commonly ignored directories.
+exclude = [
+    ".bzr",
+    ".direnv",
+    ".eggs",
+    ".git",
+    ".git-rewrite",
+    ".hg",
+    ".mypy_cache",
+    ".nox",
+    ".pants.d",
+    ".pytype",
+    ".ruff_cache",
+    ".svn",
+    ".tox",
+    ".venv",
+    "__pypackages__",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "node_modules",
+    "venv",
+]
+
+# Same as Black.
+line-length = 88
+output-format = "grouped"
+
+target-version = "py39"
+
+[lint]
+select = [
+  # bugbear rules
+  "B",
+  # remove unused imports
+  "F401",
+  # bare except statements
+  "E722",
+  # unused arguments
+  "ARG",
+  # unused lambda arguments
+  "ARG005",
+]
+ignore = [
+  # mutable defaults
+  "B006",
+  "B018",
+]
+
+unfixable = [
+  # disable auto fix for print statements
+  "T201",
+  "T203",
+]
+ignore-init-module-imports = true
+
+[extend-per-file-ignores]
+"instructor/distil.py" = ["ARG002"]
+"tests/test_distil.py" = ["ARG001"]
+"tests/test_patch.py" = ["ARG001"]
+"examples/task_planner/task_planner_topological_sort.py" = ["ARG002"]
+"examples/citation_with_extraction/main.py" = ["ARG001"]
+
diff --git a/README.md b/README.md
@@ -1,65 +1,133 @@
-# Open-DocLLM
+<p align="center">
+  <img src="https://github.com/enoch3712/Open-DocLLM/assets/9283394/41d9d151-acb5-44da-9c10-0058f76c2512" alt="Extract Thinker Logo" width="200"/>
+</p>
+<p align="center">
+<a href="https://medium.com/@enoch3712">
+    <img alt="Medium" src="https://img.shields.io/badge/Medium-12100E?style=flat&logo=medium&logoColor=white" />
+</a>
+<img alt="GitHub Last Commit" src="https://img.shields.io/github/last-commit/enoch3712/Open-DocLLM" />
+<img alt="Github License" src="https://img.shields.io/badge/License-Apache%202.0-blue.svg" />
+</p>
 
-## Introduction
-This project aims to tackle the challenges of data extraction and processing using OCR and LLM. It is inspired by JP Morgan's DocLLM but is fully open-source and offers a larger context window size. The project is divided into two parts: the OCR and LLM layer.
+# ExtractThinker
 
-![image](https://github.com/enoch3712/Open-DocLLM/assets/9283394/2612cc9e-fc66-401e-912d-3acaef42d9cc)
+Library to extract data from files and documents agnostically using LLMs. `extract_thinker` provides ORM-style interaction between files and LLMs, allowing for flexible and powerful document extraction workflows.
 
-## OCR Layer
-The OCR layer is responsible for reading all the content from a document. It involves the following steps:
+## Features
 
-1. **Convert pages to images**: Any type of file is converted into an image so that all the content in the document can be read.
+- Supports multiple document loaders, including Tesseract OCR, Azure Form Recognizer, AWS Textract, and Google Document AI.
+- Customizable extraction using contract definitions.
+- Asynchronous processing for efficient document handling.
+- Built-in support for various document formats.
+- ORM-style interaction between files and LLMs.
 
-2. **Preprocess image for OCR**: The image is adjusted to improve its quality and readability.
+<p align="center">
+  <img src="https://github.com/enoch3712/Open-DocLLM/assets/9283394/b1b8800c-3c55-4ee5-92fe-b8b663c7a81f" alt="Extract Thinker Features Diagram" width="300"/>
+</p>
 
-3. **Tesseract OCR**: The Tesseract OCR, the most popular open-source OCR in the world, is used to read the content from the images.
+## Installation
 
-## LLM Layer
-The LLM layer is responsible for extracting specific content from the document in a structured way. It involves defining an extraction contract and extracting the JSON data.
+To install `extract_thinker`, you can use `pip`:
 
-## Running Locally
-You can run the models on-premises using LLM studio or Ollama. This project uses LlamaIndex and Ollama.
+```bash
+pip install extract_thinker
+```
 
-## Running the Code
-The repo includes a FastAPI app with one endpoint for testing. Make sure to point to the proper Tesseract executable and change the key in the config.py file.
+## Usage
+Here's a quick example to get you started with extract_thinker. This example demonstrates how to load a document using Tesseract OCR and extract specific fields defined in a contract.
 
-1. Install Tessaract 
-https://github.com/tesseract-ocr/tesseract
+```python
+import os
+from dotenv import load_dotenv
+from extract_thinker import DocumentLoaderTesseract, Extractor, Contract
 
-2. Install the required Python packages.
-```sh
-pip install -r requirements.txt
-```
+load_dotenv()
+cwd = os.getcwd()
 
-3. Run fast api
-```sh
-uvicorn main:app --reload
-```
+class InvoiceContract(Contract):
+    invoice_number: str
+    invoice_date: str
+
+tesseract_path = os.getenv("TESSERACT_PATH")
+test_file_path = os.path.join(cwd, "test_images", "invoice.png")
+
+extractor = Extractor()
+extractor.load_document_loader(
+    DocumentLoaderTesseract(tesseract_path)
+)
+extractor.load_llm("claude-3-haiku-20240307")
 
-4. go to the Swgger page: 
-http://localhost:8000/docs
+result = extractor.extract(test_file_path, InvoiceContract)
 
-## Running with Docker
-1. Build the Docker image.
-```sh
-docker build -t your-image-name .
+print("Invoice Number: ", result.invoice_number)
+print("Invoice Date: ", result.invoice_date)
 ```
 
-2. Run the Docker container.
-```sh
-docker run -p 8000:8000 your-image-name
+## Splitting Files Example
+You can also split and process documents using extract_thinker. Here's how you can do it:
+
+```python
+import os
+from dotenv import load_dotenv
+from extract_thinker import DocumentLoaderTesseract, Extractor, Process, Classification, ImageSplitter, Contract
+
+load_dotenv()
+
+class DriverLicense(Contract):
+    # Define your DriverLicense contract fields here
+    pass
+
+class InvoiceContract(Contract):
+    invoice_number: str
+    invoice_date: str
+
+extractor = Extractor()
+extractor.load_document_loader(DocumentLoaderTesseract(os.getenv("TESSERACT_PATH")))
+extractor.load_llm("gpt-3.5-turbo")
+
+classifications = [
+    Classification(name="Driver License", description="This is a driver license", contract=DriverLicense, extractor=extractor),
+    Classification(name="Invoice", description="This is an invoice", contract=InvoiceContract, extractor=extractor)
+]
+
+process = Process()
+process.load_document_loader(DocumentLoaderTesseract(os.getenv("TESSERACT_PATH")))
+process.load_splitter(ImageSplitter())
+
+path = "..."
+
+split_content = process.load_file(path)\
+    .split(classifications)\
+    .extract()
+
+# Process the split_content as needed
 ```
 
-3. go to the Swgger page: 
-http://localhost:8000/docs
+## Infrastructure
+
+The `extract_thinker` project is inspired by the LangChain ecosystem, featuring a modular infrastructure with templates, components, and core functions to facilitate robust document extraction and processing. 
+
+<p align="center">
+  <img src="https://github.com/enoch3712/Open-DocLLM/assets/9283394/996fb2de-0558-4f13-ab3d-7ea56a593951" alt="Extract Thinker Logo" width="400"/>
+</p>
+
+## Why Not Just LangChain?
+While LangChain is a generalized framework designed for a wide array of use cases, extract_thinker is specifically focused on Intelligent Document Processing (IDP). Although achieving 100% accuracy in IDP remains a challenge, leveraging LLMs brings us significantly closer to this goal.
+
+## Additional Examples
+You can find more examples in the repository. These examples cover various use cases and demonstrate the flexibility of extract_thinker. Also check out the author's Medium page, which contains several articles with examples of the library.
 
+## Contributing
+We welcome contributions from the community! If you would like to contribute, please follow these steps:
 
-## Advanced Cases: 1 Million token context
-The project also explores advanced cases like a 1 million token context using LLM Lingua and Mistral Yarn 128k context window.
+1. Fork the repository.
+2. Create a new branch for your feature or bugfix.
+3. Write tests for your changes.
+4. Run tests to ensure everything is working correctly.
+5. Submit a pull request with a description of your changes.
 
-## Conclusion
-The integration of OCR and LLM technologies in this project marks a pivotal advancement in analyzing unstructured data. The combination of open-source projects like Tesseract and Mistral makes a perfect implementation that could be used in an on-premise use case.
+## License
+This project is licensed under the Apache License 2.0. See the LICENSE file for more details.
 
-## References & Documents 
-1. [DOCLLM: A LAYOUT-AWARE GENERATIVE LANGUAGE MODEL FOR MULTIMODAL DOCUMENT UNDERSTANDING](https://arxiv.org/pdf/2401.00908.pdf)
-2. [YaRN: Efficient Context Window Extension of Large Language Models](https://arxiv.org/pdf/2309.00071.pdf)
+## Contact
+For any questions or issues, please open an issue on the GitHub repository.
diff --git a/examples/extractor_basic.py b/examples/extractor_basic.py
@@ -0,0 +1,39 @@
+"""Basic example: OCR an invoice image with Tesseract and extract fields via an LLM."""
+
+import os
+
+from dotenv import load_dotenv
+
+from extract_thinker import DocumentLoaderTesseract, Extractor, Contract
+
+# Load variables from a .env file so TESSERACT_PATH can be read below.
+load_dotenv()
+cwd = os.getcwd()
+
+
+# Contract listing the fields to pull out of the invoice.
+class InvoiceContract(Contract):
+    invoice_number: str
+    invoice_date: str
+
+
+tesseract_path = os.getenv("TESSERACT_PATH")
+# The sample image is resolved relative to the current working directory.
+test_file_path = os.path.join(cwd, "tests", "test_images", "invoice.png")
+
+extractor = Extractor()
+extractor.load_document_loader(
+    DocumentLoaderTesseract(tesseract_path)
+)
+extractor.load_llm("claude-3-haiku-20240307")
+
+result = extractor.extract(test_file_path, InvoiceContract)
+
+# Only read the extracted fields when a result came back; printing them
+# unconditionally would raise AttributeError on a None result.
+if result is not None:
+    print("Extraction successful.")
+    print("Invoice Number: ", result.invoice_number)
+    print("Invoice Date: ", result.invoice_date)
+else:
+    print("Extraction failed.")
diff --git a/extract_thinker/__init__.py b/extract_thinker/__init__.py
@@ -0,0 +1,27 @@
+# Public API of the extract_thinker package.
+from .extractor import Extractor
+from .document_loader.document_loader import DocumentLoader
+from .document_loader.cached_document_loader import CachedDocumentLoader
+from .document_loader.document_loader_tesseract import DocumentLoaderTesseract
+from .models import classification, classification_response
+from .process import Process
+from .splitter import Splitter
+from .image_splitter import ImageSplitter
+from .models.classification import Classification
+from .models.contract import Contract
+
+
+# Names re-exported by ``from extract_thinker import *``.
+__all__ = [
+    "Extractor",
+    "DocumentLoader",
+    "CachedDocumentLoader",
+    "DocumentLoaderTesseract",
+    "classification",
+    "classification_response",
+    "Process",
+    "Splitter",
+    "ImageSplitter",
+    "Classification",
+    "Contract",
+]