-
Notifications
You must be signed in to change notification settings - Fork 90
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from enoch3712/extractThinker
Extract thinker 0.0.1
- Loading branch information
Showing
64 changed files
with
3,707 additions
and
44 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
[flake8] | ||
ignore = E501 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
name: Python package workflow | ||
|
||
on: | ||
push: | ||
branches: | ||
- main | ||
pull_request: | ||
branches: | ||
- main | ||
|
||
jobs: | ||
build-and-test: | ||
runs-on: ubuntu-latest | ||
steps: | ||
- uses: actions/checkout@v3 | ||
- name: Set up Python | ||
uses: actions/setup-python@v3 | ||
with: | ||
python-version: '3.8' | ||
|
||
- name: Install dependencies | ||
run: | | ||
pip install poetry | ||
poetry install | ||
- name: Run tests | ||
run: poetry run pytest | ||
|
||
- name: Build package | ||
run: poetry build | ||
|
||
- name: Publish to PyPI | ||
if: github.event_name == 'push' && github.ref == 'refs/heads/main' | ||
uses: pypa/gh-action-pypi-publish@release/v1 | ||
with: | ||
user: __token__ | ||
password: ${{ secrets.PYPI_API_TOKEN }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
repos: | ||
- repo: https://github.com/astral-sh/ruff-pre-commit | ||
rev: v0.1.7 # Ruff version | ||
hooks: | ||
- id: ruff # Run the linter. | ||
name: Run Linter Check (Ruff) | ||
args: [ --fix ] | ||
files: ^(extractthinker|tests|examples)/ | ||
- id: ruff-format # Run the formatter. | ||
name: Run Formatter (Ruff) | ||
- repo: local | ||
hooks: | ||
- id: ci_type_mypy | ||
name: Run Type Check (Mypy) | ||
entry: > | ||
bash -c 'set -o pipefail; | ||
export CUSTOM_PACKAGES="extractthinker/_types/_alias.py extractthinker/cli/cli.py extractthinker/cli/files.py extractthinker/cli/usage.py extractthinker/exceptions.py" && | ||
export CUSTOM_FLAGS="--python-version=3.9 --color-output --no-pretty --follow-imports=skip" && | ||
curl -sSL https://raw.githubusercontent.com/gao-hongnan/omniverse/2fd5de1b8103e955cd5f022ab016b72fa901fa8f/scripts/devops/continuous-integration/type_mypy.sh | | ||
bash' | ||
language: system | ||
types: [python] | ||
pass_filenames: false |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
# Exclude a variety of commonly ignored directories. | ||
exclude = [ | ||
".bzr", | ||
".direnv", | ||
".eggs", | ||
".git", | ||
".git-rewrite", | ||
".hg", | ||
".mypy_cache", | ||
".nox", | ||
".pants.d", | ||
".pytype", | ||
".ruff_cache", | ||
".svn", | ||
".tox", | ||
".venv", | ||
"__pypackages__", | ||
"_build", | ||
"buck-out", | ||
"build", | ||
"dist", | ||
"node_modules", | ||
"venv", | ||
] | ||
|
||
# Same as Black. | ||
line-length = 88 | ||
output-format = "grouped" | ||
|
||
target-version = "py39" | ||
|
||
[lint] | ||
select = [ | ||
# bugbear rules | ||
"B", | ||
# remove unused imports | ||
"F401", | ||
# bare except statements | ||
"E722", | ||
# unused arguments | ||
"ARG", | ||
# unused lambda arguments (ARG005; already covered by "ARG" above) | ||
"ARG005", | ||
] | ||
ignore = [ | ||
# mutable argument defaults (B006) and useless expressions (B018) | ||
"B006", | ||
"B018", | ||
] | ||
|
||
unfixable = [ | ||
# disable auto fix for print statements | ||
"T201", | ||
"T203", | ||
] | ||
ignore-init-module-imports = true | ||
|
||
[extend-per-file-ignores] | ||
"instructor/distil.py" = ["ARG002"] | ||
"tests/test_distil.py" = ["ARG001"] | ||
"tests/test_patch.py" = ["ARG001"] | ||
"examples/task_planner/task_planner_topological_sort.py" = ["ARG002"] | ||
"examples/citation_with_extraction/main.py" = ["ARG001"] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,65 +1,133 @@ | ||
# Open-DocLLM | ||
<p align="center"> | ||
<img src="https://github.com/enoch3712/Open-DocLLM/assets/9283394/41d9d151-acb5-44da-9c10-0058f76c2512" alt="Extract Thinker Logo" width="200"/> | ||
</p> | ||
<p align="center"> | ||
<a href="https://medium.com/@enoch3712"> | ||
<img alt="Medium" src="https://img.shields.io/badge/Medium-12100E?style=flat&logo=medium&logoColor=white" /> | ||
</a> | ||
<img alt="GitHub Last Commit" src="https://img.shields.io/github/last-commit/enoch3712/Open-DocLLM" /> | ||
<img alt="Github License" src="https://img.shields.io/badge/License-Apache%202.0-blue.svg" /> | ||
</p> | ||
|
||
## Introduction | ||
This project aims to tackle the challenges of data extraction and processing using OCR and LLM. It is inspired by JP Morgan's DocLLM but is fully open-source and offers a larger context window size. The project is divided into two parts: the OCR and LLM layer. | ||
# ExtractThinker | ||
|
||
![image](https://github.com/enoch3712/Open-DocLLM/assets/9283394/2612cc9e-fc66-401e-912d-3acaef42d9cc) | ||
Library to extract data from files and documents agnostically using LLMs. `extract_thinker` provides ORM-style interaction between files and LLMs, allowing for flexible and powerful document extraction workflows. | ||
|
||
## OCR Layer | ||
The OCR layer is responsible for reading all the content from a document. It involves the following steps: | ||
## Features | ||
|
||
1. **Convert pages to images**: Any type of file is converted into an image so that all the content in the document can be read. | ||
- Supports multiple document loaders, including Tesseract OCR, Azure Form Recognizer, AWS Textract, and Google Document AI. | ||
- Customizable extraction using contract definitions. | ||
- Asynchronous processing for efficient document handling. | ||
- Built-in support for various document formats. | ||
- ORM-style interaction between files and LLMs. | ||
|
||
2. **Preprocess image for OCR**: The image is adjusted to improve its quality and readability. | ||
<p align="center"> | ||
<img src="https://github.com/enoch3712/Open-DocLLM/assets/9283394/b1b8800c-3c55-4ee5-92fe-b8b663c7a81f" alt="Extract Thinker Features Diagram" width="300"/> | ||
</p> | ||
|
||
3. **Tesseract OCR**: The Tesseract OCR, the most popular open-source OCR in the world, is used to read the content from the images. | ||
## Installation | ||
|
||
## LLM Layer | ||
The LLM layer is responsible for extracting specific content from the document in a structured way. It involves defining an extraction contract and extracting the JSON data. | ||
To install `extract_thinker`, you can use `pip`: | ||
|
||
## Running Locally | ||
You can run the models on-premises using LLM studio or Ollama. This project uses LlamaIndex and Ollama. | ||
```bash | ||
pip install extract_thinker | ||
``` | ||
|
||
## Running the Code | ||
The repo includes a FastAPI app with one endpoint for testing. Make sure to point to the proper Tesseract executable and change the key in the config.py file. | ||
## Usage | ||
Here's a quick example to get you started with extract_thinker. This example demonstrates how to load a document using Tesseract OCR and extract specific fields defined in a contract. | ||
|
||
1. Install Tesseract | ||
https://github.com/tesseract-ocr/tesseract | ||
```python | ||
import os | ||
from dotenv import load_dotenv | ||
from extract_thinker import DocumentLoaderTesseract, Extractor, Contract | ||
|
||
2. Install the required Python packages. | ||
```sh | ||
pip install -r requirements.txt | ||
``` | ||
load_dotenv() | ||
cwd = os.getcwd() | ||
|
||
3. Run fast api | ||
```sh | ||
uvicorn main:app --reload | ||
``` | ||
class InvoiceContract(Contract): | ||
invoice_number: str | ||
invoice_date: str | ||
|
||
tesseract_path = os.getenv("TESSERACT_PATH") | ||
test_file_path = os.path.join(cwd, "test_images", "invoice.png") | ||
|
||
extractor = Extractor() | ||
extractor.load_document_loader( | ||
DocumentLoaderTesseract(tesseract_path) | ||
) | ||
extractor.load_llm("claude-3-haiku-20240307") | ||
|
||
4. Go to the Swagger page: | ||
http://localhost:8000/docs | ||
result = extractor.extract(test_file_path, InvoiceContract) | ||
|
||
## Running with Docker | ||
1. Build the Docker image. | ||
```sh | ||
docker build -t your-image-name . | ||
print("Invoice Number: ", result.invoice_number) | ||
print("Invoice Date: ", result.invoice_date) | ||
``` | ||
|
||
2. Run the Docker container. | ||
```sh | ||
docker run -p 8000:8000 your-image-name | ||
## Splitting Files Example | ||
You can also split and process documents using extract_thinker. Here's how you can do it: | ||
|
||
```python | ||
import os | ||
from dotenv import load_dotenv | ||
from extract_thinker import DocumentLoaderTesseract, Extractor, Process, Classification, ImageSplitter, Contract | ||
|
||
load_dotenv() | ||
|
||
class DriverLicense(Contract): | ||
# Define your DriverLicense contract fields here | ||
pass | ||
|
||
class InvoiceContract(Contract): | ||
invoice_number: str | ||
invoice_date: str | ||
|
||
extractor = Extractor() | ||
extractor.load_document_loader(DocumentLoaderTesseract(os.getenv("TESSERACT_PATH"))) | ||
extractor.load_llm("gpt-3.5-turbo") | ||
|
||
classifications = [ | ||
Classification(name="Driver License", description="This is a driver license", contract=DriverLicense, extractor=extractor), | ||
Classification(name="Invoice", description="This is an invoice", contract=InvoiceContract, extractor=extractor) | ||
] | ||
|
||
process = Process() | ||
process.load_document_loader(DocumentLoaderTesseract(os.getenv("TESSERACT_PATH"))) | ||
process.load_splitter(ImageSplitter()) | ||
|
||
path = "..." | ||
|
||
split_content = process.load_file(path)\ | ||
.split(classifications)\ | ||
.extract() | ||
|
||
# Process the split_content as needed | ||
``` | ||
|
||
3. Go to the Swagger page: | ||
http://localhost:8000/docs | ||
## Infrastructure | ||
|
||
The `extract_thinker` project is inspired by the LangChain ecosystem, featuring a modular infrastructure with templates, components, and core functions to facilitate robust document extraction and processing. | ||
|
||
<p align="center"> | ||
<img src="https://github.com/enoch3712/Open-DocLLM/assets/9283394/996fb2de-0558-4f13-ab3d-7ea56a593951" alt="Extract Thinker Logo" width="400"/> | ||
</p> | ||
|
||
## Why Not Just LangChain? | ||
While LangChain is a generalized framework designed for a wide array of use cases, extract_thinker is specifically focused on Intelligent Document Processing (IDP). Although achieving 100% accuracy in IDP remains a challenge, leveraging LLMs brings us significantly closer to this goal. | ||
|
||
## Additional Examples | ||
You can find more examples in the repository. These examples cover various use cases and demonstrate the flexibility of extract_thinker. Also check out the author's Medium page, which contains several examples of the library in use. | ||
|
||
## Contributing | ||
We welcome contributions from the community! If you would like to contribute, please follow these steps: | ||
|
||
## Advanced Cases: 1 Million token context | ||
The project also explores advanced cases like a 1 million token context using LLM Lingua and Mistral Yarn 128k context window. | ||
1. Fork the repository. | ||
2. Create a new branch for your feature or bugfix. | ||
3. Write tests for your changes. | ||
4. Run tests to ensure everything is working correctly. | ||
5. Submit a pull request with a description of your changes. | ||
|
||
## Conclusion | ||
The integration of OCR and LLM technologies in this project marks a pivotal advancement in analyzing unstructured data. The combination of open-source projects like Tesseract and Mistral makes a perfect implementation that could be used in an on-premise use case. | ||
## License | ||
This project is licensed under the Apache License 2.0. See the LICENSE file for more details. | ||
|
||
## References & Documents | ||
1. [DOCLLM: A LAYOUT-AWARE GENERATIVE LANGUAGE MODEL FOR MULTIMODAL DOCUMENT UNDERSTANDING](https://arxiv.org/pdf/2401.00908.pdf) | ||
2. [YaRN: Efficient Context Window Extension of Large Language Models](https://arxiv.org/pdf/2309.00071.pdf) | ||
## Contact | ||
For any questions or issues, please open an issue on the GitHub repository. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
"""Example script: extract invoice fields from an image with Tesseract OCR and an LLM.

The Tesseract location is read from the ``TESSERACT_PATH`` environment
variable (after ``load_dotenv()`` has run), and the input image is expected at
``tests/test_images/invoice.png`` relative to the current working directory.
"""
import os

from dotenv import load_dotenv

from extract_thinker import DocumentLoaderTesseract, Extractor, Contract

load_dotenv()
cwd = os.getcwd()


# Contract describing the fields to extract from the invoice.
# (Documented with a comment rather than a docstring so the class definition
# itself stays exactly as in the original.)
class InvoiceContract(Contract):
    invoice_number: str
    invoice_date: str


tesseract_path = os.getenv("TESSERACT_PATH")
test_file_path = os.path.join(cwd, "tests", "test_images", "invoice.png")

extractor = Extractor()
extractor.load_document_loader(
    DocumentLoaderTesseract(tesseract_path)
)
extractor.load_llm("claude-3-haiku-20240307")

result = extractor.extract(test_file_path, InvoiceContract)

# Guard the field access: previously the script printed "Extraction failed."
# and then dereferenced ``result`` anyway, crashing with AttributeError.
# Exit with a non-zero status so a failed extraction is still reported as a
# failure to the caller/CI.
if result is None:
    print("Extraction failed.")
    raise SystemExit(1)

print("Extraction successful.")
print("Invoice Number: ", result.invoice_number)
print("Invoice Date: ", result.invoice_date)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
from .extractor import Extractor | ||
from .document_loader.document_loader import DocumentLoader | ||
from .document_loader.cached_document_loader import CachedDocumentLoader | ||
from .document_loader.document_loader_tesseract import DocumentLoaderTesseract | ||
from .models import classification, classification_response | ||
from .process import Process | ||
from .splitter import Splitter | ||
from .image_splitter import ImageSplitter | ||
from .models.classification import Classification | ||
from .models.contract import Contract | ||
|
||
|
||
# Public names exported by `from extract_thinker import *`. Every entry is
# imported at the top of this module; `classification` comes from `.models`
# (the same `.models.classification` that provides `Classification`), and
# `classification_response` is presumably its sibling submodule - TODO confirm.
__all__ = ['Extractor', 'DocumentLoader', 'CachedDocumentLoader', 'DocumentLoaderTesseract', 'classification', 'classification_response', 'Process', 'Splitter', 'ImageSplitter', 'Classification', 'Contract'] |
Oops, something went wrong.