Skip to content

Commit

Permalink
Merge pull request #39 from enoch3712/38-documentloaderpdfplumber
Browse files Browse the repository at this point in the history
Pdfplumber added
  • Loading branch information
enoch3712 authored Oct 3, 2024
2 parents 9092ddb + 541314d commit d1d0b4e
Show file tree
Hide file tree
Showing 5 changed files with 270 additions and 7 deletions.
2 changes: 2 additions & 0 deletions extract_thinker/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .document_loader.document_loader_spreadsheet import DocumentLoaderSpreadSheet
from .document_loader.document_loader_azure_document_intelligence import DocumentLoaderAzureForm
from .document_loader.document_loader_pypdf import DocumentLoaderPyPdf
from .document_loader.document_loader_pdfplumber import DocumentLoaderPdfPlumber
from .models import classification, classification_response
from .process import Process, ClassificationStrategy
from .splitter import Splitter
Expand All @@ -23,6 +24,7 @@
'DocumentLoaderSpreadSheet',
'DocumentLoaderAzureForm',
'DocumentLoaderPyPdf',
'DocumentLoaderPdfPlumber',
'classification',
'classification_response',
'Process',
Expand Down
55 changes: 55 additions & 0 deletions extract_thinker/document_loader/document_loader_pdfplumber.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import io
from typing import Any, Dict, List, Union

import pdfplumber

from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader
from extract_thinker.utils import get_file_extension

# File extensions accepted by DocumentLoaderPdfPlumber.load_content_from_file.
# The extension is lower-cased before it is compared against this list.
# NOTE(review): entries presumably omit the leading dot — confirm against
# get_file_extension.
SUPPORTED_FORMATS = ['pdf']

class DocumentLoaderPdfPlumber(CachedDocumentLoader):
    """Document loader that extracts text lines and tables from PDFs with pdfplumber."""

    def __init__(self, content: Any = None, cache_ttl: int = 300):
        """Pass ``content`` and ``cache_ttl`` straight through to the base loader."""
        super().__init__(content, cache_ttl)

    def load_content_from_file(self, file_path: str) -> Union[str, Dict[str, Any]]:
        """Open the PDF at ``file_path`` and return its extracted text and tables.

        The file extension is lower-cased and checked against
        ``SUPPORTED_FORMATS`` before the file is opened.

        Raises:
            Exception: for any failure, with the message prefixed by
                "Error processing file: " and the original error chained.
                The unsupported-type error is raised inside the same ``try``
                and is therefore wrapped with that prefix as well.
        """
        try:
            extension = get_file_extension(file_path).lower()
            if extension in SUPPORTED_FORMATS:
                with pdfplumber.open(file_path) as document:
                    return self.extract_data_from_pdf(document)
            raise Exception(f"Unsupported file type: {file_path}")
        except Exception as error:
            raise Exception(f"Error processing file: {error}") from error

    def load_content_from_stream(self, stream: io.BytesIO) -> Union[str, Dict[str, Any]]:
        """Open a PDF from ``stream`` and return its extracted text and tables.

        No extension check is performed for streams.

        Raises:
            Exception: for any failure, with the message prefixed by
                "Error processing stream: " and the original error chained.
        """
        try:
            with pdfplumber.open(stream) as document:
                return self.extract_data_from_pdf(document)
        except Exception as error:
            raise Exception(f"Error processing stream: {error}") from error

    def extract_data_from_pdf(self, pdf: pdfplumber.PDF) -> Dict[str, Any]:
        """Collect the text lines and tables of every page in ``pdf``.

        Returns:
            A dict with two keys: ``"text"``, the lines of all pages in page
            order (each page's text split on ``'\\n'``), and ``"tables"``,
            every table returned by ``extract_tables()`` in page order.
        """
        text_lines: List[str] = []
        all_tables: List[Any] = []

        for page in pdf.pages:
            content = page.extract_text()
            # Pages that yield no text (None or an empty string) add no lines.
            if content:
                text_lines += content.split('\n')

            all_tables.extend(page.extract_tables())

        return {"text": text_lines, "tables": all_tables}

    def load_content_from_file_list(self, file_paths: List[str]) -> List[Dict[str, Any]]:
        """Load each path in ``file_paths`` in order and return the results as a list."""
        return list(map(self.load_content_from_file, file_paths))

    def load_content_from_stream_list(self, streams: List[io.BytesIO]) -> List[Dict[str, Any]]:
        """Load each stream in ``streams`` in order and return the results as a list."""
        return list(map(self.load_content_from_stream, streams))
Loading

0 comments on commit d1d0b4e

Please sign in to comment.