-
Notifications
You must be signed in to change notification settings - Fork 90
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #39 from enoch3712/38-documentloaderpdfplumber
Pdfplumber added
- Loading branch information
Showing
5 changed files
with
270 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
55 changes: 55 additions & 0 deletions
55
extract_thinker/document_loader/document_loader_pdfplumber.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
import io | ||
from typing import Any, Dict, List, Union | ||
|
||
import pdfplumber | ||
|
||
from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader | ||
from extract_thinker.utils import get_file_extension | ||
|
||
# File extensions this loader accepts. load_content_from_file compares the
# lowercased result of get_file_extension(file_path) against this list.
SUPPORTED_FORMATS = ['pdf'] | ||
|
||
class DocumentLoaderPdfPlumber(CachedDocumentLoader):
    """Document loader that reads PDF text and tables through pdfplumber.

    Extracted content is returned as a dict with two keys: ``"text"`` (a flat
    list of text lines gathered from every page) and ``"tables"`` (a flat list
    of the tables reported by every page, in page order).
    """

    def __init__(self, content: Any = None, cache_ttl: int = 300):
        """Forward ``content`` and ``cache_ttl`` to the parent loader."""
        super().__init__(content, cache_ttl)

    def load_content_from_file(self, file_path: str) -> Union[str, Dict[str, Any]]:
        """Open the PDF at ``file_path`` and extract its text and tables.

        Args:
            file_path: Path of the file to read.

        Returns:
            The dict produced by ``extract_data_from_pdf``.

        Raises:
            Exception: If the extension is not in ``SUPPORTED_FORMATS`` or if
                opening/extracting fails. Every failure, including the
                unsupported-type one, is re-raised wrapped in an
                "Error processing file" exception chained to the cause.
        """
        try:
            extension = get_file_extension(file_path).lower()
            if extension not in SUPPORTED_FORMATS:
                raise Exception(f"Unsupported file type: {file_path}")

            with pdfplumber.open(file_path) as opened_pdf:
                return self.extract_data_from_pdf(opened_pdf)
        except Exception as err:
            raise Exception(f"Error processing file: {err}") from err

    def load_content_from_stream(self, stream: io.BytesIO) -> Union[str, Dict[str, Any]]:
        """Open a PDF from an in-memory stream and extract its text and tables.

        Args:
            stream: Binary stream handed directly to ``pdfplumber.open``.

        Returns:
            The dict produced by ``extract_data_from_pdf``.

        Raises:
            Exception: Any failure is re-raised wrapped in an
                "Error processing stream" exception chained to the cause.
        """
        try:
            with pdfplumber.open(stream) as opened_pdf:
                return self.extract_data_from_pdf(opened_pdf)
        except Exception as err:
            raise Exception(f"Error processing stream: {err}") from err

    def extract_data_from_pdf(self, pdf: pdfplumber.PDF) -> Dict[str, Any]:
        """Collect text lines and tables from every page of ``pdf``.

        Args:
            pdf: An open pdfplumber document.

        Returns:
            ``{"text": [...], "tables": [...]}`` where ``"text"`` holds each
            page's extracted text split on newlines and ``"tables"`` holds
            every table returned by each page's ``extract_tables()``.
        """
        text_lines: List[str] = []
        all_tables: List[Any] = []

        for page in pdf.pages:
            # Pages with no extractable text yield a falsy value; skip them.
            raw_text = page.extract_text()
            if raw_text:
                text_lines.extend(raw_text.split('\n'))

            all_tables.extend(page.extract_tables())

        return {"text": text_lines, "tables": all_tables}

    def load_content_from_file_list(self, file_paths: List[str]) -> List[Dict[str, Any]]:
        """Run ``load_content_from_file`` on each path, preserving order."""
        return list(map(self.load_content_from_file, file_paths))

    def load_content_from_stream_list(self, streams: List[io.BytesIO]) -> List[Dict[str, Any]]:
        """Run ``load_content_from_stream`` on each stream, preserving order."""
        return list(map(self.load_content_from_stream, streams))
Oops, something went wrong.