diff --git a/extract_thinker/document_loader/document_loader_docling.py b/extract_thinker/document_loader/document_loader_docling.py index b462243..6d01e9b 100644 --- a/extract_thinker/document_loader/document_loader_docling.py +++ b/extract_thinker/document_loader/document_loader_docling.py @@ -12,19 +12,39 @@ class DoclingConfig: """Configuration for Docling document loader. + This class supports both simple and complex configurations: + + Simple usage: + config = DoclingConfig() # Uses default settings + + Complex usage: + config = DoclingConfig( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=PdfPipelineOptions( + do_table_structure=True, + do_ocr=True, + table_structure_options=TableStructureOptions( + do_cell_matching=True + ) + ) + ) + } + ) + Args: content: Initial content (optional) cache_ttl: Cache time-to-live in seconds (default: 300) - format_options: Dictionary mapping input formats to their FormatOption configurations - Example: + format_options: Dictionary mapping input formats to their FormatOption configurations. + If None, default options will be created based on other parameters. + For complex scenarios, you can provide your own format options: { InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options), InputFormat.IMAGE: ImageFormatOption(pipeline_options=image_options), ... } - ocr_enabled: Whether to enable OCR processing (default: True) + ocr_enabled: Whether to enable OCR processing (default: False) table_structure_enabled: Whether to enable table structure detection (default: True) - tesseract_cmd: Path to tesseract executable (default: None) force_full_page_ocr: Whether to force OCR on entire pages (default: False) do_cell_matching: Whether to enable cell matching in tables (default: True) """ @@ -32,50 +52,43 @@ class DoclingConfig: content: Optional[Any] = None cache_ttl: int = 300 format_options: Optional[Dict[str, Any]] = None - ocr_enabled: bool = True + ocr_enabled: bool = False # OCR disabled by default table_structure_enabled: bool = True - tesseract_cmd: Optional[str] = None force_full_page_ocr: bool = False do_cell_matching: bool = True def __post_init__(self): - """Initialize format options based on configuration.""" - if self.format_options is None: - from docling.datamodel.pipeline_options import ( - PdfPipelineOptions, - TesseractCliOcrOptions, - TableStructureOptions, + """Initialize format options if not provided.""" + # If format_options are provided, use them as is (complex configuration) + if self.format_options is not None: + return + + # Simple configuration: create default format options based on parameters + from docling.datamodel.pipeline_options import ( + PdfPipelineOptions, + TableStructureOptions, + ) + from docling.datamodel.base_models import InputFormat + from docling.document_converter import PdfFormatOption + + # Set up table options + table_options = None + if self.table_structure_enabled: + table_options = TableStructureOptions( + do_cell_matching=self.do_cell_matching ) - from docling.datamodel.base_models import InputFormat - from docling.document_converter import PdfFormatOption - - # Set up OCR options - ocr_options = None - if self.ocr_enabled: - ocr_options = TesseractCliOcrOptions( - force_full_page_ocr=self.force_full_page_ocr, - tesseract_cmd=self.tesseract_cmd - ) - - # Set up table options - table_options = None - if self.table_structure_enabled: - table_options = TableStructureOptions( - do_cell_matching=self.do_cell_matching - ) - # Create pipeline options - pipeline_options = PdfPipelineOptions( - do_table_structure=self.table_structure_enabled, - do_ocr=self.ocr_enabled, - ocr_options=ocr_options, - table_structure_options=table_options - ) + # Create pipeline options + pipeline_options = PdfPipelineOptions( + do_table_structure=self.table_structure_enabled, + do_ocr=self.ocr_enabled, + table_structure_options=table_options + ) - # Create format options - self.format_options = { - InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) - } + # Create format options + self.format_options = { + InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) + } class DocumentLoaderDocling(CachedDocumentLoader): diff --git a/tests/test_document_loader_docling.py b/tests/test_document_loader_docling.py index efee356..aacbbb9 100644 --- a/tests/test_document_loader_docling.py +++ b/tests/test_document_loader_docling.py @@ -11,11 +11,21 @@ TesseractCliOcrOptions, TableStructureOptions, ) - from docling.datamodel.base_models import InputFormat -from docling.document_converter import PdfFormatOption, ImageFormatOption +from docling.document_converter import PdfFormatOption + class TestDocumentLoaderDocling(BaseDocumentLoaderTest): + @pytest.fixture + def test_file_path(self): + current_dir = os.path.dirname(os.path.abspath(__file__)) + return os.path.join(current_dir, 'files', 'invoice.pdf') + + @pytest.fixture + def loader(self): + """Required fixture from BaseDocumentLoaderTest - returns a basic loader instance""" + return DocumentLoaderDocling() + @pytest.fixture def default_pipeline_options(self): """Default pipeline options for testing""" @@ -35,56 +45,72 @@ def default_pipeline_options(self): table_structure_options=table_options ) - @pytest.fixture - def docling_config(self, default_pipeline_options): - """Default Docling configuration for testing""" - format_options = { - InputFormat.PDF: PdfFormatOption(pipeline_options=default_pipeline_options) - } - return DoclingConfig( - format_options=format_options, - ocr_enabled=True, + def test_simple_initialization(self, test_file_path, loader): + """Test simple initialization without any configuration""" + # Basic load and verify + pages = loader.load(test_file_path) + assert isinstance(pages, list) + assert len(pages) > 0 + assert "content" in pages[0] + assert isinstance(pages[0]["content"], str) + assert len(pages[0]["content"]) > 0 + + def test_simple_config(self, test_file_path): + """Test simple configuration with basic options""" + config = DoclingConfig( + ocr_enabled=False, table_structure_enabled=True, - tesseract_cmd="/opt/homebrew/bin/tesseract", - force_full_page_ocr=True, do_cell_matching=True ) + loader = DocumentLoaderDocling(config) + + pages = loader.load(test_file_path) + assert isinstance(pages, list) + assert len(pages) > 0 + assert "content" in pages[0] - @pytest.fixture - def loader(self, docling_config): - return DocumentLoaderDocling(docling_config) - - @pytest.fixture - def loader_no_ocr(self): - """Loader instance with OCR disabled""" - return DocumentLoaderDocling( - DoclingConfig( - ocr_enabled=False, - table_structure_enabled=True + def test_complex_config(self, test_file_path): + """Test complex configuration with custom format options""" + # Set up pipeline options + pipeline_options = PdfPipelineOptions( + do_table_structure=True, + do_ocr=False, + table_structure_options=TableStructureOptions( + do_cell_matching=True ) ) - - @pytest.fixture - def test_file_path(self): - current_dir = os.path.dirname(os.path.abspath(__file__)) - return os.path.join(current_dir, 'files', 'invoice.pdf') - - def test_docling_specific_content(self, loader, test_file_path): - """Test Docling-specific content extraction""" - pages = loader.load(test_file_path) + # Create format options + format_options = { + InputFormat.PDF: PdfFormatOption( + pipeline_options=pipeline_options + ) + } + + # Create config with format options + config = DoclingConfig(format_options=format_options) + loader = DocumentLoaderDocling(config) + + pages = loader.load(test_file_path) assert isinstance(pages, list) assert len(pages) > 0 - - first_page = pages[0] - assert "content" in first_page - assert len(first_page["content"]) > 0 + assert "content" in pages[0] - def test_vision_mode(self, loader, test_file_path): + def test_stream_loading(self, test_file_path, loader): + """Test loading from BytesIO stream""" + with open(test_file_path, 'rb') as f: + stream = BytesIO(f.read()) + pages = loader.load(stream) + + assert isinstance(pages, list) + assert len(pages) > 0 + assert "content" in pages[0] + + def test_vision_mode(self, test_file_path, loader): """Test vision mode functionality""" loader.set_vision_mode(True) - pages = loader.load(test_file_path) + pages = loader.load(test_file_path) assert isinstance(pages, list) assert len(pages) > 0 @@ -95,20 +121,9 @@ def test_vision_mode(self, loader, test_file_path): assert "image" in page assert isinstance(page["image"], bytes) - def test_stream_loading(self, loader, test_file_path): - """Test loading from BytesIO stream""" - with open(test_file_path, 'rb') as f: - stream = BytesIO(f.read()) - pages = loader.load(stream) - - assert isinstance(pages, list) - assert len(pages) > 0 - assert "content" in pages[0] - - def test_pagination(self, loader, test_file_path): + def test_pagination(self, test_file_path, loader): """Test pagination functionality""" pages = loader.load(test_file_path) - assert isinstance(pages, list) if loader.can_handle_paginate(test_file_path): assert len(pages) > 0 @@ -116,44 +131,78 @@ def test_pagination(self, loader, test_file_path): assert "content" in page assert isinstance(page["content"], str) - def test_no_ocr_loading(self, loader_no_ocr, test_file_path): - """Test loading with OCR disabled""" - pages = loader_no_ocr.load(test_file_path) + def test_supported_formats(self, loader): + """Test that supported formats are correctly defined""" + assert isinstance(loader.SUPPORTED_FORMATS, list) + assert "pdf" in loader.SUPPORTED_FORMATS + assert "docx" in loader.SUPPORTED_FORMATS + assert "txt" in loader.SUPPORTED_FORMATS + + def test_ocr_disabled(self, test_file_path): + """Test that OCR is disabled by default""" + config = DoclingConfig() # Default config + loader = DocumentLoaderDocling(config) + pages = loader.load(test_file_path) assert isinstance(pages, list) assert len(pages) > 0 assert "content" in pages[0] - def test_config_features(self, test_file_path): - """Test various configuration features""" - # Test with custom OCR settings + def test_ocr_enabled(self, test_file_path, default_pipeline_options): + """Test with OCR enabled using tesseract""" + # Create format options with OCR + format_options = { + InputFormat.PDF: PdfFormatOption( + pipeline_options=default_pipeline_options + ) + } + config = DoclingConfig( + format_options=format_options, ocr_enabled=True, - tesseract_cmd="/opt/homebrew/bin/tesseract", force_full_page_ocr=True ) loader = DocumentLoaderDocling(config) + pages = loader.load(test_file_path) + assert isinstance(pages, list) assert len(pages) > 0 + assert "content" in pages[0] - # Test with custom table settings + def test_custom_ocr_config(self, test_file_path): + """Test with custom OCR configuration""" + # Set up OCR options + ocr_options = TesseractCliOcrOptions( + force_full_page_ocr=True, + tesseract_cmd="/opt/homebrew/bin/tesseract" + ) + + # Set up pipeline options with OCR + pipeline_options = PdfPipelineOptions( + do_table_structure=True, + do_ocr=True, + ocr_options=ocr_options, + table_structure_options=TableStructureOptions( + do_cell_matching=True + ) + ) + + # Create format options + format_options = { + InputFormat.PDF: PdfFormatOption( + pipeline_options=pipeline_options + ) + } + + # Create config with OCR enabled config = DoclingConfig( - table_structure_enabled=True, - do_cell_matching=False + format_options=format_options, + ocr_enabled=True, + force_full_page_ocr=True ) loader = DocumentLoaderDocling(config) - pages = loader.load(test_file_path) - assert len(pages) > 0 - - def test_simple_initialization(self, test_file_path): - """Test simple initialization and basic functionality without any special configurations""" - # Simple initialization like before - loader = DocumentLoaderDocling() - # Basic load and verify pages = loader.load(test_file_path) assert isinstance(pages, list) assert len(pages) > 0 assert "content" in pages[0] - assert isinstance(pages[0]["content"], str) - assert len(pages[0]["content"]) > 0 # Should have extracted some text