Skip to content

Commit

Permalink
Docling refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
enoch3712 committed Jan 17, 2025
1 parent 6dd3cd9 commit d845e92
Show file tree
Hide file tree
Showing 2 changed files with 173 additions and 111 deletions.
93 changes: 53 additions & 40 deletions extract_thinker/document_loader/document_loader_docling.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,70 +12,83 @@
class DoclingConfig:
"""Configuration for Docling document loader.
This class supports both simple and complex configurations:
Simple usage:
config = DoclingConfig() # Uses default settings
Complex usage:
config = DoclingConfig(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_options=PdfPipelineOptions(
do_table_structure=True,
do_ocr=True,
table_structure_options=TableStructureOptions(
do_cell_matching=True
)
)
)
}
)
Args:
content: Initial content (optional)
cache_ttl: Cache time-to-live in seconds (default: 300)
- format_options: Dictionary mapping input formats to their FormatOption configurations
- Example:
+ format_options: Dictionary mapping input formats to their FormatOption configurations.
+ If None, default options will be created based on other parameters.
+ For complex scenarios, you can provide your own format options:
{
InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options),
InputFormat.IMAGE: ImageFormatOption(pipeline_options=image_options),
...
}
- ocr_enabled: Whether to enable OCR processing (default: True)
+ ocr_enabled: Whether to enable OCR processing (default: False)
table_structure_enabled: Whether to enable table structure detection (default: True)
tesseract_cmd: Path to tesseract executable (default: None)
force_full_page_ocr: Whether to force OCR on entire pages (default: False)
do_cell_matching: Whether to enable cell matching in tables (default: True)
"""
# Optional parameters
content: Optional[Any] = None
cache_ttl: int = 300
format_options: Optional[Dict[str, Any]] = None
- ocr_enabled: bool = True
+ ocr_enabled: bool = False # OCR disabled by default
table_structure_enabled: bool = True
tesseract_cmd: Optional[str] = None
force_full_page_ocr: bool = False
do_cell_matching: bool = True

def __post_init__(self):
"""Initialize format options based on configuration."""
if self.format_options is None:
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
TesseractCliOcrOptions,
TableStructureOptions,
"""Initialize format options if not provided."""
# If format_options are provided, use them as is (complex configuration)
if self.format_options is not None:
return

# Simple configuration: create default format options based on parameters
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
TableStructureOptions,
)
from docling.datamodel.base_models import InputFormat
from docling.document_converter import PdfFormatOption

# Set up table options
table_options = None
if self.table_structure_enabled:
table_options = TableStructureOptions(
do_cell_matching=self.do_cell_matching
)
from docling.datamodel.base_models import InputFormat
from docling.document_converter import PdfFormatOption

# Set up OCR options
ocr_options = None
if self.ocr_enabled:
ocr_options = TesseractCliOcrOptions(
force_full_page_ocr=self.force_full_page_ocr,
tesseract_cmd=self.tesseract_cmd
)

# Set up table options
table_options = None
if self.table_structure_enabled:
table_options = TableStructureOptions(
do_cell_matching=self.do_cell_matching
)

# Create pipeline options
pipeline_options = PdfPipelineOptions(
do_table_structure=self.table_structure_enabled,
do_ocr=self.ocr_enabled,
ocr_options=ocr_options,
table_structure_options=table_options
)
# Create pipeline options
pipeline_options = PdfPipelineOptions(
do_table_structure=self.table_structure_enabled,
do_ocr=self.ocr_enabled,
table_structure_options=table_options
)

# Create format options
self.format_options = {
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
# Create format options
self.format_options = {
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}


class DocumentLoaderDocling(CachedDocumentLoader):
Expand Down
191 changes: 120 additions & 71 deletions tests/test_document_loader_docling.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,21 @@
TesseractCliOcrOptions,
TableStructureOptions,
)

from docling.datamodel.base_models import InputFormat
- from docling.document_converter import PdfFormatOption, ImageFormatOption
+ from docling.document_converter import PdfFormatOption


class TestDocumentLoaderDocling(BaseDocumentLoaderTest):
@pytest.fixture
def test_file_path(self):
current_dir = os.path.dirname(os.path.abspath(__file__))
return os.path.join(current_dir, 'files', 'invoice.pdf')

@pytest.fixture
def loader(self):
"""Required fixture from BaseDocumentLoaderTest - returns a basic loader instance"""
return DocumentLoaderDocling()

@pytest.fixture
def default_pipeline_options(self):
"""Default pipeline options for testing"""
Expand All @@ -35,56 +45,72 @@ def default_pipeline_options(self):
table_structure_options=table_options
)

@pytest.fixture
def docling_config(self, default_pipeline_options):
"""Default Docling configuration for testing"""
format_options = {
InputFormat.PDF: PdfFormatOption(pipeline_options=default_pipeline_options)
}
return DoclingConfig(
format_options=format_options,
ocr_enabled=True,
def test_simple_initialization(self, test_file_path, loader):
"""Test simple initialization without any configuration"""
# Basic load and verify
pages = loader.load(test_file_path)
assert isinstance(pages, list)
assert len(pages) > 0
assert "content" in pages[0]
assert isinstance(pages[0]["content"], str)
assert len(pages[0]["content"]) > 0

def test_simple_config(self, test_file_path):
"""Test simple configuration with basic options"""
config = DoclingConfig(
ocr_enabled=False,
table_structure_enabled=True,
tesseract_cmd="/opt/homebrew/bin/tesseract",
force_full_page_ocr=True,
do_cell_matching=True
)
loader = DocumentLoaderDocling(config)

pages = loader.load(test_file_path)
assert isinstance(pages, list)
assert len(pages) > 0
assert "content" in pages[0]

@pytest.fixture
def loader(self, docling_config):
return DocumentLoaderDocling(docling_config)

@pytest.fixture
def loader_no_ocr(self):
"""Loader instance with OCR disabled"""
return DocumentLoaderDocling(
DoclingConfig(
ocr_enabled=False,
table_structure_enabled=True
def test_complex_config(self, test_file_path):
"""Test complex configuration with custom format options"""
# Set up pipeline options
pipeline_options = PdfPipelineOptions(
do_table_structure=True,
do_ocr=False,
table_structure_options=TableStructureOptions(
do_cell_matching=True
)
)

@pytest.fixture
def test_file_path(self):
current_dir = os.path.dirname(os.path.abspath(__file__))
return os.path.join(current_dir, 'files', 'invoice.pdf')

def test_docling_specific_content(self, loader, test_file_path):
"""Test Docling-specific content extraction"""
pages = loader.load(test_file_path)

# Create format options
format_options = {
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options
)
}

# Create config with format options
config = DoclingConfig(format_options=format_options)
loader = DocumentLoaderDocling(config)

pages = loader.load(test_file_path)
assert isinstance(pages, list)
assert len(pages) > 0

first_page = pages[0]
assert "content" in first_page
assert len(first_page["content"]) > 0
assert "content" in pages[0]

def test_vision_mode(self, loader, test_file_path):
def test_stream_loading(self, test_file_path, loader):
"""Test loading from BytesIO stream"""
with open(test_file_path, 'rb') as f:
stream = BytesIO(f.read())
pages = loader.load(stream)

assert isinstance(pages, list)
assert len(pages) > 0
assert "content" in pages[0]

def test_vision_mode(self, test_file_path, loader):
"""Test vision mode functionality"""
loader.set_vision_mode(True)
pages = loader.load(test_file_path)

pages = loader.load(test_file_path)
assert isinstance(pages, list)
assert len(pages) > 0

Expand All @@ -95,65 +121,88 @@ def test_vision_mode(self, loader, test_file_path):
assert "image" in page
assert isinstance(page["image"], bytes)

def test_stream_loading(self, loader, test_file_path):
"""Test loading from BytesIO stream"""
with open(test_file_path, 'rb') as f:
stream = BytesIO(f.read())
pages = loader.load(stream)

assert isinstance(pages, list)
assert len(pages) > 0
assert "content" in pages[0]

- def test_pagination(self, loader, test_file_path):
+ def test_pagination(self, test_file_path, loader):
"""Test pagination functionality"""
pages = loader.load(test_file_path)

assert isinstance(pages, list)
if loader.can_handle_paginate(test_file_path):
assert len(pages) > 0
for page in pages:
assert "content" in page
assert isinstance(page["content"], str)

def test_no_ocr_loading(self, loader_no_ocr, test_file_path):
"""Test loading with OCR disabled"""
pages = loader_no_ocr.load(test_file_path)
def test_supported_formats(self, loader):
"""Test that supported formats are correctly defined"""
assert isinstance(loader.SUPPORTED_FORMATS, list)
assert "pdf" in loader.SUPPORTED_FORMATS
assert "docx" in loader.SUPPORTED_FORMATS
assert "txt" in loader.SUPPORTED_FORMATS

def test_ocr_disabled(self, test_file_path):
"""Test that OCR is disabled by default"""
config = DoclingConfig() # Default config
loader = DocumentLoaderDocling(config)

pages = loader.load(test_file_path)
assert isinstance(pages, list)
assert len(pages) > 0
assert "content" in pages[0]

def test_config_features(self, test_file_path):
"""Test various configuration features"""
# Test with custom OCR settings
def test_ocr_enabled(self, test_file_path, default_pipeline_options):
"""Test with OCR enabled using tesseract"""
# Create format options with OCR
format_options = {
InputFormat.PDF: PdfFormatOption(
pipeline_options=default_pipeline_options
)
}

config = DoclingConfig(
format_options=format_options,
ocr_enabled=True,
tesseract_cmd="/opt/homebrew/bin/tesseract",
force_full_page_ocr=True
)
loader = DocumentLoaderDocling(config)

pages = loader.load(test_file_path)
assert isinstance(pages, list)
assert len(pages) > 0
assert "content" in pages[0]

# Test with custom table settings
def test_custom_ocr_config(self, test_file_path):
"""Test with custom OCR configuration"""
# Set up OCR options
ocr_options = TesseractCliOcrOptions(
force_full_page_ocr=True,
tesseract_cmd="/opt/homebrew/bin/tesseract"
)

# Set up pipeline options with OCR
pipeline_options = PdfPipelineOptions(
do_table_structure=True,
do_ocr=True,
ocr_options=ocr_options,
table_structure_options=TableStructureOptions(
do_cell_matching=True
)
)

# Create format options
format_options = {
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options
)
}

# Create config with OCR enabled
config = DoclingConfig(
table_structure_enabled=True,
do_cell_matching=False
format_options=format_options,
ocr_enabled=True,
force_full_page_ocr=True
)
loader = DocumentLoaderDocling(config)
pages = loader.load(test_file_path)
assert len(pages) > 0

def test_simple_initialization(self, test_file_path):
"""Test simple initialization and basic functionality without any special configurations"""
# Simple initialization like before
loader = DocumentLoaderDocling()

# Basic load and verify
pages = loader.load(test_file_path)
assert isinstance(pages, list)
assert len(pages) > 0
assert "content" in pages[0]
assert isinstance(pages[0]["content"], str)
assert len(pages[0]["content"]) > 0 # Should have extracted some text

0 comments on commit d845e92

Please sign in to comment.