From edb3e04284ceb230450f1975371a1b69b01b0fb6 Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Wed, 22 May 2024 17:00:18 +0200 Subject: [PATCH 1/2] only check for layer identifiers in the first word of each line --- src/stratigraphy/extract.py | 2 +- .../util/layer_identifier_column.py | 41 ++++++++++--------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index d48f9bf5..cb9732aa 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -72,7 +72,7 @@ def process_page(page: fitz.Page, geometric_lines, language: str, **params: dict current_line_words = [] # Detect Layer Index Columns - layer_identifier_entries = find_layer_identifier_column_entries(words) + layer_identifier_entries = find_layer_identifier_column_entries(lines) layer_identifier_columns = ( find_layer_identifier_column(layer_identifier_entries) if layer_identifier_entries else [] ) diff --git a/src/stratigraphy/util/layer_identifier_column.py b/src/stratigraphy/util/layer_identifier_column.py index d0bb3990..b3ad0f59 100644 --- a/src/stratigraphy/util/layer_identifier_column.py +++ b/src/stratigraphy/util/layer_identifier_column.py @@ -7,7 +7,7 @@ from stratigraphy.util.depthcolumn import LayerDepthColumnEntry from stratigraphy.util.find_depth_columns import extract_layer_depth_interval_entries -from stratigraphy.util.line import TextWord +from stratigraphy.util.line import TextLine from stratigraphy.util.textblock import TextBlock @@ -34,13 +34,13 @@ def to_json(self): class LayerIdentifierColumn: """Class for a layer identifier column.""" - def __init__(self, words: list[TextWord]): + def __init__(self, entries: list[LayerIdentifierEntry]): """Initialize the LayerIdentifierColumn object. Args: - words (list[TextWord]): The entries corresponding to the layer indices. + entries (list[LayerIdentifierEntry]): The entries corresponding to the layer indices. """ - self.entries = [LayerIdentifierEntry(word.rect, word.text) for word in words] + self.entries = entries @property def max_x0(self) -> float: @@ -65,13 +65,13 @@ def rect(self) -> fitz.Rect: def rects(self) -> list[fitz.Rect]: return [entry.rect for entry in self.entries] - def add_entry(self, entry: TextWord): + def add_entry(self, entry: LayerIdentifierEntry): """Add a new layer identifier column entry to the layer identifier column. Args: - entry (TextWord): The layer identifier column entry to be added. + entry (LayerIdentifierEntry): The layer identifier column entry to be added. """ - self.entries.append(LayerIdentifierEntry(entry.rect, entry.text)) + self.entries.append(entry) def can_be_appended(self, rect: fitz.Rect) -> bool: """Checks if a new layer identifier column entry can be appended to the current layer identifier column. @@ -161,7 +161,7 @@ def to_json(self): } -def find_layer_identifier_column_entries(all_words: list[TextWord]) -> list: +def find_layer_identifier_column_entries(lines: list[TextLine]) -> list[LayerIdentifierEntry]: r"""Find the layer identifier column entries. Regex explanation: @@ -172,30 +172,31 @@ def find_layer_identifier_column_entries(all_words: list[TextWord]) -> list: This regular expression will match strings like "1)", "2)", "a)", "b)", "1a4)", "6de)", etc. Args: - all_words (list[TextWord]): The words to search for layer identifier columns. + lines (list[TextLine]): The lines to search for layer identifier columns. Returns: - list: The layer identifier column entries. + list[LayerIdentifierEntry]: The layer identifier column entries. """ entries = [] - for word in sorted(all_words, key=lambda word: word.rect.y0): - # TODO There are quite a few false positives such as "(ca. 10 cm)" where "cm)" would be matched currently. - # Could we avoid some of those examples by requiring that the word is at the start of a line and/or there are - # no other words immediately to the left of it? - regex = re.compile(r"\b[\da-z-]+\)") - match = regex.match(word.text) - if match and len(word.text) < 7: - entries.append(word) + for line in sorted(lines, key=lambda line: line.rect.y0): + if len(line.words) > 0: + # Only match in the first word of every line, to avoid e.g. matching with "cm)" in a material description + # containing an expression like "(diameter max 6 cm)". + first_word = line.words[0] + regex = re.compile(r"\b[\da-z-]+\)") + match = regex.match(first_word.text) + if match and len(first_word.text) < 7: + entries.append(LayerIdentifierEntry(first_word.rect, first_word.text)) return entries -def find_layer_identifier_column(entries: list[TextWord]) -> list[LayerIdentifierColumn]: +def find_layer_identifier_column(entries: list[LayerIdentifierEntry]) -> list[LayerIdentifierColumn]: """Find the layer identifier column given the index column entries. Note: Similar to find_depth_columns.find_depth_columns. Refactoring may be desired. Args: - entries (list[TextWord]): The layer identifier column entries. + entries (list[LayerIdentifierEntry]): The layer identifier column entries. Returns: list[LayerIdentifierColumn]: The found layer identifier columns. From c8a6a8b1e255b9a27db308b99a99add4d94cbe52 Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Wed, 22 May 2024 17:11:37 +0200 Subject: [PATCH 2/2] undo accidental rename --- src/stratigraphy/util/find_depth_columns.py | 4 ++-- src/stratigraphy/util/layer_identifier_column.py | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/stratigraphy/util/find_depth_columns.py b/src/stratigraphy/util/find_depth_columns.py index 75a3c2b4..5ba730a0 100644 --- a/src/stratigraphy/util/find_depth_columns.py +++ b/src/stratigraphy/util/find_depth_columns.py @@ -32,7 +32,7 @@ def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> lis entries.append(DepthColumnEntry(word.rect, value)) elif include_splits: # support for e.g. "1.10-1.60m" extracted as a single word - layer_depth_column_entry = extract_layer_depth_interval_entries(input_string, word.rect) + layer_depth_column_entry = extract_layer_depth_interval(input_string, word.rect) entries.extend( [layer_depth_column_entry.start, layer_depth_column_entry.end] if layer_depth_column_entry else [] ) @@ -47,7 +47,7 @@ def value_as_float(string_value: str) -> float: # noqa: D103 return abs(float(parsed_text)) -def extract_layer_depth_interval_entries( +def extract_layer_depth_interval( text: str, rect: fitz.Rect, require_start_of_string: bool = True ) -> LayerDepthColumnEntry | None: """Extracts a LayerDepthColumnEntry from a string. diff --git a/src/stratigraphy/util/layer_identifier_column.py b/src/stratigraphy/util/layer_identifier_column.py index dba9b258..17a56a86 100644 --- a/src/stratigraphy/util/layer_identifier_column.py +++ b/src/stratigraphy/util/layer_identifier_column.py @@ -5,7 +5,7 @@ import fitz from stratigraphy.util.depthcolumn import LayerDepthColumnEntry -from stratigraphy.util.find_depth_columns import extract_layer_depth_interval_entries +from stratigraphy.util.find_depth_columns import extract_layer_depth_interval from stratigraphy.util.line import TextLine from stratigraphy.util.textblock import TextBlock @@ -132,9 +132,7 @@ def get_depth_interval(self, block: TextBlock) -> LayerDepthColumnEntry: depth_entries = [] for line in block.lines: try: - layer_depth_entry = extract_layer_depth_interval_entries( - line.text, line.rect, require_start_of_string=False - ) + layer_depth_entry = extract_layer_depth_interval(line.text, line.rect, require_start_of_string=False) # require_start_of_string = False because the depth interval may not always start at the beginning # of the line e.g. "Remblais Heterogene: 0.00 - 0.5m" if layer_depth_entry: