Merge pull request #46 from swisstopo/feat/extend_geneva_layouts

Feat/extend geneva layouts
swisstopo · May 23, 2024 · d26fbf7 · d26fbf7 · github-actions · May 23, 2024
2 parents e156378 + 9bae4a1
commit d26fbf7
Show file tree

Hide file tree

Showing 3 changed files with 189 additions and 117 deletions.
diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py
@@ -5,7 +5,6 @@
 
 import fitz
 
-from stratigraphy import DATAPATH
 from stratigraphy.util import find_depth_columns
 from stratigraphy.util.dataclasses import Line
 from stratigraphy.util.depthcolumn import DepthColumn
@@ -16,6 +15,7 @@
 )
 from stratigraphy.util.interval import BoundaryInterval, Interval
 from stratigraphy.util.layer_identifier_column import (
+    LayerIdentifierColumn,
     find_layer_identifier_column,
     find_layer_identifier_column_entries,
 )
@@ -71,86 +71,55 @@ def process_page(page: fitz.Page, geometric_lines, language: str, **params: dict
             lines.append(TextLine(current_line_words))
             current_line_words = []
 
-    depth_column_entries = find_depth_columns.depth_column_entries(words, include_splits=True)
-    layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words)
-
-    used_entry_rects = []
-    for column in layer_depth_columns:
-        for entry in column.entries:
-            used_entry_rects.extend([entry.start.rect, entry.end.rect])
-
-    depth_column_entries = [
-        entry
-        for entry in find_depth_columns.depth_column_entries(words, include_splits=False)
-        if entry.rect not in used_entry_rects
-    ]
-    depth_columns: list[DepthColumn] = layer_depth_columns
-    depth_columns.extend(
-        find_depth_columns.find_depth_columns(
-            depth_column_entries, words, depth_column_params=params["depth_column_params"]
-        )
-    )
-
     # Detect Layer Index Columns
-    layer_identifier_entries = find_layer_identifier_column_entries(words)
+    layer_identifier_entries = find_layer_identifier_column_entries(lines)
     layer_identifier_columns = (
         find_layer_identifier_column(layer_identifier_entries) if layer_identifier_entries else []
     )
+    pairs = []
     if layer_identifier_columns:
-        layer_identifier_pairs = []
         for layer_identifier_column in layer_identifier_columns:
             material_description_rect = find_material_description_column(
                 lines, layer_identifier_column, language, **params["material_description"]
             )
             if material_description_rect:
-                layer_identifier_pairs.append((layer_identifier_column, material_description_rect))
-
-        # Obtain the best pair. In contrast to depth columns, there only ever is one layer index column per page.
-        if layer_identifier_pairs:
-            layer_identifier_pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1]))
-            layer_identifier_column, material_description_rect = layer_identifier_pairs[-1]
-            # split the material description rect into blocks.
-            description_lines = get_description_lines(lines, material_description_rect)
-            blocks = get_description_blocks_from_layer_identifier(layer_identifier_column.entries, description_lines)
-
-            predictions = [{"material_description": block.to_json()} for block in blocks]
-            predictions = parse_and_remove_empty_predictions(predictions)
-
-            json_filtered_pairs = [
-                {
-                    "depth_column": None,
-                    "material_description_rect": [
-                        material_description_rect.x0,
-                        material_description_rect.y0,
-                        material_description_rect.x1,
-                        material_description_rect.y1,
-                    ],
-                }
-            ]
-
-            # Visualization: To be dropped before merging to main.
-            for layer_identifier_column in layer_identifier_columns:
-                fitz.utils.draw_rect(
-                    page, layer_identifier_column.rect() * page.derotation_matrix, color=fitz.utils.getColor("blue")
-                )
-            for block in blocks:
-                fitz.utils.draw_rect(page, block.rect * page.derotation_matrix, color=fitz.utils.getColor("red"))
-            fitz.utils.draw_rect(
-                page, material_description_rect * page.derotation_matrix, color=fitz.utils.getColor("blue")
+                pairs.append((layer_identifier_column, material_description_rect))
+
+        # Obtain the best pair. In contrast to depth columns, there only ever is one layer index column per page.
+        if pairs:
+            pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1]))
+
+    # If there is a layer identifier column, then we use this directly.
+    # Else, we search for depth columns. We could also think of some scoring mechanism to decide which one to use.
+    if not pairs:
+        depth_column_entries = find_depth_columns.depth_column_entries(words, include_splits=True)
+        layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words)
+
+        used_entry_rects = []
+        for column in layer_depth_columns:
+            for entry in column.entries:
+                used_entry_rects.extend([entry.start.rect, entry.end.rect])
+
+        depth_column_entries = [
+            entry
+            for entry in find_depth_columns.depth_column_entries(words, include_splits=False)
+            if entry.rect not in used_entry_rects
+        ]
+        depth_columns: list[DepthColumn] = layer_depth_columns
+        depth_columns.extend(
+            find_depth_columns.find_depth_columns(
+                depth_column_entries, words, depth_column_params=params["depth_column_params"]
             )
-            page.parent.save(DATAPATH / "_temp" / "output.pdf", garbage=4, deflate=True, clean=True)
-
-            return predictions, json_filtered_pairs
-
-    pairs = []
-    for depth_column in depth_columns:
-        material_description_rect = find_material_description_column(
-            lines, depth_column, language, **params["material_description"]
         )
-        if material_description_rect:
-            pairs.append((depth_column, material_description_rect))
-    # lowest score first
-    pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1], words))
+
+        for depth_column in depth_columns:
+            material_description_rect = find_material_description_column(
+                lines, depth_column, language, **params["material_description"]
+            )
+            if material_description_rect:
+                pairs.append((depth_column, material_description_rect))
+        # lowest score first
+        pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1], words))
 
     to_delete = []
     for i, (_depth_column, material_description_rect) in enumerate(pairs):
@@ -257,7 +226,7 @@ def score_column_match(
 
 
 def match_columns(
-    depth_column: DepthColumn,
+    depth_column: DepthColumn | LayerIdentifierColumn,
     description_lines: list[TextLine],
     geometric_lines: list[Line],
     material_description_rect: fitz.Rect,
@@ -266,10 +235,11 @@ def match_columns(
     """Match the depth column entries with the description lines.
 
     This function identifies groups of depth intervals and text blocks that are likely to match.
-    In this process, the number of text blocks is adjusted to match the number of depth intervals.
+    Makes a distinction between DepthColumn and LayerIdentifierColumn and obtains the corresponding text blocks
+    as well as their depth intervals where present.
 
     Args:
-        depth_column (DepthColumn): The depth column.
+        depth_column (DepthColumn | LayerIdentifierColumn): The depth column or the layer identifier column.
         description_lines (list[TextLine]): The description lines.
         geometric_lines (list[Line]): The geometric lines.
         material_description_rect (fitz.Rect): The material description rectangle.
@@ -278,13 +248,28 @@ def match_columns(
     Returns:
         list: The matched depth intervals and text blocks.
     """
-    return [
-        element
-        for group in depth_column.identify_groups(
-            description_lines, geometric_lines, material_description_rect, **params
+    if isinstance(depth_column, DepthColumn):
+        return [
+            element
+            for group in depth_column.identify_groups(
+                description_lines, geometric_lines, material_description_rect, **params
+            )
+            for element in transform_groups(group["depth_intervals"], group["blocks"], **params)
+        ]
+    elif isinstance(depth_column, LayerIdentifierColumn):
+        blocks = get_description_blocks_from_layer_identifier(depth_column.entries, description_lines)
+        groups = []
+        for block in blocks:
+            depth_interval = depth_column.get_depth_interval(block)
+            if depth_interval:
+                groups.append({"depth_interval": depth_interval, "block": block})
+            else:
+                groups.append({"block": block})
+        return groups
+    else:
+        raise ValueError(
+            f"depth_column must be a DepthColumn or a LayerIdentifierColumn object. Got {type(depth_column)}."
         )
-        for element in transform_groups(group["depth_intervals"], group["blocks"], **params)
-    ]
 
 
 def transform_groups(

diff --git a/src/stratigraphy/util/find_depth_columns.py b/src/stratigraphy/util/find_depth_columns.py
@@ -21,12 +21,6 @@ def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> lis
     Returns:
         list[DepthColumnEntry]: The extracted depth column entries.
     """
-
-    def value_as_float(string_value: str) -> float:  # noqa: D103
-        # OCR sometimes tends to miss the decimal comma
-        parsed_text = re.sub(r"^-?([0-9]+)([0-9]{2})", r"\1.\2", string_value)
-        return abs(float(parsed_text))
-
     entries = []
     for word in sorted(all_words, key=lambda word: word.rect.y0):
         try:
@@ -38,26 +32,54 @@ def value_as_float(string_value: str) -> float:  # noqa: D103
                 entries.append(DepthColumnEntry(word.rect, value))
             elif include_splits:
                 # support for e.g. "1.10-1.60m" extracted as a single word
-                regex2 = re.compile(r"^-?([0-9]+(\.[0-9]+)?)[müMN\\.]*\W+([0-9]+(\.[0-9]+)?)[müMN\\.]*$")
-                match2 = regex2.match(input_string)
-
-                if match2:
-                    value1 = value_as_float(match2.group(1))
-                    first_half_rect = fitz.Rect(
-                        word.rect.x0, word.rect.y0, word.rect.x1 - word.rect.width / 2, word.rect.y1
-                    )
-                    entries.append(DepthColumnEntry(first_half_rect, value1))
-
-                    value2 = value_as_float(match2.group(3))
-                    second_half_rect = fitz.Rect(
-                        word.rect.x0 + word.rect.width / 2, word.rect.y0, word.rect.x1, word.rect.y1
-                    )
-                    entries.append(DepthColumnEntry(second_half_rect, value2))
+                layer_depth_column_entry = extract_layer_depth_interval(input_string, word.rect)
+                entries.extend(
+                    [layer_depth_column_entry.start, layer_depth_column_entry.end] if layer_depth_column_entry else []
+                )
         except ValueError:
             pass
     return entries
 
 
+def value_as_float(string_value: str) -> float:
+    """Return the absolute depth value; an all-digit string of 3+ digits gets the decimal point OCR missed."""
+    parsed_text = re.sub(r"^-?([0-9]+)([0-9]{2})$", r"\1.\2", string_value)
+    return abs(float(parsed_text))
+
+
+def extract_layer_depth_interval(
+    text: str, rect: fitz.Rect, require_start_of_string: bool = True
+) -> LayerDepthColumnEntry | None:
+    """Extracts a LayerDepthColumnEntry from a string.
+
+    Args:
+        text (str): The string to extract the depth interval from.
+        rect (fitz.Rect): The rectangle of the text.
+        require_start_of_string (bool, optional): Whether the number to extract needs to be
+                                                  at the start of a string. Defaults to True.
+
+    Returns:
+        LayerDepthColumnEntry | None: The extracted LayerDepthColumnEntry or None if none is found.
+    """
+    input_string = text.strip().replace(",", ".")
+    # Each number may carry a unit suffix (optionally after a space); whitespace and/or dashes separate the two.
+    query = r"-?([0-9]+(\.[0-9]+)?)\s*[müMN\\.]*[\s-]+([0-9]+(\.[0-9]+)?)[müMN\\.]*"
+    if not require_start_of_string:
+        query = r".*?" + query
+    regex = re.compile(query)
+    match = regex.match(input_string)
+    if match:
+        # Approximate the position of each number by one horizontal half of the text's rectangle.
+        value1 = value_as_float(match.group(1))
+        first_half_rect = fitz.Rect(rect.x0, rect.y0, rect.x1 - rect.width / 2, rect.y1)
+        value2 = value_as_float(match.group(3))
+        second_half_rect = fitz.Rect(rect.x0 + rect.width / 2, rect.y0, rect.x1, rect.y1)
+        return LayerDepthColumnEntry(
+            DepthColumnEntry(first_half_rect, value1), DepthColumnEntry(second_half_rect, value2)
+        )
+    return None
+
+
 def find_layer_depth_columns(entries: list[DepthColumnEntry], all_words: list[TextWord]) -> list[LayerDepthColumn]:
     """Finds all layer depth columns.
File	Stmts	Miss	Cover	Missing
src/stratigraphy
__init__.py	8	1	88%	11
extract.py	211	211	0%	3–507
get_files.py	21	21	0%	3–48
line_detection.py	26	26	0%	3–76
main.py	91	91	0%	3–232
src/stratigraphy/util
coordinate_extraction.py	128	31	76%	30, 50, 54, 58–66, 143, 163, 235–241, 250–252, 268–282
dataclasses.py	32	3	91%	37–39
depthcolumn.py	206	67	67%	26, 30, 51, 57, 60–61, 85, 88, 95, 102, 110–111, 121, 138–154, 199, 238, 254–262, 274, 279, 286, 310, 314, 343, 364, 367–378, 393–394, 439–481
depthcolumnentry.py	20	4	80%	12, 15, 27, 34
description_block_splitter.py	70	2	97%	24, 139
draw.py	73	73	0%	3–225
duplicate_detection.py	32	32	0%	3–81
find_depth_columns.py	89	6	93%	39–40, 68, 80, 173–174
find_description.py	63	28	56%	27–35, 50–63, 79–95, 172–175
geometric_line_utilities.py	87	2	98%	83, 133
interval.py	107	52	51%	25–28, 32–35, 40, 45, 48, 100–146, 167, 172–188
language_detection.py	18	18	0%	3–43
layer_identifier_column.py	91	91	0%	3–227
line.py	49	26	47%	25, 42, 51, 65–95, 98
linesquadtree.py	46	1	98%	76
plot_utils.py	44	44	0%	3–121
predictions.py	187	187	0%	3–385
textblock.py	74	8	89%	27, 51, 63, 75, 98, 119, 127, 155
util.py	40	22	45%	15–18, 22, 26, 40–47, 61–63, 87–88, 100–105
TOTAL	1813	1047	42%