swisstopo · redur · May 22, 2024 · May 17, 2024 · May 21, 2024 · May 21, 2024
diff --git a/config/matching_params.yml b/config/matching_params.yml
@@ -46,31 +46,44 @@ material_description:
   fr:
     including_expressions:
       - sol
-      - végétal
+      - végétal  # remove accents generally; ocr might be wrong
       - dallage
       - terre
       - bitume
       - bitumineux
-      - grave d'infrastructure
+      - grave d'infrastructure  # what happens if we remove this?
       - sable
       - limon
       - gravier
       - asphalte
-      - humus
+      - humus  # hummus maybe?
       - brun
       - gris
+      - grise
       - mou
       - dur
+      - dure
+      - ferme
       - racine
       - revetement
       - pierre
       - beige
       - beton
       - craie
       - marne
-      - materiau de base
+      - materiau
       - matrice sableuse
-      - enrobé
+      - enrobé  # accent --> check what happens if it's removed
+      - terrain
+      - remblais
+      - remblai
+      - molasse
+      - phase
+      - formations
+      - limoneuse
+      - argileuse
+      - argileux
+      - mousse
     excluding_expressions:
       - monsieur
       - fin

diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py
@@ -5,11 +5,20 @@
 
 import fitz
 
+from stratigraphy import DATAPATH
 from stratigraphy.util import find_depth_columns
 from stratigraphy.util.dataclasses import Line
 from stratigraphy.util.depthcolumn import DepthColumn
-from stratigraphy.util.find_description import get_description_blocks, get_description_lines
+from stratigraphy.util.find_description import (
+    get_description_blocks,
+    get_description_blocks_from_layer_identifier,
+    get_description_lines,
+)
 from stratigraphy.util.interval import BoundaryInterval, Interval
+from stratigraphy.util.layer_identifier_column import (
+    find_layer_identifier_column,
+    find_layer_identifier_column_entries,
+)
 from stratigraphy.util.line import TextLine, TextWord
 from stratigraphy.util.textblock import TextBlock, block_distance
 from stratigraphy.util.util import (
@@ -41,7 +50,7 @@ def process_page(page: fitz.Page, geometric_lines, language: str, **params: dict
     for x0, y0, x1, y1, word, block_no, line_no, _word_no in fitz.utils.get_text(page, "words"):
         rect = fitz.Rect(x0, y0, x1, y1) * page.rotation_matrix
         text_word = TextWord(rect, word)
-        words.append(TextLine([text_word]))
+        words.append(text_word)
         key = f"{block_no}_{line_no}"
         if key not in words_by_line:
             words_by_line[key] = []
@@ -81,14 +90,65 @@ def process_page(page: fitz.Page, geometric_lines, language: str, **params: dict
             depth_column_entries, words, depth_column_params=params["depth_column_params"]
         )
     )
+
+    # Detect layer identifier columns
+    layer_identifier_entries = find_layer_identifier_column_entries(words)
+    layer_identifier_columns = (
+        find_layer_identifier_column(layer_identifier_entries) if layer_identifier_entries else []
+    )
+    if layer_identifier_columns:
+        layer_identifier_pairs = []
+        for layer_identifier_column in layer_identifier_columns:
+            material_description_rect = find_material_description_column(
+                lines, layer_identifier_column, language, **params["material_description"]
+            )
+            if material_description_rect:
+                layer_identifier_pairs.append((layer_identifier_column, material_description_rect))
+
+        # Obtain the best pair. In contrast to depth columns, there is only ever one layer identifier column per page.
+        if layer_identifier_pairs:
+            layer_identifier_pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1]))
+            layer_identifier_column, material_description_rect = layer_identifier_pairs[-1]
+            # split the material description rect into blocks.
+            description_lines = get_description_lines(lines, material_description_rect)
+            blocks = get_description_blocks_from_layer_identifier(layer_identifier_column.entries, description_lines)
+
+            predictions = [{"material_description": block.to_json()} for block in blocks]
+            predictions = parse_and_remove_empty_predictions(predictions)
+
+            json_filtered_pairs = [
+                {
+                    "depth_column": None,
+                    "material_description_rect": [
+                        material_description_rect.x0,
+                        material_description_rect.y0,
+                        material_description_rect.x1,
+                        material_description_rect.y1,
+                    ],
+                }
+            ]
+
+            # TODO: debugging visualization only; drop before merging to main.
+            for layer_identifier_column in layer_identifier_columns:
+                fitz.utils.draw_rect(
+                    page, layer_identifier_column.rect() * page.derotation_matrix, color=fitz.utils.getColor("blue")
+                )
+            for block in blocks:
+                fitz.utils.draw_rect(page, block.rect * page.derotation_matrix, color=fitz.utils.getColor("red"))
+            fitz.utils.draw_rect(
+                page, material_description_rect * page.derotation_matrix, color=fitz.utils.getColor("blue")
+            )
+            page.parent.save(DATAPATH / "_temp" / "output.pdf", garbage=4, deflate=True, clean=True)
+
+            return predictions, json_filtered_pairs
+
     pairs = []
     for depth_column in depth_columns:
         material_description_rect = find_material_description_column(
             lines, depth_column, language, **params["material_description"]
         )
         if material_description_rect:
             pairs.append((depth_column, material_description_rect))
-
     # lowest score first
     pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1], words))
 
@@ -101,7 +161,7 @@ def process_page(page: fitz.Page, geometric_lines, language: str, **params: dict
     filtered_pairs = [item for index, item in enumerate(pairs) if index not in to_delete]
 
     groups = []  # list of matched depth intervals and text blocks
-    # groups is of the form: ["depth_interval": BoundaryInterval, "block": TextBlock]
+    # groups is of the form: [{"depth_interval": BoundaryInterval, "block": TextBlock}]
     if len(filtered_pairs):  # match depth column items with material description
         for depth_column, material_description_rect in filtered_pairs:
             description_lines = get_description_lines(lines, material_description_rect)

diff --git a/src/stratigraphy/util/find_depth_columns.py b/src/stratigraphy/util/find_depth_columns.py
@@ -6,16 +6,16 @@
 
 from stratigraphy.util.depthcolumn import BoundaryDepthColumn, LayerDepthColumn
 from stratigraphy.util.depthcolumnentry import DepthColumnEntry, LayerDepthColumnEntry
-from stratigraphy.util.line import TextLine
+from stratigraphy.util.line import TextWord
 
 
-def depth_column_entries(all_words: list[TextLine], include_splits: bool) -> list[DepthColumnEntry]:
+def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> list[DepthColumnEntry]:
-    """Find all depth column entries given a list of TextLine objects.
+    """Find all depth column entries given a list of TextWord objects.
 
     Note: Only depths up to two digits before the decimal point are supported.
 
     Args:
-        all_words (list[TextLine]): List of Text lines to extract depth column entries from.
+        all_words (list[TextWord]): List of text words to extract depth column entries from.
         include_splits (bool): Whether to include split entries.
 
     Returns:
@@ -28,14 +28,14 @@ def value_as_float(string_value: str) -> float:  # noqa: D103
         return abs(float(parsed_text))
 
     entries = []
-    for line in sorted(all_words, key=lambda line: line.rect.y0):
+    for word in sorted(all_words, key=lambda word: word.rect.y0):
         try:
-            input_string = line.text.strip().replace(",", ".")
+            input_string = word.text.strip().replace(",", ".")
             regex = re.compile(r"^-?([0-9]+(\.[0-9]+)?)[müMN\\.]*$")
             match = regex.match(input_string)
             if match:
                 value = value_as_float(match.group(1))
-                entries.append(DepthColumnEntry(line.rect, value))
+                entries.append(DepthColumnEntry(word.rect, value))
             elif include_splits:
                 # support for e.g. "1.10-1.60m" extracted as a single word
                 regex2 = re.compile(r"^-?([0-9]+(\.[0-9]+)?)[müMN\\.]*\W+([0-9]+(\.[0-9]+)?)[müMN\\.]*$")
@@ -44,33 +44,33 @@ def value_as_float(string_value: str) -> float:  # noqa: D103
                 if match2:
                     value1 = value_as_float(match2.group(1))
                     first_half_rect = fitz.Rect(
-                        line.rect.x0, line.rect.y0, line.rect.x1 - line.rect.width / 2, line.rect.y1
+                        word.rect.x0, word.rect.y0, word.rect.x1 - word.rect.width / 2, word.rect.y1
                     )
                     entries.append(DepthColumnEntry(first_half_rect, value1))
 
                     value2 = value_as_float(match2.group(3))
                     second_half_rect = fitz.Rect(
-                        line.rect.x0 + line.rect.width / 2, line.rect.y0, line.rect.x1, line.rect.y1
+                        word.rect.x0 + word.rect.width / 2, word.rect.y0, word.rect.x1, word.rect.y1
                     )
                     entries.append(DepthColumnEntry(second_half_rect, value2))
         except ValueError:
             pass
     return entries
 
 
-def find_layer_depth_columns(entries: list[DepthColumnEntry], all_words: list[TextLine]) -> list[LayerDepthColumn]:
+def find_layer_depth_columns(entries: list[DepthColumnEntry], all_words: list[TextWord]) -> list[LayerDepthColumn]:
     """Finds all layer depth columns.
 
-    Generates a list of LayerDepthColumnEntry objects by finding conseucutive pairs of DepthColumnEntry objects.
-    Different columns are grouped together in LayerDepthColumn objects. Finally a list of LayerDepthColumn objects,
+    Generates a list of LayerDepthColumnEntry objects by finding consecutive pairs of DepthColumnEntry objects.
+    Different columns are grouped together in LayerDepthColumn objects. Finally, a list of LayerDepthColumn objects,
     one for each column, is returned.
 
     A layer corresponds to a material layer. The layer is defined using a start and end point (e.g. 1.10-1.60m).
     The start and end points are represented as DepthColumnEntry objects.
 
     Args:
         entries (list[DepthColumnEntry]): List of depth column entries.
-        all_words (list[TextLine]): List of all TextLine objects.
+        all_words (list[TextWord]): List of all TextWord objects.
 
     Returns:
         list[LayerDepthColumn]: List of all layer depth columns identified.
@@ -125,7 +125,7 @@ def find_pair(entry: DepthColumnEntry) -> DepthColumnEntry | None:  # noqa: D103
 
 
 def find_depth_columns(
-    entries: list[DepthColumnEntry], all_words: list[TextLine], depth_column_params: dict
+    entries: list[DepthColumnEntry], all_words: list[TextWord], depth_column_params: dict
 ) -> list[BoundaryDepthColumn]:
     """Construct all possible BoundaryDepthColumn objects from the given DepthColumnEntry objects.
 

diff --git a/src/stratigraphy/util/find_description.py b/src/stratigraphy/util/find_description.py
@@ -24,16 +24,77 @@ def get_description_lines(lines: list[TextLine], material_description_rect: fitz
     Returns:
         list[TextLine]: The filtered lines.
     """
+    if not lines:
+        return []
     filtered_lines = [
         line
         for line in lines
         if line.rect.x0 < material_description_rect.x1 - 0.4 * material_description_rect.width
         if material_description_rect.contains(line.rect)
     ]
-
     return sorted([line for line in filtered_lines if line], key=lambda line: line.rect.y0)
 
 
+def get_description_blocks_from_layer_identifier(
+    layer_identifier_entries: list[TextLine], description_lines: list[TextLine]
+) -> list[TextBlock]:
+    """Divide the description lines into blocks based on the layer identifier entries.
+
+    Args:
+        layer_identifier_entries (list[TextLine]): The layer identifier entries.
+        description_lines (list[TextLine]): All lines constituting the material description.
+
+    Returns:
+        list[TextBlock]: The blocks of the material description.
+    """
+    blocks = []
+    line_index = 0
+    for layer_identifier_idx, _layer_index in enumerate(layer_identifier_entries):
+        next_layer_identifier = (
+            layer_identifier_entries[layer_identifier_idx + 1]
+            if layer_identifier_idx + 1 < len(layer_identifier_entries)
+            else None
+        )
+
+        matched_block = matching_blocks(description_lines, line_index, next_layer_identifier)
+        line_index += sum([len(block.lines) for block in matched_block])
+        blocks.extend(matched_block)
+
+    return blocks
+
+
+def matching_blocks(
+    all_lines: list[TextLine], line_index: int, next_layer_identifier: TextLine | None
+) -> list[TextBlock]:
+    """Adds lines to a block until the next layer identifier is reached.
+
+    Args:
+        all_lines (list[TextLine]): All TextLine objects constituting the material description.
+        line_index (int): The index of the first line that is not yet assigned to a block.
+        next_layer_identifier (TextLine | None): The next layer identifier.
+
+    Returns:
+        list[TextBlock]: A list containing the single next block, or an empty list if no lines are added.
+    """
+    y1_threshold = None
+    if next_layer_identifier:
+        next_interval_start_rect = next_layer_identifier.rect
+        y1_threshold = next_interval_start_rect.y0 + next_interval_start_rect.height / 2
+
+    matched_lines = []
+
+    for current_line in all_lines[line_index:]:
+        if y1_threshold is None or current_line.rect.y1 < y1_threshold:
+            matched_lines.append(current_line)
+        else:
+            break
+
+    if len(matched_lines):
+        return [TextBlock(matched_lines)]
+    else:
+        return []
+
+
 def get_description_blocks(
     description_lines: list[TextLine],
     geometric_lines: list[Line],