swisstopo · redur · May 22, 2024 · May 17, 2024 · May 21, 2024 · May 21, 2024
diff --git a/config/matching_params.yml b/config/matching_params.yml
@@ -46,31 +46,44 @@ material_description:
   fr:
     including_expressions:
       - sol
-      - végétal
+      - végétal  # TODO: strip accents before matching in general; OCR may get them wrong
       - dallage
       - terre
       - bitume
       - bitumineux
-      - grave d'infrastructure
+      - grave d'infrastructure  # TODO: check what happens if this entry is removed
       - sable
       - limon
       - gravier
       - asphalte
-      - humus
+      - humus  # hummus maybe?
       - brun
       - gris
+      - grise
       - mou
       - dur
+      - dure
+      - ferme
       - racine
       - revetement
       - pierre
       - beige
       - beton
       - craie
       - marne
-      - materiau de base
+      - materiau
       - matrice sableuse
-      - enrobé
+      - enrobé  # TODO: check what happens if the accent is removed
+      - terrain
+      - remblais
+      - remblai
+      - molasse
+      - phase
+      - formations
+      - limoneuse
+      - argileuse
+      - argileux
+      - mousse
     excluding_expressions:
       - monsieur
       - fin

diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py
@@ -5,11 +5,17 @@
 
 import fitz
 
+from stratigraphy import DATAPATH
 from stratigraphy.util import find_depth_columns
 from stratigraphy.util.dataclasses import Line
 from stratigraphy.util.depthcolumn import DepthColumn
-from stratigraphy.util.find_description import get_description_blocks, get_description_lines
+from stratigraphy.util.find_description import (
+    get_description_blocks,
+    get_description_blocks_from_layer_index,
+    get_description_lines,
+)
 from stratigraphy.util.interval import BoundaryInterval, Interval
+from stratigraphy.util.layer_index_column import find_layer_index_column, find_layer_index_column_entries
 from stratigraphy.util.line import TextLine, TextWord
 from stratigraphy.util.textblock import TextBlock, block_distance
 from stratigraphy.util.util import (
@@ -81,14 +87,63 @@ def process_page(page: fitz.Page, geometric_lines, language: str, **params: dict
             depth_column_entries, words, depth_column_params=params["depth_column_params"]
         )
     )
+
+    # Detect Layer Index Columns
+    layer_index_entries = find_layer_index_column_entries(words)
+    layer_index_columns = find_layer_index_column(layer_index_entries) if layer_index_entries else []
+    if layer_index_columns:
+        layer_index_pairs = []
+        for layer_index_column in layer_index_columns:
+            material_description_rect = find_material_description_column(
+                lines, layer_index_column, language, **params["material_description"]
+            )
+            if material_description_rect:
+                layer_index_pairs.append((layer_index_column, material_description_rect))
+
+        # Obtain the best pair. In contrast to depth columns, there is only ever one layer index column per page.
+        if layer_index_pairs:
+            layer_index_pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1]))
+            layer_index_column, material_description_rect = layer_index_pairs[-1]
+            # split the material description rect into blocks.
+            description_lines = get_description_lines(lines, material_description_rect)
+            blocks = get_description_blocks_from_layer_index(layer_index_column.entries, description_lines)
+
+            predictions = [{"material_description": block.to_json()} for block in blocks]
+            predictions = parse_and_remove_empty_predictions(predictions)
+
+            json_filtered_pairs = [
+                {
+                    "depth_column": None,
+                    "material_description_rect": [
+                        material_description_rect.x0,
+                        material_description_rect.y0,
+                        material_description_rect.x1,
+                        material_description_rect.y1,
+                    ],
+                }
+            ]
+
+            # Visualization: To be dropped before merging to main.
+            for layer_index_column in layer_index_columns:
+                fitz.utils.draw_rect(
+                    page, layer_index_column.rect() * page.derotation_matrix, color=fitz.utils.getColor("blue")
+                )
+            for block in blocks:
+                fitz.utils.draw_rect(page, block.rect * page.derotation_matrix, color=fitz.utils.getColor("red"))
+            fitz.utils.draw_rect(
+                page, material_description_rect * page.derotation_matrix, color=fitz.utils.getColor("blue")
+            )
+            page.parent.save(DATAPATH / "_temp" / "output.pdf", garbage=4, deflate=True, clean=True)
+
+            return predictions, json_filtered_pairs
+
     pairs = []
     for depth_column in depth_columns:
         material_description_rect = find_material_description_column(
             lines, depth_column, language, **params["material_description"]
         )
         if material_description_rect:
             pairs.append((depth_column, material_description_rect))
-
     # lowest score first
     pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1], words))
 
@@ -101,7 +156,7 @@ def process_page(page: fitz.Page, geometric_lines, language: str, **params: dict
     filtered_pairs = [item for index, item in enumerate(pairs) if index not in to_delete]
 
     groups = []  # list of matched depth intervals and text blocks
-    # groups is of the form: ["depth_interval": BoundaryInterval, "block": TextBlock]
+    # groups is of the form: [{"depth_interval": BoundaryInterval, "block": TextBlock}]
     if len(filtered_pairs):  # match depth column items with material description
         for depth_column, material_description_rect in filtered_pairs:
             description_lines = get_description_lines(lines, material_description_rect)

diff --git a/src/stratigraphy/util/find_description.py b/src/stratigraphy/util/find_description.py
@@ -24,16 +24,75 @@ def get_description_lines(lines: list[TextLine], material_description_rect: fitz
     Returns:
         list[TextLine]: The filtered lines.
     """
+    if not lines:
+        return []
     filtered_lines = [
         line
         for line in lines
         if line.rect.x0 < material_description_rect.x1 - 0.4 * material_description_rect.width
         if material_description_rect.contains(line.rect)
     ]
-
     return sorted([line for line in filtered_lines if line], key=lambda line: line.rect.y0)
 
 
+def get_description_blocks_from_layer_index(
+    layer_index_entries: list[TextLine], description_lines: list[TextLine]
+) -> list[TextBlock]:
+    """Divide the description lines into blocks based on the layer index entries.
+
+    Args:
+        layer_index_entries (list[TextLine]): The layer index entries.
+        description_lines (list[TextLine]): All lines constituting the material description.
+
+    Returns:
+        list[TextBlock]: The blocks of the material description.
+    """
+    blocks = []
+    line_index = 0
+    for layer_index_idx, _layer_index in enumerate(layer_index_entries):
+        # the block for this entry extends down to the next layer index entry (None for the last entry)
+
+        next_layer_index = (
+            layer_index_entries[layer_index_idx + 1] if layer_index_idx + 1 < len(layer_index_entries) else None
+        )
+
+        matched_block = matching_blocks(description_lines, line_index, next_layer_index)
+        line_index += sum([len(block.lines) for block in matched_block])
+        blocks.extend(matched_block)
+
+    return blocks
+
+
+def matching_blocks(all_lines: list[TextLine], line_index: int, next_layer_index: TextLine | None) -> list[TextBlock]:
+    """Adds lines to a block until the next layer index is reached.
+
+    Args:
+        all_lines (list[TextLine]): All TextLine objects constituting the material description.
+        line_index (int): The index of the first line that is not yet assigned to a block.
+        next_layer_index (TextLine | None): The next layer index.
+
+    Returns:
+        list[TextBlock]: The next block or an empty list if no lines are added.
+    """
+    y1_threshold = None
+    if next_layer_index:
+        next_interval_start_rect = next_layer_index.rect
+        y1_threshold = next_interval_start_rect.y0 + next_interval_start_rect.height / 2
+
+    matched_lines = []
+
+    for current_line in all_lines[line_index:]:
+        if y1_threshold is None or current_line.rect.y1 < y1_threshold:
+            matched_lines.append(current_line)
+        else:
+            break
+
+    if len(matched_lines):
+        return [TextBlock(matched_lines)]
+    else:
+        return []
+
+
 def get_description_blocks(
     description_lines: list[TextLine],
     geometric_lines: list[Line],

diff --git a/src/stratigraphy/util/layer_index_column.py b/src/stratigraphy/util/layer_index_column.py
@@ -0,0 +1,162 @@
+"""Module for the LayerIndexColumn class."""
+
+import re
+
+import fitz
+
+from stratigraphy.util.line import TextLine
+
+
+class LayerIndexColumn:
+    """Class for a layer index column."""
+
+    def __init__(self, entries: list[TextLine]):
+        """Initialize the LayerIndexColumn object.
+
+        Args:
+            entries (list[TextLine]): The entries corresponding to the layer indices.
+        """
+        self.entries = entries
+
+    @property
+    def max_x0(self) -> float:
+        return max([rect.x0 for rect in self.rects()])
+
+    @property
+    def min_x1(self) -> float:
+        return min([rect.x1 for rect in self.rects()])
+
+    def rect(self) -> fitz.Rect:
+        """Get the rectangle of the layer index column.
+
+        Returns:
+            fitz.Rect: The rectangle of the layer index column.
+        """
+        x0 = min([rect.x0 for rect in self.rects()])
+        x1 = max([rect.x1 for rect in self.rects()])
+        y0 = min([rect.y0 for rect in self.rects()])
+        y1 = max([rect.y1 for rect in self.rects()])
+        return fitz.Rect(x0, y0, x1, y1)
+
+    def rects(self) -> list[fitz.Rect]:
+        return [entry.rect for entry in self.entries]
+
+    def add_entry(self, entry: TextLine):
+        """Add a new layer index column entry to the layer index column.
+
+        Args:
+            entry (TextLine): The layer index column entry to be added.
+        """
+        self.entries.append(entry)
+
+    def can_be_appended(self, rect: fitz.Rect) -> bool:
+        """Checks if a new layer index column entry can be appended to the current layer index column.
+
+        The entry can be appended if both of the following hold:
+        - The new rectangle is wider than the current layer index column, or the middle of the new rectangle
+          lies strictly within the horizontal boundaries of the current layer index column.
+        - The new rectangle overlaps (or touches) every entry of the current layer index column horizontally.
+
+
+        Args:
+            rect (fitz.Rect): Rect of the layer index column entry to be appended.
+
+        Returns:
+            bool: True if the new layer index column entry can be appended, False otherwise.
+        """
+        new_middle = (rect.x0 + rect.x1) / 2
+        if (self.rect().width < rect.width or self.rect().x0 < new_middle < self.rect().x1) and (
+            rect.x0 <= self.min_x1 and self.max_x0 <= rect.x1
+        ):
+            return True
+        return False
+
+    def strictly_contains(self, other):
+        return len(other.entries) < len(self.entries) and all(
+            other_entry in self.entries for other_entry in other.entries
+        )
+
+    def is_contained(self, rect: fitz.Rect) -> bool:
+        """Check if the layer index column is contained in another rectangle.
+
+        Args:
+            rect (fitz.Rect): The rectangle to check if it contains the layer index column.
+
+        Returns:
+            bool: True if the layer index column is contained in the rectangle, False otherwise.
+        """
+        return (
+            rect.x0 <= self.rect().x0
+            and self.rect().x1 <= rect.x1
+            and rect.y0 <= self.rect().y0
+            and self.rect().y1 <= rect.y1
+        )
+
+    def noise_count(self, words):
+        return 0
+
+
+def find_layer_index_column_entries(all_words: list[TextLine]) -> list:
+    r"""Find the layer index column entries.
+
+    Regex explanation:
+    - \b is a word boundary. This ensures that the match must start at the beginning of a word.
+    - [\da-z-]+ matches one or more (+) digits (\d), lowercase letters (a-z) or hyphens (-).
+    - \) matches a closing parenthesis. The backslash is necessary because parentheses are special characters
+      in regular expressions, so we need to escape it to match a literal parenthesis.
+    This regular expression will match strings like "1)", "2)", "a)", "b)", "1a4)", "6de)", etc.
+
+    Args:
+        all_words (list[TextLine]): The text lines to search for layer index column entries.
+
+    Returns:
+        list: The layer index column entries.
+    """
+    entries = []
+    for line in sorted(all_words, key=lambda line: line.rect.y0):
+        regex = re.compile(r"\b[\da-z-]+\)")
+        match = regex.match(line.text)
+        if match and len(line.text) < 7:
+            entries.append(line)
+    return entries
+
+
+def find_layer_index_column(entries: list[TextLine]) -> list[LayerIndexColumn]:
+    """Find the layer index column given the index column entries.
+
+    Note: Similar to find_depth_columns.find_depth_columns. Refactoring may be desired.
+
+    Args:
+        entries (list[TextLine]): The layer index column entries. Must not be empty.
+
+    Returns:
+        list[LayerIndexColumn]: The found layer index columns; only columns with more than two entries are kept.
+    """
+    layer_index_columns = [LayerIndexColumn([entries[0]])]
+    for entry in entries[1:]:
+        has_match = False
+        for column in layer_index_columns:
+            if column.can_be_appended(entry.rect):
+                column.add_entry(entry)
+                has_match = True
+        if not has_match:
+            layer_index_columns.append(LayerIndexColumn([entry]))
+
+        # only keep columns whose entries are not fully contained in a different column
+        layer_index_columns = [
+            column
+            for column in layer_index_columns
+            if all(not other.strictly_contains(column) for other in layer_index_columns)
+        ]
+        # check if the column rect is a subset of another column rect. If so, merge the entries and sort them by y0.
+        for column in layer_index_columns:
+            for other in layer_index_columns:
+                if column != other and column.is_contained(other.rect()):
+                    for entry in other.entries:
+                        if entry not in column.entries:
+                            column.entries.append(entry)
+                    column.entries.sort(key=lambda entry: entry.rect.y0)
+                    layer_index_columns.remove(other)
+                    break
+    layer_index_columns = [column for column in layer_index_columns if len(column.entries) > 2]
+    return layer_index_columns