
Commit: LGVISIUM-52: Moved the page information within the JSON
dcleres committed Jul 31, 2024
1 parent efc655d commit 29fa7b9
Showing 25 changed files with 455 additions and 268 deletions.
21 changes: 21 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,21 @@
+ {
+     // Use IntelliSense to learn about possible attributes.
+     // Hover to view descriptions of existing attributes.
+     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+     "version": "0.2.0",
+     "configurations": [
+         {
+             "name": "Python: Run boreholes-extract-all",
+             "type": "debugpy",
+             "request": "launch",
+             "module": "src.stratigraphy.main",
+             "args": [
+                 "-i", "data/zurich",
+                 "-g", "data/zurich_ground_truth.json"
+             ],
+             "cwd": "${workspaceFolder}",
+             "justMyCode": true,
+             "python": "./swisstopo/bin/python3",
+         }
+     ]
+ }
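
For reference, this launch configuration corresponds roughly to the following invocation from the repository root (a sketch; it assumes the `swisstopo` virtualenv supplies the interpreter, as the `python` field above suggests):

    python -m src.stratigraphy.main -i data/zurich -g data/zurich_ground_truth.json
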
5 changes: 5 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,5 @@
+ {
+     "cSpell.words": [
+         "fitz"
+     ]
+ }
3 changes: 2 additions & 1 deletion src/stratigraphy/benchmark/score.py
@@ -252,7 +252,8 @@ def evaluate_layer_extraction(predictions: dict, number_of_truth_values: dict) -


def create_predictions_objects(
- predictions: dict, ground_truth_path: Path | None
+ predictions: dict,
+ ground_truth_path: Path | None,
) -> tuple[dict[FilePredictions], dict]:
"""Create predictions objects from the predictions and evaluate them against the ground truth.
35 changes: 21 additions & 14 deletions src/stratigraphy/extract.py
@@ -30,7 +30,9 @@
logger = logging.getLogger(__name__)


- def process_page(lines: list[TextLine], geometric_lines, language: str, **params: dict) -> list[dict]:
+ def process_page(
+     lines: list[TextLine], geometric_lines, language: str, page_number: int, **params: dict
+ ) -> list[dict]:
"""Process a single page of a pdf.
Finds all descriptions and depth intervals on the page and matches them.
@@ -39,6 +41,7 @@ def process_page(lines: list[TextLine], geometric_lines, language: str, **params
lines (list[TextLine]): all the text lines on the page.
geometric_lines (list[Line]): The geometric lines of the page.
language (str): The language of the page.
+ page_number (int): The page number.
**params (dict): Additional parameters for the matching pipeline.
Returns:
@@ -67,7 +70,7 @@ def process_page(lines: list[TextLine], geometric_lines, language: str, **params
# If there is a layer identifier column, then we use this directly.
# Else, we search for depth columns. We could also think of some scoring mechanism to decide which one to use.
if not pairs:
- depth_column_entries = find_depth_columns.depth_column_entries(words, include_splits=True)
+ depth_column_entries = find_depth_columns.depth_column_entries(words, page_number, include_splits=True)
layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words)

used_entry_rects = []
@@ -77,13 +80,13 @@ def process_page(lines: list[TextLine], geometric_lines, language: str, **params

depth_column_entries = [
entry
- for entry in find_depth_columns.depth_column_entries(words, include_splits=False)
+ for entry in find_depth_columns.depth_column_entries(words, page_number, include_splits=False)
if entry.rect not in used_entry_rects
]
depth_columns: list[DepthColumn] = layer_depth_columns
depth_columns.extend(
find_depth_columns.find_depth_columns(
- depth_column_entries, words, depth_column_params=params["depth_column_params"]
+ depth_column_entries, words, page_number, depth_column_params=params["depth_column_params"]
)
)

@@ -106,12 +109,12 @@ def process_page(lines: list[TextLine], geometric_lines, language: str, **params

groups = [] # list of matched depth intervals and text blocks
# groups is of the form: [{"depth_interval": BoundaryInterval, "block": TextBlock}]
- if len(filtered_pairs):  # match depth column items with material description
+ if filtered_pairs:  # match depth column items with material description
for depth_column, material_description_rect in filtered_pairs:
description_lines = get_description_lines(lines, material_description_rect)
if len(description_lines) > 1:
new_groups = match_columns(
- depth_column, description_lines, geometric_lines, material_description_rect, **params
+ depth_column, description_lines, geometric_lines, material_description_rect, page_number, **params
)
groups.extend(new_groups)
json_filtered_pairs = [
@@ -157,9 +160,11 @@ def process_page(lines: list[TextLine], geometric_lines, language: str, **params
]
)
predictions = [
{"material_description": group["block"].to_json(), "depth_interval": group["depth_interval"].to_json()}
if "depth_interval" in group
else {"material_description": group["block"].to_json()}
(
{"material_description": group["block"].to_json(), "depth_interval": group["depth_interval"].to_json()}
if "depth_interval" in group
else {"material_description": group["block"].to_json()}
)
for group in groups
]
predictions = parse_and_remove_empty_predictions(predictions)
@@ -205,6 +210,7 @@ def match_columns(
description_lines: list[TextLine],
geometric_lines: list[Line],
material_description_rect: fitz.Rect,
+ page_number: int,
**params: dict,
) -> list:
"""Match the depth column entries with the description lines.
@@ -218,6 +224,7 @@ def match_columns(
description_lines (list[TextLine]): The description lines.
geometric_lines (list[Line]): The geometric lines.
material_description_rect (fitz.Rect): The material description rectangle.
+ page_number (int): The page number.
**params (dict): Additional parameters for the matching pipeline.
Returns:
@@ -235,7 +242,7 @@ def match_columns(
blocks = get_description_blocks_from_layer_identifier(depth_column.entries, description_lines)
groups = []
for block in blocks:
- depth_interval = depth_column.get_depth_interval(block)
+ depth_interval = depth_column.get_depth_interval(block, page_number)
if depth_interval:
groups.append({"depth_interval": depth_interval, "block": block})
else:
@@ -320,7 +327,7 @@ def merge_blocks_by_vertical_spacing(blocks: list[TextBlock], target_merge_count
merged_blocks.append(current_merged_block)
current_merged_block = new_block

- if len(current_merged_block.lines):
+ if current_merged_block.lines:
merged_blocks.append(current_merged_block)
return merged_blocks

@@ -355,7 +362,7 @@ def split_blocks_by_textline_length(blocks: list[TextBlock], target_split_count:
split_blocks.append(TextBlock(current_block_lines))
cutoff_values.remove(line.rect.x1)
current_block_lines = []
- if len(current_block_lines):
+ if current_block_lines:
split_blocks.append(TextBlock(current_block_lines))
current_block_lines = []
if (
@@ -386,7 +393,7 @@ def find_material_description_column(
if x_overlap(line.rect, depth_column.rect()) and line.rect.y0 < depth_column.rect().y0
]

- min_y0 = max(line.rect.y0 for line in above_depth_column) if len(above_depth_column) else -1
+ min_y0 = max(line.rect.y0 for line in above_depth_column) if above_depth_column else -1

def check_y0_condition(y0):
return y0 > min_y0 and y0 < depth_column.rect().y1
@@ -409,7 +416,7 @@ def check_y0_condition(y0):
]

def filter_coverage(coverage):
- if len(coverage):
+ if coverage:
min_x0 = min(line.rect.x0 for line in coverage)
max_x1 = max(line.rect.x1 for line in coverage)
x0_threshold = max_x1 - 0.4 * (
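
Several hunks above replace `if len(xs):` with `if xs:`. A minimal, self-contained check of the idiom (plain Python, independent of the repository's classes):

    # Empty sequences are falsy, so the truthiness test agrees with the
    # explicit length test for lists; the bare form is the PEP 8 idiom.
    for xs in ([], [1], [1, 2]):
        assert bool(xs) == (len(xs) > 0)
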
35 changes: 24 additions & 11 deletions src/stratigraphy/main.py
@@ -173,30 +173,36 @@ def start_pipeline(
predictions[filename]["metadata"] = {"coordinates": coordinates.to_json()}
else:
predictions[filename]["metadata"] = {"coordinates": None}

+ layer_predictions_list = []
+ depths_materials_column_pairs_list = []
+ page_heights_list = []
+ page_widths_list = []
for page_index, page in enumerate(doc):
page_number = page_index + 1
logger.info("Processing page %s", page_number)

- text_lines = extract_text_lines(page)
+ text_lines = extract_text_lines(page, page_number)
geometric_lines = extract_lines(page, line_detection_params)
layer_predictions, depths_materials_column_pairs = process_page(
- text_lines, geometric_lines, language, **matching_params
+ text_lines, geometric_lines, language, page_number, **matching_params
)
- # Add remove duplicates here!
+ # TODO: Add remove duplicates here!
if page_index > 0:
layer_predictions = remove_duplicate_layers(
doc[page_index - 1],
page,
predictions[filename][f"page_{page_number - 1}"]["layers"],
layer_predictions_list,
layer_predictions,
matching_params["img_template_probability_threshold"],
)
predictions[filename][f"page_{page_number}"] = {
"layers": layer_predictions,
"depths_materials_column_pairs": depths_materials_column_pairs,
"page_height": page.rect.height,
"page_width": page.rect.width,
}

layer_predictions_list.extend(layer_predictions)
depths_materials_column_pairs_list.extend(depths_materials_column_pairs)
page_heights_list.append(page.rect.height)
page_widths_list.append(page.rect.width)

if draw_lines: # could be changed to if draw_lines and mflow_tracking:
if not mlflow_tracking:
logger.warning(
@@ -208,11 +214,18 @@
)
mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png")

predictions[filename]["layers"] = layer_predictions_list
predictions[filename]["depths_materials_column_pairs"] = depths_materials_column_pairs_list
predictions[filename]["page_height"] = page_heights_list
predictions[filename]["page_width"] = page_widths_list

assert len(page_heights_list) == len(page_widths_list) == doc.page_count, "Page count mismatch."

logger.info("Writing predictions to JSON file %s", predictions_path)
with open(predictions_path, "w") as file:
file.write(json.dumps(predictions))

- # evaluate the predictions; if file doesnt exist, the predictions are not changed.
+ # evaluate the predictions; if file does not exist, the predictions are not changed.
predictions, number_of_truth_values = create_predictions_objects(predictions, ground_truth_path)

if not skip_draw_predictions:
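
The net effect of this hunk is a flatter predictions JSON. A hedged before/after sketch with placeholder values (the field names come from the diff above; the page dimensions are illustrative only):

    # Old layout: layers and page dimensions nested per page.
    old_shape = {
        "example.pdf": {
            "metadata": {"coordinates": None},
            "page_1": {
                "layers": [],
                "depths_materials_column_pairs": [],
                "page_height": 842.0,
                "page_width": 595.0,
            },
        }
    }

    # New layout: file-level lists; page dimensions keep one entry per page,
    # and each extracted item now carries its own page number instead.
    new_shape = {
        "example.pdf": {
            "metadata": {"coordinates": None},
            "layers": [],
            "depths_materials_column_pairs": [],
            "page_height": [842.0],
            "page_width": [595.0],
        }
    }
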
15 changes: 8 additions & 7 deletions src/stratigraphy/util/boundarydepthcolumnvalidator.py
@@ -63,27 +63,27 @@ def is_valid(self, column: BoundaryDepthColumn, corr_coef_threshold: float = 0.9

return corr_coef and corr_coef > corr_coef_threshold

- def reduce_until_valid(self, column: BoundaryDepthColumn) -> BoundaryDepthColumn:
+ def reduce_until_valid(self, column: BoundaryDepthColumn, page_number: int) -> BoundaryDepthColumn:
"""Removes entries from the depth column until it fulfills the is_valid condition.
is_valid checks whether there is too much noise (i.e. other text) in the column and whether the entries are
linearly correlated with their vertical position.
Args:
column (BoundaryDepthColumn): The depth column to validate
+ page_number (int): The page number of the depth column
Returns:
BoundaryDepthColumn: The current depth column with entries removed until it is valid.
"""
while column:
if self.is_valid(column):
return column
- elif self.correct_OCR_mistakes(column) is not None:
-     return self.correct_OCR_mistakes(column)
+ elif self.correct_OCR_mistakes(column, page_number) is not None:
+     return self.correct_OCR_mistakes(column, page_number)
else:
column = column.remove_entry_by_correlation_gradient()

- def correct_OCR_mistakes(self, column: BoundaryDepthColumn) -> BoundaryDepthColumn | None:
+ def correct_OCR_mistakes(self, column: BoundaryDepthColumn, page_number: int) -> BoundaryDepthColumn | None:
"""Corrects OCR mistakes in the depth column entries.
Loops through all values and corrects common OCR mistakes for the given entry. Then, the column with the
@@ -102,22 +102,23 @@ def correct_OCR_mistakes(self, column: BoundaryDepthColumn) -> BoundaryDepthColu
Args:
column (BoundaryDepthColumn): The depth column to validate
+ page_number (int): The page number of the depth column
Returns:
BoundaryDepthColumn | None: The corrected depth column, or None if no correction was possible.
"""
new_columns = [BoundaryDepthColumn()]
for entry in column.entries:
new_columns = [
- BoundaryDepthColumn([*column.entries, DepthColumnEntry(entry.rect, new_value)])
+ BoundaryDepthColumn([*column.entries, DepthColumnEntry(entry.rect, new_value, page_number)])
for column in new_columns
for new_value in _value_alternatives(entry.value)
]
# Immediately require strictly increasing values, to avoid exponential complexity when many implausible
# alternative values are suggested
new_columns = [column for column in new_columns if column.is_strictly_increasing()]

- if len(new_columns):
+ if new_columns:
best_column = max(new_columns, key=lambda column: column.pearson_correlation_coef())

# We require a higher correlation coefficient when we've already corrected a mistake.
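
A hedged, self-contained sketch of the pruning idea in correct_OCR_mistakes: candidate columns grow one entry at a time, and any candidate that is not strictly increasing is dropped immediately, so the candidate set cannot blow up exponentially. Here value_alternatives is a hypothetical stand-in for the repo's _value_alternatives helper:

    def value_alternatives(value: float) -> set[float]:
        return {value, value * 10}  # e.g. an OCR reading that dropped a digit

    candidates: list[list[float]] = [[]]
    for value in [0.5, 1.0, 2.0]:
        candidates = [
            [*prefix, alternative]
            for prefix in candidates
            for alternative in value_alternatives(value)
        ]
        # Immediately require strictly increasing depth values.
        candidates = [c for c in candidates if all(a < b for a, b in zip(c, c[1:]))]

    print(candidates)  # e.g. [[0.5, 1.0, 2.0], [0.5, 1.0, 20.0], ...]
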
5 changes: 3 additions & 2 deletions src/stratigraphy/util/coordinate_extraction.py
@@ -312,8 +312,9 @@ def extract_coordinates(self) -> Coordinate | None:
Returns:
Coordinate | None: the extracted coordinates (if any)
"""
- for page in self.doc:
-     lines = extract_text_lines(page)
+ for page_idx, page in enumerate(self.doc):
+     page_number = page_idx + 1
+     lines = extract_text_lines(page, page_number)
page_number = page.number + 1 # page.number is 0-based

found_coordinates = (
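
Incidentally, the same 1-based numbering can be obtained from enumerate's start argument; a minimal stand-in for the loop above:

    for page_number, page in enumerate(["page-a", "page-b"], start=1):
        print(page_number, page)  # 1 page-a, then 2 page-b
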
4 changes: 2 additions & 2 deletions src/stratigraphy/util/depthcolumn.py
@@ -111,7 +111,7 @@ def break_on_mismatch(self) -> list[LayerDepthColumn]:
segment_start = index

final_segment = self.entries[segment_start:]
- if len(final_segment):
+ if final_segment:
segments.append(final_segment)

return [LayerDepthColumn(segment) for segment in segments]
@@ -338,7 +338,7 @@ def break_on_double_descending(self) -> list[BoundaryDepthColumn]:
segment_start = index

final_segment = self.entries[segment_start:]
- if len(final_segment):
+ if final_segment:
segments.append(final_segment)

return [BoundaryDepthColumn(segment) for segment in segments]
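
A hedged, self-contained sketch of the segment-splitting pattern used by break_on_mismatch and break_on_double_descending: close the current segment at every break index, then flush the trailing segment only when it is non-empty, which is exactly the `if final_segment:` cleanup above. The comparison is a stand-in for the real mismatch test:

    values = [1.0, 2.0, 1.5, 3.0, 4.0]
    segments, segment_start = [], 0
    for index in range(1, len(values)):
        if values[index] < values[index - 1]:  # stand-in for the mismatch test
            segments.append(values[segment_start:index])
            segment_start = index
    final_segment = values[segment_start:]
    if final_segment:
        segments.append(final_segment)
    print(segments)  # [[1.0, 2.0], [1.5, 3.0, 4.0]]
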
(Diffs for the remaining 17 changed files are not shown here.)
