Merge pull request #36 from swisstopo/feat/remove_duplicate_layers

Feat/remove duplicate layers
swisstopo · May 3, 2024 · 6256722 · 6256722 · github-actions · May 3, 2024
2 parents 178b1b5 + ad26ce5
commit 6256722
Show file tree

Hide file tree

Showing 6 changed files with 117 additions and 9 deletions.
diff --git a/config/matching_params.yml b/config/matching_params.yml
@@ -1,6 +1,7 @@
 
 block_line_ratio: 0.20
 left_line_length_threshold: 7
+img_template_probability_threshold: 0.62
 
 material_description:
   de:

diff --git a/src/stratigraphy/benchmark/score.py b/src/stratigraphy/benchmark/score.py
@@ -105,7 +105,10 @@ def get_scores(
     if len(document_level_metrics["precision"]):
         overall_precision = sum(document_level_metrics["precision"]) / len(document_level_metrics["precision"])
         overall_recall = sum(document_level_metrics["recall"]) / len(document_level_metrics["recall"])
-        overall_depth_interval_accuracy = sum(depth_interval_accuracies) / len(depth_interval_accuracies)
+        try:
+            overall_depth_interval_accuracy = sum(depth_interval_accuracies) / len(depth_interval_accuracies)
+        except ZeroDivisionError:
+            overall_depth_interval_accuracy = None
     else:
         overall_precision = 0
         overall_recall = 0

diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py
@@ -14,6 +14,7 @@
 from stratigraphy.extract import process_page
 from stratigraphy.line_detection import extract_lines, line_detection_params
 from stratigraphy.util.draw import draw_predictions
+from stratigraphy.util.duplicate_detection import remove_duplicate_layers
 from stratigraphy.util.language_detection import detect_language_of_document
 from stratigraphy.util.plot_utils import plot_lines
 from stratigraphy.util.util import flatten, read_params
@@ -117,7 +118,6 @@ def start_pipeline(
     Note: This function is used to be called from the label-studio backend, whereas the click_pipeline function
     is called from the CLI.
 
-    \f
     Args:
         input_directory (Path): The directory containing the pdf files. Can also be the path to a single pdf file.
         ground_truth_path (Path): The path to the ground truth file json file.
@@ -172,7 +172,14 @@ def start_pipeline(
                         layer_predictions, depths_materials_column_pairs = process_page(
                             page, geometric_lines, language, **matching_params
                         )
-
+                        # Drop layers that duplicate content of the previous page (overlapping pages).
+                        if page_index > 0:
+                            layer_predictions = remove_duplicate_layers(
+                                doc[page_index - 1],
+                                page,
+                                layer_predictions,
+                                matching_params["img_template_probability_threshold"],
+                            )
                         predictions[filename][f"page_{page_number}"] = {
                             "layers": layer_predictions,
                             "depths_materials_column_pairs": depths_materials_column_pairs,

diff --git a/src/stratigraphy/util/duplicate_detection.py b/src/stratigraphy/util/duplicate_detection.py
@@ -0,0 +1,81 @@
+"""This module contains functionality for detecting duplicate layers across pdf pages."""
+
+import logging
+
+import cv2
+import fitz
+import numpy as np
+
+from stratigraphy.util.plot_utils import convert_page_to_opencv_img
+
+logger = logging.getLogger(__name__)
+
+
+def remove_duplicate_layers(
+    previous_page: fitz.Page,
+    current_page: fitz.Page,
+    layer_predictions: list[dict],
+    img_template_probability_threshold: float,
+) -> list[dict]:
+    """Remove duplicate layers from the current page based on the layers of the previous page.
+
+    We check if a layer on the current page is present on the previous page. This is done by extracting
+    an image of the layer and check if that image is present in the previous page by applying template matching.
+
+    The check tests if any given layer is present on the previous page as well. If so, all layers before that layer
+    are removed as they are considered duplicates. If we have 3 consecutive layers that are not duplicates, we assume
+    that there is no further overlap between the pages and stop the search.
+
+    Args:
+        previous_page (fitz.Page): The previous page.
+        current_page (fitz.Page): The current page containing the layers.
+        layer_predictions (list[dict]): The layers of the current page.
+        img_template_probability_threshold (float): The threshold for the template matching probability
+                                                    to consider a layer a duplicate.
+
+    Returns:
+        list[dict]: The layers of the current page without duplicates, sorted by the top of their description rect.
+    """
+    scale_factor = 3
+    current_page_image = convert_page_to_opencv_img(
+        current_page, scale_factor=scale_factor, color_mode=cv2.COLOR_BGR2GRAY
+    )
+    previous_page_image = convert_page_to_opencv_img(
+        previous_page, scale_factor=scale_factor, color_mode=cv2.COLOR_BGR2GRAY
+    )
+
+    sorted_layers = sorted(layer_predictions, key=lambda x: x["material_description"]["rect"][1])
+    first_non_duplicated_layer_index = 0
+    count_consecutive_non_duplicate_layers = 0
+    for layer_index, layer in enumerate(sorted_layers):
+        if (
+            count_consecutive_non_duplicate_layers >= 3
+        ):  # if we have three consecutive non-duplicate layers, we can assume that there is no further page overlap.
+            break
+        [x0, y_start, x1, y_end] = layer["material_description"]["rect"]
+        x_start = int(scale_factor * min(x0, current_page.rect.width * 0.2))  # 0.2 is a magic number that works well
+        x_end = int(scale_factor * min(max(x1, current_page.rect.width * 0.8), previous_page.rect.width - 1))
+        y_start = int(scale_factor * max(y_start, 0))  # do not go higher up as otherwise we remove too many layers.
+        y_end = int(scale_factor * min(y_end + 5, previous_page.rect.height - 1, current_page.rect.height - 1))
+        # y_start and y_end define the upper and lower bound of the image used to compare to the previous page
+        # and determine if there is an overlap. We add 5 page units (before scaling) to y_end to give the image a
+        # bit more context, as the material_description bounding box is very tight around the text. Furthermore,
+        # we need to ensure that the template is smaller than the previous and the current page.
+        # y_start should not be moved further up, as otherwise we would include potential overlap with the
+        # previous page that belongs to the previous layer.
+
+        layer_image = current_page_image[y_start:y_end, x_start:x_end]
+        try:
+            img_template_probablility_match = np.max(
+                cv2.matchTemplate(previous_page_image, layer_image, cv2.TM_CCOEFF_NORMED)
+            )
+        except cv2.error:  # there can be strange correlation errors here.
+            # Just ignore them as it is only a few over the complete dataset
+            logger.warning("Error in template matching. Skipping layer.")
+            img_template_probablility_match = 0
+        if img_template_probablility_match > img_template_probability_threshold:
+            first_non_duplicated_layer_index = layer_index + 1  # this layer and all layers before it are duplicates
+            count_consecutive_non_duplicate_layers = 0
+        else:
+            count_consecutive_non_duplicate_layers += 1
+    return sorted_layers[first_non_duplicated_layer_index:]
diff --git a/src/stratigraphy/util/plot_utils.py b/src/stratigraphy/util/plot_utils.py
@@ -38,10 +38,20 @@ def _draw_lines(open_cv_img, lines, scale_factor=1):
     return open_cv_img
 
 
-def _convert_page_to_opencv_img(page, scale_factor):
+def convert_page_to_opencv_img(page: fitz.Page, scale_factor: float, color_mode=cv2.COLOR_RGB2BGR) -> np.array:
+    """Converts a fitz.Page object to an OpenCV image.
+
+    Args:
+        page (fitz.Page): The page to convert to an OpenCV image.
+        scale_factor (float): Applied scale factor to the image.
+        color_mode (int, optional): The cv2 color conversion code to apply. Defaults to cv2.COLOR_RGB2BGR.
+
+    Returns:
+        np.array: The OpenCV image.
+    """
     pix = page.get_pixmap(matrix=fitz.Matrix(scale_factor, scale_factor))
     img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, 3)
-    open_cv_img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+    open_cv_img = cv2.cvtColor(img, color_mode)
     return open_cv_img
 
 
@@ -76,7 +86,7 @@ def plot_lines(page: fitz.Page, lines: list[Line], scale_factor: float = 2) -> c
         lines (ArrayLike): The lines detected in the pdf.
         scale_factor (float, optional): The scale factor to apply to the pdf. Defaults to 2.
     """
-    open_cv_img = _convert_page_to_opencv_img(page, scale_factor=scale_factor)
+    open_cv_img = convert_page_to_opencv_img(page, scale_factor=scale_factor)
 
     open_cv_img = _draw_lines(open_cv_img, lines, scale_factor=scale_factor)
 
@@ -103,7 +113,7 @@ def draw_blocks_and_lines(page: fitz.Page, blocks: list[TextBlock], lines: list[
             color=fitz.utils.getColor("orange"),
         )
 
-    open_cv_img = _convert_page_to_opencv_img(page, scale_factor=2)
+    open_cv_img = convert_page_to_opencv_img(page, scale_factor=2)
 
     if lines is not None:
         open_cv_img = _draw_lines(open_cv_img, lines, scale_factor=scale_factor)

diff --git a/src/stratigraphy/util/predictions.py b/src/stratigraphy/util/predictions.py
@@ -1,6 +1,7 @@
 """This module contains classes for predictions."""
 
 import contextlib
+import logging
 import uuid
 from collections import defaultdict
 from dataclasses import dataclass, field
@@ -14,6 +15,8 @@
 from stratigraphy.util.textblock import MaterialDescription, TextBlock
 from stratigraphy.util.util import parse_text
 
+logger = logging.getLogger(__name__)
+
 
 @dataclass
 class LayerPrediction:
@@ -36,6 +39,10 @@ class PagePredictions:
     page_height: int
     depths_materials_columns_pairs: list[dict] = None
 
+    def __post_init__(self):
+        """Sort layers by their occurrence on the page (top to bottom, using the description rect's y0)."""
+        self.layers = sorted(self.layers, key=lambda layer: layer.material_description.rect.y0)
+
 
 class FilePredictions:
     """A class to represent predictions for a single file."""
@@ -44,8 +51,7 @@ def __init__(self, pages: list[PagePredictions], file_name: str, language: str):
         self.pages = pages
         self.file_name = file_name
         self.language = language
-        if self.pages:
-            self.layers = sum([page.layers for page in self.pages], [])
+        self.layers = sum([page.layers for page in self.pages], [])
 
     @staticmethod
     def create_from_json(predictions_for_file: dict, file_name: str):
File	Stmts	Miss	Cover	Missing
src/stratigraphy
__init__.py	8	1	88%	11
extract.py	189	189	0%	3–460
get_files.py	21	21	0%	3–48
line_detection.py	29	29	0%	3–88
main.py	85	85	0%	3–225
src/stratigraphy/util
dataclasses.py	30	3	90%	34–36
depthcolumn.py	204	67	67%	26, 30, 51, 57, 60–61, 85, 88, 95, 102, 110–111, 121, 138–154, 187, 210, 226–234, 244, 249, 256, 263, 268, 286, 296, 299–306, 321–322, 365–407
depthcolumnentry.py	20	4	80%	12, 15, 27, 34
description_block_splitter.py	70	2	97%	24, 139
draw.py	62	62	0%	3–184
duplicate_detection.py	32	32	0%	3–81
find_depth_columns.py	82	4	95%	57–58, 149–150
find_description.py	39	6	85%	27–34, 111–114
geometric_line_utilities.py	123	37	70%	74–88, 111–115, 214–237, 261, 311
interval.py	107	52	51%	25–28, 32–35, 40, 45, 48, 100–146, 166, 171–187
language_detection.py	18	18	0%	3–43
line.py	49	26	47%	25, 42, 51, 65–95, 98
plot_utils.py	44	44	0%	3–121
predictions.py	154	154	0%	3–322
textblock.py	74	8	89%	27, 51, 63, 75, 98, 119, 127, 155
util.py	40	22	45%	15–18, 22, 26, 40–47, 61–63, 87–88, 100–105
TOTAL	1480	866	41%