From ce5d453157b61223a42c8e4196b84a9550d5387c Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Mon, 27 May 2024 10:12:58 +0200 Subject: [PATCH] Minor updates; improve docstrings. --- src/stratigraphy/main.py | 2 +- src/stratigraphy/util/duplicate_detection.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index bfeda595..d1a2a284 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -183,7 +183,7 @@ def start_pipeline( if page_index > 0: layer_predictions = remove_duplicate_layers( doc[page_index - 1], - doc[page_index], + page, predictions[filename][f"page_{page_number - 1}"]["layers"], layer_predictions, matching_params["img_template_probability_threshold"], diff --git a/src/stratigraphy/util/duplicate_detection.py b/src/stratigraphy/util/duplicate_detection.py index 2edffa88..accf0d5a 100644 --- a/src/stratigraphy/util/duplicate_detection.py +++ b/src/stratigraphy/util/duplicate_detection.py @@ -21,6 +21,10 @@ def remove_duplicate_layers( ) -> list[dict]: """Remove duplicate layers from the current page based on the layers of the previous page. + We check if a layer on the current page is present on the previous page. If we have 3 consecutive layers that are + not duplicates, we assume that there is no further overlap between the pages and stop the search. If we find a + duplicate, all layers up to and including the duplicate layer are removed. + If the page contains a depth column, we compare the depth intervals and the material description to determine duplicate layers. If there is no depth column, we use template matching to compare the layers. @@ -32,7 +36,7 @@ def remove_duplicate_layers( img_template_probability_threshold (float): The threshold for the template matching probability Returns: - list[dict]: _description_ + list[dict]: The layers of the current page without duplicates.
""" sorted_layers = sorted(current_layers, key=lambda x: x["material_description"]["rect"][1]) first_non_duplicated_layer_index = 0 @@ -96,6 +100,10 @@ def check_duplicate_layer_by_template_matching( ) -> bool: """Check if the current layer is a duplicate of a layer on the previous page by using template matching. + This is done by extracting an image of the layer and checking if that image is present in the previous page + by applying template matching onto the previous page. This checks if the image of the current layer is present + in the previous page. + Args: previous_page (fitz.Page): The previous page. current_page (fitz.Page): The current page.