From 459c5d963dcb1fc20c9207c373900f1ccc6d60b4 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Mon, 27 May 2024 09:59:12 +0200 Subject: [PATCH] Improve duplicate detection to use depth information --- src/stratigraphy/main.py | 3 +- src/stratigraphy/util/duplicate_detection.py | 139 +++++++++++++------ 2 files changed, 100 insertions(+), 42 deletions(-) diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index 5a6deff0..bfeda595 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -183,7 +183,8 @@ def start_pipeline( if page_index > 0: layer_predictions = remove_duplicate_layers( doc[page_index - 1], - page, + doc[page_index], + predictions[filename][f"page_{page_number - 1}"]["layers"], layer_predictions, matching_params["img_template_probability_threshold"], ) diff --git a/src/stratigraphy/util/duplicate_detection.py b/src/stratigraphy/util/duplicate_detection.py index 2a3780a2..2edffa88 100644 --- a/src/stratigraphy/util/duplicate_detection.py +++ b/src/stratigraphy/util/duplicate_detection.py @@ -4,6 +4,7 @@ import cv2 import fitz +import Levenshtein import numpy as np from stratigraphy.util.plot_utils import convert_page_to_opencv_img @@ -14,37 +15,26 @@ def remove_duplicate_layers( previous_page: fitz.Page, current_page: fitz.Page, - layer_predictions: list[dict], + previous_layers: list[dict], + current_layers: list[dict], img_template_probability_threshold: float, ) -> list[dict]: """Remove duplicate layers from the current page based on the layers of the previous page. - We check if a layer on the current page is present on the previous page. This is done by extracting - an image of the layer and check if that image is present in the previous page by applying template matching. - - The check tests if any given layer is present on the previous page as well. If so, all layers before that layer - are removed as they are considered duplicates.
If we have 3 consecutive layers that are not duplicates, we assume - that there is no further overlap between the pages and stop the search. + If the page contains a depth column, we compare the depth intervals and the material description to determine + duplicate layers. If there is no depth column, we use template matching to compare the layers. Args: previous_page (fitz.Page): The previous page. - current_page (fitz.Page): The current page containing the layers. - layer_predictions (list[dict]): The layers of the current page. + current_page (fitz.Page): The current page containing the layers to check for duplicates. + previous_layers (list[dict]): The layers of the previous page. + current_layers (list[dict]): The layers of the current page. img_template_probability_threshold (float): The threshold for the template matching probability - to consider a layer a duplicate. Returns: - list[dict]: The layers of the current page without duplicates. + list[dict]: The layers of the current page without the duplicate layers. """ - scale_factor = 3 - current_page_image = convert_page_to_opencv_img( - current_page, scale_factor=scale_factor, color_mode=cv2.COLOR_BGR2GRAY - ) - previous_page_image = convert_page_to_opencv_img( - previous_page, scale_factor=scale_factor, color_mode=cv2.COLOR_BGR2GRAY - ) - - sorted_layers = sorted(layer_predictions, key=lambda x: x["material_description"]["rect"][1]) + sorted_layers = sorted(current_layers, key=lambda x: x["material_description"]["rect"][1]) first_non_duplicated_layer_index = 0 count_consecutive_non_duplicate_layers = 0 for layer_index, layer in enumerate(sorted_layers): @@ -52,30 +42,97 @@ count_consecutive_non_duplicate_layers >= 3 ): # if we have three consecutive non-duplicate layers, we can assume that there is no further page overlap.
break - [x0, y_start, x1, y_end] = layer["material_description"]["rect"] - x_start = int(scale_factor * min(x0, current_page.rect.width * 0.2)) # 0.2 is a magic number that works well - x_end = int(scale_factor * min(max(x1, current_page.rect.width * 0.8), previous_page.rect.width - 1)) - y_start = int(scale_factor * max(y_start, 0)) # do not go higher up as otherwise we remove too many layers. - y_end = int(scale_factor * min(y_end + 5, previous_page.rect.height - 1, current_page.rect.height - 1)) - # y_start and y_end define the upper and lower bound of the image used to compare to the previous page - # and determine if there is an overlap. We add 5 pixel to y_end to add a bit more context to the image - # as the material_description bounding box is very tight around the text. Furthermore, we need to ensure - # that the template is smaller than the previous and the current page. - # y_start should not be lowered further as otherwise the we include potential overlap to the previous page - # that belongs to the previous layer. - - layer_image = current_page_image[y_start:y_end, x_start:x_end] - try: - img_template_probablility_match = np.max( - cv2.matchTemplate(previous_page_image, layer_image, cv2.TM_CCOEFF_NORMED) + + # check if current layer has an overlapping layer on the previous page. + # for that purpose compare depth interval as well as material description text. + duplicate_condition = False + if "depth_interval" not in layer: # in this case we use template matching + duplicate_condition = check_duplicate_layer_by_template_matching( + previous_page, current_page, layer, img_template_probability_threshold ) - except cv2.error: # there can be strange correlation errors here. - # Just ignore them as it is only a few over the complete dataset - logger.warning("Error in template matching. 
Skipping layer.") - img_template_probablility_match = 0 - if img_template_probablility_match > img_template_probability_threshold: + + else: # in this case we compare the depth interval and material description + current_material_description = layer["material_description"] + current_depth_interval = layer["depth_interval"] + for previous_layer in previous_layers: + if "depth_interval" not in previous_layer: + # It may happen, that a layer on the previous page does not have depth interval assigned. + # In this case we skip the comparison. This should only happen in some edge cases, as we + # assume that when the current page has a depth column, that the previous page also contains a + # depth column. We assume overlapping pages and a depth column should extend over both pages. + continue + + previous_material_description = previous_layer["material_description"] + previous_depth_interval = previous_layer["depth_interval"] + + # start values for the depth intervals may be None. End values are always explicitly set. 
+ current_depth_interval_start = ( + current_depth_interval["start"]["value"] if current_depth_interval["start"] is not None else None + ) + previous_depth_interval_start = ( + previous_depth_interval["start"]["value"] if previous_depth_interval["start"] is not None else None + ) + # check if material description is the same + if ( + Levenshtein.ratio(current_material_description["text"], previous_material_description["text"]) + > 0.9 + and current_depth_interval_start == previous_depth_interval_start + and current_depth_interval["end"].get("value") == previous_depth_interval["end"].get("value") + ): + duplicate_condition = True + logger.info("Duplicate condition met") + break + + if duplicate_condition: first_non_duplicated_layer_index = layer_index + 1 # all layers before this layer are duplicates count_consecutive_non_duplicate_layers = 0 else: count_consecutive_non_duplicate_layers += 1 return sorted_layers[first_non_duplicated_layer_index:] + + +def check_duplicate_layer_by_template_matching( + previous_page: fitz.Page, current_page: fitz.Page, current_layer: dict, img_template_probability_threshold: float +) -> bool: + """Check if the current layer is a duplicate of a layer on the previous page by using template matching. + + Args: + previous_page (fitz.Page): The previous page. + current_page (fitz.Page): The current page. + current_layer (dict): The current layer that is checked for a duplicate. + img_template_probability_threshold (float): The threshold for the template matching probability + to consider a layer a duplicate. + + Returns: + bool: True if the layer is a duplicate, False otherwise.
+ """ + scale_factor = 3 + current_page_image = convert_page_to_opencv_img( + current_page, scale_factor=scale_factor, color_mode=cv2.COLOR_BGR2GRAY + ) + previous_page_image = convert_page_to_opencv_img( + previous_page, scale_factor=scale_factor, color_mode=cv2.COLOR_BGR2GRAY + ) + + [x0, y_start, x1, y_end] = current_layer["material_description"]["rect"] + x_start = int(scale_factor * min(x0, current_page.rect.width * 0.2)) # 0.2 is a magic number that works well + x_end = int(scale_factor * min(max(x1, current_page.rect.width * 0.8), previous_page.rect.width - 1)) + y_start = int(scale_factor * max(y_start, 0)) # do not go higher up as otherwise we remove too many layers. + y_end = int(scale_factor * min(y_end + 5, previous_page.rect.height - 1, current_page.rect.height - 1)) + # y_start and y_end define the upper and lower bound of the image used to compare to the previous page + # and determine if there is an overlap. We add 5 pixel to y_end to add a bit more context to the image + # as the material_description bounding box is very tight around the text. Furthermore, we need to ensure + # that the template is smaller than the previous and the current page. + # y_start should not be lowered further as otherwise the we include potential overlap to the previous page + # that belongs to the previous layer. + + layer_image = current_page_image[y_start:y_end, x_start:x_end] + try: + img_template_probablility_match = np.max( + cv2.matchTemplate(previous_page_image, layer_image, cv2.TM_CCOEFF_NORMED) + ) + except cv2.error: # there can be strange correlation errors here. + # Just ignore them as it is only a few over the complete dataset + logger.warning("Error in template matching. Skipping layer.") + return False + return img_template_probablility_match > img_template_probability_threshold