From 459c5d963dcb1fc20c9207c373900f1ccc6d60b4 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Mon, 27 May 2024 09:59:12 +0200 Subject: [PATCH 1/5] Improve duplicate detection to use depth information --- src/stratigraphy/main.py | 3 +- src/stratigraphy/util/duplicate_detection.py | 139 +++++++++++++------ 2 files changed, 100 insertions(+), 42 deletions(-) diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index 5a6deff0..bfeda595 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -183,7 +183,8 @@ def start_pipeline( if page_index > 0: layer_predictions = remove_duplicate_layers( doc[page_index - 1], - page, + doc[page_index], + predictions[filename][f"page_{page_number - 1}"]["layers"], layer_predictions, matching_params["img_template_probability_threshold"], ) diff --git a/src/stratigraphy/util/duplicate_detection.py b/src/stratigraphy/util/duplicate_detection.py index 2a3780a2..2edffa88 100644 --- a/src/stratigraphy/util/duplicate_detection.py +++ b/src/stratigraphy/util/duplicate_detection.py @@ -4,6 +4,7 @@ import cv2 import fitz +import Levenshtein import numpy as np from stratigraphy.util.plot_utils import convert_page_to_opencv_img @@ -14,37 +15,26 @@ def remove_duplicate_layers( previous_page: fitz.Page, current_page: fitz.Page, - layer_predictions: list[dict], + previous_layers: fitz.Page, + current_layers: list[dict], img_template_probability_threshold: float, ) -> list[dict]: """Remove duplicate layers from the current page based on the layers of the previous page. - We check if a layer on the current page is present on the previous page. This is done by extracting - an image of the layer and check if that image is present in the previous page by applying template matching. - - The check tests if any given layer is present on the previous page as well. If so, all layers before that layer - are removed as they are considered duplicates. 
If we have 3 consecutive layers that are not duplicates, we assume - that there is no further overlap between the pages and stop the search. + If the page contains a depth column, we compare the depth intervals and the material description to determine + duplicate layers. If there is no depth column, we use template matching to compare the layers. Args: previous_page (fitz.Page): The previous page. - current_page (fitz.Page): The current page containing the layers. - layer_predictions (list[dict]): The layers of the current page. + current_page (fitz.Page): The current page containing the layers to check for duplicates. + previous_layers (fitz.Page): The layers of the previous page. + current_layers (list[dict]): The layers of the current page. img_template_probability_threshold (float): The threshold for the template matching probability - to consider a layer a duplicate. Returns: - list[dict]: The layers of the current page without duplicates. + list[dict]: _description_ """ - scale_factor = 3 - current_page_image = convert_page_to_opencv_img( - current_page, scale_factor=scale_factor, color_mode=cv2.COLOR_BGR2GRAY - ) - previous_page_image = convert_page_to_opencv_img( - previous_page, scale_factor=scale_factor, color_mode=cv2.COLOR_BGR2GRAY - ) - - sorted_layers = sorted(layer_predictions, key=lambda x: x["material_description"]["rect"][1]) + sorted_layers = sorted(current_layers, key=lambda x: x["material_description"]["rect"][1]) first_non_duplicated_layer_index = 0 count_consecutive_non_duplicate_layers = 0 for layer_index, layer in enumerate(sorted_layers): @@ -52,30 +42,97 @@ def remove_duplicate_layers( count_consecutive_non_duplicate_layers >= 3 ): # if we have three consecutive non-duplicate layers, we can assume that there is no further page overlap. 
break - [x0, y_start, x1, y_end] = layer["material_description"]["rect"] - x_start = int(scale_factor * min(x0, current_page.rect.width * 0.2)) # 0.2 is a magic number that works well - x_end = int(scale_factor * min(max(x1, current_page.rect.width * 0.8), previous_page.rect.width - 1)) - y_start = int(scale_factor * max(y_start, 0)) # do not go higher up as otherwise we remove too many layers. - y_end = int(scale_factor * min(y_end + 5, previous_page.rect.height - 1, current_page.rect.height - 1)) - # y_start and y_end define the upper and lower bound of the image used to compare to the previous page - # and determine if there is an overlap. We add 5 pixel to y_end to add a bit more context to the image - # as the material_description bounding box is very tight around the text. Furthermore, we need to ensure - # that the template is smaller than the previous and the current page. - # y_start should not be lowered further as otherwise the we include potential overlap to the previous page - # that belongs to the previous layer. - - layer_image = current_page_image[y_start:y_end, x_start:x_end] - try: - img_template_probablility_match = np.max( - cv2.matchTemplate(previous_page_image, layer_image, cv2.TM_CCOEFF_NORMED) + + # check if current layer has an overlapping layer on the previous page. + # for that purpose compare depth interval as well as material description text. + duplicate_condition = False + if "depth_interval" not in layer: # in this case we use template matching + duplicate_condition = check_duplicate_layer_by_template_matching( + previous_page, current_page, layer, img_template_probability_threshold ) - except cv2.error: # there can be strange correlation errors here. - # Just ignore them as it is only a few over the complete dataset - logger.warning("Error in template matching. 
Skipping layer.") - img_template_probablility_match = 0 - if img_template_probablility_match > img_template_probability_threshold: + + else: # in this case we compare the depth interval and material description + current_material_description = layer["material_description"] + current_depth_interval = layer["depth_interval"] + for previous_layer in previous_layers: + if "depth_interval" not in previous_layer: + # It may happen, that a layer on the previous page does not have depth interval assigned. + # In this case we skip the comparison. This should only happen in some edge cases, as we + # assume that when the current page has a depth column, that the previous page also contains a + # depth column. We assume overlapping pages and a depth column should extend over both pages. + continue + + previous_material_description = previous_layer["material_description"] + previous_depth_interval = previous_layer["depth_interval"] + + # start values for the depth intervals may be None. End values are always explicitly set. 
+ current_depth_interval_start = ( + current_depth_interval["start"]["value"] if current_depth_interval["start"] is not None else None + ) + previous_depth_interval_start = ( + previous_depth_interval["start"]["value"] if previous_depth_interval["start"] is not None else None + ) + # check if material description is the same + if ( + Levenshtein.ratio(current_material_description["text"], previous_material_description["text"]) + > 0.9 + and current_depth_interval_start == previous_depth_interval_start + and current_depth_interval["end"].get("value") == previous_depth_interval["end"].get("value") + ): + duplicate_condition = True + print("Duplicate condition met") + break + + if duplicate_condition: first_non_duplicated_layer_index = layer_index + 1 # all layers before this layer are duplicates count_consecutive_non_duplicate_layers = 0 else: count_consecutive_non_duplicate_layers += 1 return sorted_layers[first_non_duplicated_layer_index:] + + +def check_duplicate_layer_by_template_matching( + previous_page: fitz.Page, current_page: fitz.Page, current_layer: dict, img_template_probability_threshold: float +) -> bool: + """Check if the current layer is a duplicate of a layer on the previous page by using template matching. + + Args: + previous_page (fitz.Page): The previous page. + current_page (fitz.Page): The current page. + current_layer (dict): The current layer that is checked for a duplicate. + img_template_probability_threshold (float): The threshold for the template matching probability + to consider a layer a duplicate. + + Returns: + bool: True if the layer is a duplicate, False otherwise. 
+ """ + scale_factor = 3 + current_page_image = convert_page_to_opencv_img( + current_page, scale_factor=scale_factor, color_mode=cv2.COLOR_BGR2GRAY + ) + previous_page_image = convert_page_to_opencv_img( + previous_page, scale_factor=scale_factor, color_mode=cv2.COLOR_BGR2GRAY + ) + + [x0, y_start, x1, y_end] = current_layer["material_description"]["rect"] + x_start = int(scale_factor * min(x0, current_page.rect.width * 0.2)) # 0.2 is a magic number that works well + x_end = int(scale_factor * min(max(x1, current_page.rect.width * 0.8), previous_page.rect.width - 1)) + y_start = int(scale_factor * max(y_start, 0)) # do not go higher up as otherwise we remove too many layers. + y_end = int(scale_factor * min(y_end + 5, previous_page.rect.height - 1, current_page.rect.height - 1)) + # y_start and y_end define the upper and lower bound of the image used to compare to the previous page + # and determine if there is an overlap. We add 5 pixels to y_end to add a bit more context to the image + # as the material_description bounding box is very tight around the text. Furthermore, we need to ensure + # that the template is smaller than the previous and the current page. + # y_start should not be lowered further as otherwise we include potential overlap to the previous page + # that belongs to the previous layer. + + layer_image = current_page_image[y_start:y_end, x_start:x_end] + try: + img_template_probablility_match = np.max( + cv2.matchTemplate(previous_page_image, layer_image, cv2.TM_CCOEFF_NORMED) + ) + except cv2.error: # there can be strange correlation errors here. + # Just ignore them as it is only a few over the complete dataset + logger.warning("Error in template matching. 
Skipping layer.") + return False + return img_template_probablility_match > img_template_probability_threshold From ce5d453157b61223a42c8e4196b84a9550d5387c Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Mon, 27 May 2024 10:12:58 +0200 Subject: [PATCH 2/5] Minor updates; improve docstrings. --- src/stratigraphy/main.py | 2 +- src/stratigraphy/util/duplicate_detection.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index bfeda595..d1a2a284 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -183,7 +183,7 @@ def start_pipeline( if page_index > 0: layer_predictions = remove_duplicate_layers( doc[page_index - 1], - doc[page_index], + page, predictions[filename][f"page_{page_number - 1}"]["layers"], layer_predictions, matching_params["img_template_probability_threshold"], diff --git a/src/stratigraphy/util/duplicate_detection.py b/src/stratigraphy/util/duplicate_detection.py index 2edffa88..accf0d5a 100644 --- a/src/stratigraphy/util/duplicate_detection.py +++ b/src/stratigraphy/util/duplicate_detection.py @@ -21,6 +21,10 @@ def remove_duplicate_layers( ) -> list[dict]: """Remove duplicate layers from the current page based on the layers of the previous page. + We check if a layer on the current page is present on the previous page. If we have 3 consecutive layers that are + not duplicates, we assume that there is no further overlap between the pages and stop the search. If we find a + duplicate, all layers up to and including the duplicate layer are removed. + If the page contains a depth column, we compare the depth intervals and the material description to determine duplicate layers. If there is no depth column, we use template matching to compare the layers. 
@@ -32,7 +36,7 @@ def remove_duplicate_layers( img_template_probability_threshold (float): The threshold for the template matching probability Returns: - list[dict]: _description_ + list[dict]: The layers of the current page without duplicates. """ sorted_layers = sorted(current_layers, key=lambda x: x["material_description"]["rect"][1]) first_non_duplicated_layer_index = 0 @@ -96,6 +100,10 @@ def check_duplicate_layer_by_template_matching( ) -> bool: """Check if the current layer is a duplicate of a layer on the previous page by using template matching. + This is done by extracting an image of the layer and checking if that image is present in the previous page + by applying template matching onto the previous page. This checks if the image of the current layer is present + in the previous page. + Args: previous_page (fitz.Page): The previous page. current_page (fitz.Page): The current page. From 5788e4dff89a5c9a55ab0473d728f8d946fdcd8b Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Tue, 28 May 2024 15:31:33 +0200 Subject: [PATCH 3/5] Update logging behavior. 
--- src/stratigraphy/extract.py | 2 +- src/stratigraphy/main.py | 3 +-- src/stratigraphy/util/duplicate_detection.py | 3 ++- src/stratigraphy/util/geometric_line_utilities.py | 2 +- src/stratigraphy/util/plot_utils.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index cb9732aa..d77d7363 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -27,7 +27,7 @@ x_overlap_significant_smallest, ) -logging.basicConfig(level=logging.INFO) +logging.basicConfig(format="%(asctime)s %(levelname)-8s %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S") logger = logging.getLogger(__name__) diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index d1a2a284..91b7b860 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -23,8 +23,7 @@ load_dotenv() mlflow_tracking = os.getenv("MLFLOW_TRACKING") == "True" # Checks whether MLFlow tracking is enabled - -logging.basicConfig(level=logging.INFO) +logging.basicConfig(format="%(asctime)s %(levelname)-8s %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S") logger = logging.getLogger(__name__) matching_params = read_params("matching_params.yml") diff --git a/src/stratigraphy/util/duplicate_detection.py b/src/stratigraphy/util/duplicate_detection.py index accf0d5a..f0b130a5 100644 --- a/src/stratigraphy/util/duplicate_detection.py +++ b/src/stratigraphy/util/duplicate_detection.py @@ -10,6 +10,7 @@ from stratigraphy.util.plot_utils import convert_page_to_opencv_img logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s %(levelname)-8s %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S") def remove_duplicate_layers( @@ -84,7 +85,7 @@ def remove_duplicate_layers( and current_depth_interval["end"].get("value") == previous_depth_interval["end"].get("value") ): duplicate_condition = True - print("Duplicate condition met") + logger.info("Removing duplicate layer.") 
break if duplicate_condition: diff --git a/src/stratigraphy/util/geometric_line_utilities.py b/src/stratigraphy/util/geometric_line_utilities.py index 87ad54cb..fae4c5eb 100644 --- a/src/stratigraphy/util/geometric_line_utilities.py +++ b/src/stratigraphy/util/geometric_line_utilities.py @@ -11,7 +11,7 @@ from stratigraphy.util.dataclasses import Line, Point from stratigraphy.util.linesquadtree import LinesQuadTree -logging.basicConfig(level=logging.INFO) +logging.basicConfig(format="%(asctime)s %(levelname)-8s %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S") logger = logging.getLogger(__name__) diff --git a/src/stratigraphy/util/plot_utils.py b/src/stratigraphy/util/plot_utils.py index d99db801..b87a1abb 100644 --- a/src/stratigraphy/util/plot_utils.py +++ b/src/stratigraphy/util/plot_utils.py @@ -9,7 +9,7 @@ from stratigraphy.util.dataclasses import Line from stratigraphy.util.textblock import TextBlock -logging.basicConfig(level=logging.INFO) +logging.basicConfig(format="%(asctime)s %(levelname)-8s %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S") logger = logging.getLogger(__name__) From b38ab1b7c07d6ff4565c6274e21da6324e8a2a16 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Tue, 28 May 2024 15:35:35 +0200 Subject: [PATCH 4/5] Correct type hint --- src/stratigraphy/util/duplicate_detection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stratigraphy/util/duplicate_detection.py b/src/stratigraphy/util/duplicate_detection.py index f0b130a5..b5020a53 100644 --- a/src/stratigraphy/util/duplicate_detection.py +++ b/src/stratigraphy/util/duplicate_detection.py @@ -16,7 +16,7 @@ def remove_duplicate_layers( previous_page: fitz.Page, current_page: fitz.Page, - previous_layers: fitz.Page, + previous_layers: list[dict], current_layers: list[dict], img_template_probability_threshold: float, ) -> list[dict]: @@ -32,7 +32,7 @@ def remove_duplicate_layers( Args: previous_page (fitz.Page): The previous page. 
current_page (fitz.Page): The current page containing the layers to check for duplicates. - previous_layers (fitz.Page): The layers of the previous page. + previous_layers (list[dict]): The layers of the previous page. current_layers (list[dict]): The layers of the current page. img_template_probability_threshold (float): The threshold for the template matching probability From 3477db9dbdbb5a4bf2ebd20273e9f2aa753fef06 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Wed, 29 May 2024 08:42:24 +0200 Subject: [PATCH 5/5] Remove logging config from files except main. --- src/stratigraphy/extract.py | 1 - src/stratigraphy/util/duplicate_detection.py | 1 - src/stratigraphy/util/geometric_line_utilities.py | 1 - src/stratigraphy/util/plot_utils.py | 1 - 4 files changed, 4 deletions(-) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index d77d7363..28b01485 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -27,7 +27,6 @@ x_overlap_significant_smallest, ) -logging.basicConfig(format="%(asctime)s %(levelname)-8s %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S") logger = logging.getLogger(__name__) diff --git a/src/stratigraphy/util/duplicate_detection.py b/src/stratigraphy/util/duplicate_detection.py index b5020a53..2cc5f900 100644 --- a/src/stratigraphy/util/duplicate_detection.py +++ b/src/stratigraphy/util/duplicate_detection.py @@ -10,7 +10,6 @@ from stratigraphy.util.plot_utils import convert_page_to_opencv_img logger = logging.getLogger(__name__) -logging.basicConfig(format="%(asctime)s %(levelname)-8s %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S") def remove_duplicate_layers( diff --git a/src/stratigraphy/util/geometric_line_utilities.py b/src/stratigraphy/util/geometric_line_utilities.py index fae4c5eb..627a1598 100644 --- a/src/stratigraphy/util/geometric_line_utilities.py +++ b/src/stratigraphy/util/geometric_line_utilities.py @@ -11,7 +11,6 @@ from stratigraphy.util.dataclasses 
import Line, Point from stratigraphy.util.linesquadtree import LinesQuadTree -logging.basicConfig(format="%(asctime)s %(levelname)-8s %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S") logger = logging.getLogger(__name__) diff --git a/src/stratigraphy/util/plot_utils.py b/src/stratigraphy/util/plot_utils.py index b87a1abb..5e43bebc 100644 --- a/src/stratigraphy/util/plot_utils.py +++ b/src/stratigraphy/util/plot_utils.py @@ -9,7 +9,6 @@ from stratigraphy.util.dataclasses import Line from stratigraphy.util.textblock import TextBlock -logging.basicConfig(format="%(asctime)s %(levelname)-8s %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S") logger = logging.getLogger(__name__)