Skip to content

Commit

Permalink
Improve duplicate detection to use depth information
Browse files Browse the repository at this point in the history
  • Loading branch information
redur committed May 27, 2024
1 parent d26fbf7 commit 459c5d9
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 42 deletions.
3 changes: 2 additions & 1 deletion src/stratigraphy/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,8 @@ def start_pipeline(
if page_index > 0:
layer_predictions = remove_duplicate_layers(
doc[page_index - 1],
page,
doc[page_index],
predictions[filename][f"page_{page_number - 1}"]["layers"],
layer_predictions,
matching_params["img_template_probability_threshold"],
)
Expand Down
139 changes: 98 additions & 41 deletions src/stratigraphy/util/duplicate_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import cv2
import fitz
import Levenshtein
import numpy as np

from stratigraphy.util.plot_utils import convert_page_to_opencv_img
Expand All @@ -14,68 +15,124 @@
def remove_duplicate_layers(
    previous_page: fitz.Page,
    current_page: fitz.Page,
    previous_layers: list[dict],
    current_layers: list[dict],
    img_template_probability_threshold: float,
) -> list[dict]:
    """Remove duplicate layers from the current page based on the layers of the previous page.

    The layers of the current page are sorted from top to bottom (by the upper edge of their material
    description rect) and each layer is checked against the previous page. Whenever a layer is recognised
    as a duplicate, that layer and all layers above it are dropped. After three consecutive non-duplicate
    layers we assume that there is no further overlap between the pages and stop the search.

    How a single layer is checked depends on the information available for it:
    - If the layer has a depth interval, it is a duplicate when a layer of the previous page has the same
      depth interval and a nearly identical material description text.
    - If the layer has no depth interval, template matching against the image of the previous page is used.

    Args:
        previous_page (fitz.Page): The previous page.
        current_page (fitz.Page): The current page containing the layers to check for duplicates.
        previous_layers (list[dict]): The layers of the previous page.
        current_layers (list[dict]): The layers of the current page.
        img_template_probability_threshold (float): The threshold for the template matching probability
                                                    to consider a layer a duplicate.

    Returns:
        list[dict]: The layers of the current page, sorted from top to bottom, without the leading duplicates.
    """
    sorted_layers = sorted(current_layers, key=lambda x: x["material_description"]["rect"][1])
    first_non_duplicated_layer_index = 0
    count_consecutive_non_duplicate_layers = 0
    for layer_index, layer in enumerate(sorted_layers):
        if count_consecutive_non_duplicate_layers >= 3:
            # three consecutive non-duplicate layers: assume that there is no further page overlap.
            break

        if "depth_interval" not in layer:
            # without depth information we can only compare the layer visually.
            is_duplicate = check_duplicate_layer_by_template_matching(
                previous_page, current_page, layer, img_template_probability_threshold
            )
        else:
            is_duplicate = _has_matching_layer_on_previous_page(layer, previous_layers)

        if is_duplicate:
            first_non_duplicated_layer_index = layer_index + 1  # all layers up to this layer are duplicates
            count_consecutive_non_duplicate_layers = 0
        else:
            count_consecutive_non_duplicate_layers += 1
    return sorted_layers[first_non_duplicated_layer_index:]


def _depth_interval_values(depth_interval: dict) -> tuple:
    """Return the (start, end) values of a depth interval."""
    # start values for the depth intervals may be None. End values are always explicitly set.
    start = depth_interval["start"]
    start_value = start["value"] if start is not None else None
    return start_value, depth_interval["end"].get("value")


def _has_matching_layer_on_previous_page(layer: dict, previous_layers: list[dict]) -> bool:
    """Check if a previous layer has the same depth interval and a nearly identical description as the layer."""
    min_text_similarity = 0.9  # Levenshtein ratio above which two descriptions count as the same text
    # The values of the current layer do not change while iterating, so they are extracted only once.
    current_text = layer["material_description"]["text"]
    current_depths = _depth_interval_values(layer["depth_interval"])

    for previous_layer in previous_layers:
        if "depth_interval" not in previous_layer:
            # It may happen, that a layer on the previous page does not have depth interval assigned.
            # In this case we skip the comparison. This should only happen in some edge cases, as we
            # assume that when the current page has a depth column, that the previous page also contains a
            # depth column. We assume overlapping pages and a depth column should extend over both pages.
            continue

        # The cheap depth comparison runs first; the text similarity is only computed if the depths agree.
        if (
            _depth_interval_values(previous_layer["depth_interval"]) == current_depths
            and Levenshtein.ratio(current_text, previous_layer["material_description"]["text"]) > min_text_similarity
        ):
            logger.debug("Duplicate condition met")
            return True
    return False


def check_duplicate_layer_by_template_matching(
    previous_page: fitz.Page, current_page: fitz.Page, current_layer: dict, img_template_probability_threshold: float
) -> bool:
    """Check if the current layer is a duplicate of a layer on the previous page by using template matching.

    An image of the layer is cut out of the rendered current page and searched for in the rendered previous
    page. The layer counts as a duplicate if the best normalized correlation score exceeds the threshold.

    Args:
        previous_page (fitz.Page): The previous page.
        current_page (fitz.Page): The current page.
        current_layer (dict): The current layer that is checked for a duplicate.
        img_template_probability_threshold (float): The threshold for the template matching probability
                                                    to consider a layer a duplicate.

    Returns:
        bool: True if the layer is a duplicate, False otherwise. Also False if the layer yields an empty
              template or if the template matching itself fails.
    """
    scale_factor = 3

    [x0, y_start, x1, y_end] = current_layer["material_description"]["rect"]
    # 0.2 is a magic number that works well. The lower bound of 0 prevents a negative slice index, which
    # would wrap around to the right edge of the image.
    x_start = int(scale_factor * max(min(x0, current_page.rect.width * 0.2), 0))
    x_end = int(scale_factor * min(max(x1, current_page.rect.width * 0.8), previous_page.rect.width - 1))
    y_start = int(scale_factor * max(y_start, 0))  # do not go higher up as otherwise we remove too many layers.
    y_end = int(scale_factor * min(y_end + 5, previous_page.rect.height - 1, current_page.rect.height - 1))
    # y_start and y_end define the upper and lower bound of the image used to compare to the previous page
    # and determine if there is an overlap. We add 5 pixel to y_end to add a bit more context to the image
    # as the material_description bounding box is very tight around the text. Furthermore, we need to ensure
    # that the template is smaller than the previous and the current page.
    # y_start should not be lowered further as otherwise we include potential overlap to the previous page
    # that belongs to the previous layer.

    if x_end <= x_start or y_end <= y_start:
        # An empty template cannot match anything; bail out before rendering the pages.
        logger.warning("Error in template matching. Skipping layer.")
        return False

    current_page_image = convert_page_to_opencv_img(
        current_page, scale_factor=scale_factor, color_mode=cv2.COLOR_BGR2GRAY
    )
    previous_page_image = convert_page_to_opencv_img(
        previous_page, scale_factor=scale_factor, color_mode=cv2.COLOR_BGR2GRAY
    )

    layer_image = current_page_image[y_start:y_end, x_start:x_end]
    try:
        img_template_probability_match = np.max(
            cv2.matchTemplate(previous_page_image, layer_image, cv2.TM_CCOEFF_NORMED)
        )
    except cv2.error:  # there can be strange correlation errors here.
        # Just ignore them as it is only a few over the complete dataset
        logger.warning("Error in template matching. Skipping layer.")
        return False
    # convert the numpy boolean to a plain bool to honour the declared return type.
    return bool(img_template_probability_match > img_template_probability_threshold)

0 comments on commit 459c5d9

Please sign in to comment.