Skip to content

Commit

Permalink
Merge pull request #36 from swisstopo/feat/remove_duplicate_layers
Browse files Browse the repository at this point in the history
Feat/remove duplicate layers
  • Loading branch information
redur authored May 3, 2024
2 parents 178b1b5 + ad26ce5 commit 6256722
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 9 deletions.
1 change: 1 addition & 0 deletions config/matching_params.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@

block_line_ratio: 0.20
left_line_length_threshold: 7
img_template_probability_threshold: 0.62

material_description:
de:
Expand Down
5 changes: 4 additions & 1 deletion src/stratigraphy/benchmark/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,10 @@ def get_scores(
if len(document_level_metrics["precision"]):
overall_precision = sum(document_level_metrics["precision"]) / len(document_level_metrics["precision"])
overall_recall = sum(document_level_metrics["recall"]) / len(document_level_metrics["recall"])
overall_depth_interval_accuracy = sum(depth_interval_accuracies) / len(depth_interval_accuracies)
try:
overall_depth_interval_accuracy = sum(depth_interval_accuracies) / len(depth_interval_accuracies)
except ZeroDivisionError:
overall_depth_interval_accuracy = None
else:
overall_precision = 0
overall_recall = 0
Expand Down
11 changes: 9 additions & 2 deletions src/stratigraphy/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from stratigraphy.extract import process_page
from stratigraphy.line_detection import extract_lines, line_detection_params
from stratigraphy.util.draw import draw_predictions
from stratigraphy.util.duplicate_detection import remove_duplicate_layers
from stratigraphy.util.language_detection import detect_language_of_document
from stratigraphy.util.plot_utils import plot_lines
from stratigraphy.util.util import flatten, read_params
Expand Down Expand Up @@ -117,7 +118,6 @@ def start_pipeline(
Note: This function is used to be called from the label-studio backend, whereas the click_pipeline function
is called from the CLI.
\f
Args:
input_directory (Path): The directory containing the pdf files. Can also be the path to a single pdf file.
ground_truth_path (Path): The path to the ground truth file json file.
Expand Down Expand Up @@ -172,7 +172,14 @@ def start_pipeline(
layer_predictions, depths_materials_column_pairs = process_page(
page, geometric_lines, language, **matching_params
)

# Remove layers that duplicate layers already found on the previous page.
if page_index > 0:
layer_predictions = remove_duplicate_layers(
doc[page_index - 1],
page,
layer_predictions,
matching_params["img_template_probability_threshold"],
)
predictions[filename][f"page_{page_number}"] = {
"layers": layer_predictions,
"depths_materials_column_pairs": depths_materials_column_pairs,
Expand Down
81 changes: 81 additions & 0 deletions src/stratigraphy/util/duplicate_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
"""This module contains functionality for detecting duplicate layers across pdf pages."""

import logging

import cv2
import fitz
import numpy as np

from stratigraphy.util.plot_utils import convert_page_to_opencv_img

logger = logging.getLogger(__name__)


def remove_duplicate_layers(
    previous_page: fitz.Page,
    current_page: fitz.Page,
    layer_predictions: list[dict],
    img_template_probability_threshold: float,
) -> list[dict]:
    """Remove duplicate layers from the current page based on the content of the previous page.

    Layers are processed from top to bottom (sorted by the upper edge of their material description
    rectangle). For each layer, a horizontal strip of the rendered current page is cut out and searched
    for in the rendered previous page using normalized template matching. If the best match exceeds the
    threshold, that layer and all layers above it are considered duplicates. After three consecutive
    layers without a match, we assume that there is no further overlap between the pages and stop.

    Args:
        previous_page (fitz.Page): The previous page.
        current_page (fitz.Page): The current page containing the layers.
        layer_predictions (list[dict]): The layers of the current page. Each layer must provide
            layer["material_description"]["rect"] as a sequence [x0, y0, x1, y1].
        img_template_probability_threshold (float): The threshold for the template matching probability
            above which a layer is considered a duplicate.

    Returns:
        list[dict]: The layers of the current page, sorted from top to bottom, without duplicates.
    """
    if not layer_predictions:
        # Nothing to deduplicate, so avoid rendering both pages.
        return []

    scale_factor = 3
    context_margin = 5  # added below the text (in page units, before scaling); the rect is very tight
    max_consecutive_non_duplicates = 3

    # NOTE(review): convert_page_to_opencv_img defaults to an RGB->BGR conversion, which suggests the raw
    # image is RGB; COLOR_BGR2GRAY would then swap the red/blue weights. Both pages are converted the
    # same way and the threshold was presumably tuned with this setting, so it is kept as is - confirm.
    current_page_image = convert_page_to_opencv_img(
        current_page, scale_factor=scale_factor, color_mode=cv2.COLOR_BGR2GRAY
    )
    previous_page_image = convert_page_to_opencv_img(
        previous_page, scale_factor=scale_factor, color_mode=cv2.COLOR_BGR2GRAY
    )

    # Loop-invariant bounds of the strip that is cut out of the current page.
    current_page_width = current_page.rect.width
    left_bound = current_page_width * 0.2  # 0.2 is a magic number that works well
    right_bound = current_page_width * 0.8
    # The template has to be smaller than both the previous and the current page.
    max_x = previous_page.rect.width - 1
    max_y = min(previous_page.rect.height - 1, current_page.rect.height - 1)

    sorted_layers = sorted(layer_predictions, key=lambda x: x["material_description"]["rect"][1])
    first_non_duplicated_layer_index = 0
    count_consecutive_non_duplicate_layers = 0
    for layer_index, layer in enumerate(sorted_layers):
        x0, y0, x1, y1 = layer["material_description"]["rect"]
        x_start = int(scale_factor * min(x0, left_bound))
        x_end = int(scale_factor * min(max(x1, right_bound), max_x))
        # The upper bound is not moved further up: otherwise the strip could include content of the
        # layer above, and too many layers would be removed.
        y_start = int(scale_factor * max(y0, 0))
        y_end = int(scale_factor * min(y1 + context_margin, max_y))

        layer_image = current_page_image[y_start:y_end, x_start:x_end]
        try:
            img_template_probability_match = np.max(
                cv2.matchTemplate(previous_page_image, layer_image, cv2.TM_CCOEFF_NORMED)
            )
        except cv2.error:  # there can be strange correlation errors here.
            # Just ignore them as it is only a few over the complete dataset
            logger.warning("Error in template matching. Skipping layer.")
            img_template_probability_match = 0

        if img_template_probability_match > img_template_probability_threshold:
            first_non_duplicated_layer_index = layer_index + 1  # this layer and all above are duplicates
            count_consecutive_non_duplicate_layers = 0
        else:
            count_consecutive_non_duplicate_layers += 1
            if count_consecutive_non_duplicate_layers >= max_consecutive_non_duplicates:
                # Three consecutive non-duplicate layers: assume there is no further page overlap.
                break
    return sorted_layers[first_non_duplicated_layer_index:]
18 changes: 14 additions & 4 deletions src/stratigraphy/util/plot_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,20 @@ def _draw_lines(open_cv_img, lines, scale_factor=1):
return open_cv_img


def _convert_page_to_opencv_img(page, scale_factor):
def convert_page_to_opencv_img(page: fitz.Page, scale_factor: float, color_mode: int = cv2.COLOR_RGB2BGR) -> np.ndarray:
    """Converts a fitz.Page object to an OpenCV image.

    Args:
        page (fitz.Page): The page to convert to an OpenCV image.
        scale_factor (float): Scale factor applied to both axes when rendering the page.
        color_mode (int, optional): The OpenCV color conversion code passed to cv2.cvtColor.
            Defaults to cv2.COLOR_RGB2BGR.

    Returns:
        np.ndarray: The OpenCV image.
    """
    pix = page.get_pixmap(matrix=fitz.Matrix(scale_factor, scale_factor))
    # The reshape assumes three channels per pixel (i.e. a pixmap without alpha channel).
    img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, 3)
    return cv2.cvtColor(img, color_mode)


Expand Down Expand Up @@ -76,7 +86,7 @@ def plot_lines(page: fitz.Page, lines: list[Line], scale_factor: float = 2) -> c
lines (ArrayLike): The lines detected in the pdf.
scale_factor (float, optional): The scale factor to apply to the pdf. Defaults to 2.
"""
open_cv_img = _convert_page_to_opencv_img(page, scale_factor=scale_factor)
open_cv_img = convert_page_to_opencv_img(page, scale_factor=scale_factor)

open_cv_img = _draw_lines(open_cv_img, lines, scale_factor=scale_factor)

Expand All @@ -103,7 +113,7 @@ def draw_blocks_and_lines(page: fitz.Page, blocks: list[TextBlock], lines: list[
color=fitz.utils.getColor("orange"),
)

open_cv_img = _convert_page_to_opencv_img(page, scale_factor=2)
open_cv_img = convert_page_to_opencv_img(page, scale_factor=2)

if lines is not None:
open_cv_img = _draw_lines(open_cv_img, lines, scale_factor=scale_factor)
Expand Down
10 changes: 8 additions & 2 deletions src/stratigraphy/util/predictions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""This module contains classes for predictions."""

import contextlib
import logging
import uuid
from collections import defaultdict
from dataclasses import dataclass, field
Expand All @@ -14,6 +15,8 @@
from stratigraphy.util.textblock import MaterialDescription, TextBlock
from stratigraphy.util.util import parse_text

logger = logging.getLogger(__name__)


@dataclass
class LayerPrediction:
Expand All @@ -36,6 +39,10 @@ class PagePredictions:
page_height: int
depths_materials_columns_pairs: list[dict] = None

def __post_init__(self):
    """Sort layers by their occurrence on the page (top to bottom).

    The dataclass machinery only calls a hook named ``__post_init__``; under the previous,
    misspelled name ``__post__init__`` this method was never invoked automatically.
    """
    self.layers = sorted(self.layers, key=lambda layer: layer.material_description.rect.y0)


# Backward-compatible alias for the original (misspelled) method name.
__post__init__ = __post_init__


class FilePredictions:
"""A class to represent predictions for a single file."""
Expand All @@ -44,8 +51,7 @@ def __init__(self, pages: list[PagePredictions], file_name: str, language: str):
self.pages = pages
self.file_name = file_name
self.language = language
if self.pages:
self.layers = sum([page.layers for page in self.pages], [])
self.layers = sum([page.layers for page in self.pages], [])

@staticmethod
def create_from_json(predictions_for_file: dict, file_name: str):
Expand Down

1 comment on commit 6256722

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
FileStmtsMissCoverMissing
src/stratigraphy
   __init__.py8188%11
   extract.py1891890%3–460
   get_files.py21210%3–48
   line_detection.py29290%3–88
   main.py85850%3–225
src/stratigraphy/util
   dataclasses.py30390%34–36
   depthcolumn.py2046767%26, 30, 51, 57, 60–61, 85, 88, 95, 102, 110–111, 121, 138–154, 187, 210, 226–234, 244, 249, 256, 263, 268, 286, 296, 299–306, 321–322, 365–407
   depthcolumnentry.py20480%12, 15, 27, 34
   description_block_splitter.py70297%24, 139
   draw.py62620%3–184
   duplicate_detection.py32320%3–81
   find_depth_columns.py82495%57–58, 149–150
   find_description.py39685%27–34, 111–114
   geometric_line_utilities.py1233770%74–88, 111–115, 214–237, 261, 311
   interval.py1075251%25–28, 32–35, 40, 45, 48, 100–146, 166, 171–187
   language_detection.py18180%3–43
   line.py492647%25, 42, 51, 65–95, 98
   plot_utils.py44440%3–121
   predictions.py1541540%3–322
   textblock.py74889%27, 51, 63, 75, 98, 119, 127, 155
   util.py402245%15–18, 22, 26, 40–47, 61–63, 87–88, 100–105
TOTAL148086641% 

Tests Skipped Failures Errors Time
45 0 💤 0 ❌ 0 🔥 1.057s ⏱️

Please sign in to comment.