Merge pull request #66 from swisstopo/feat/correct_.NN
Improve depth entry recognition for cases such as '.80'.
redur authored Jun 25, 2024
2 parents b96efa8 + 5407194 commit f686692
Showing 2 changed files with 47 additions and 28 deletions.
70 changes: 43 additions & 27 deletions src/stratigraphy/util/boundarydepthcolumnvalidator.py
@@ -56,7 +56,7 @@ def is_valid(self, column: BoundaryDepthColumn, corr_coef_threshold: float = 0.9
         ):
             return False
         # Check if the entries are strictly increasing.
-        if not all(i.value < j.value for i, j in zip(column.entries, column.entries[1:], strict=False)):
+        if not column.is_strictly_increasing():
             return False
 
         corr_coef = column.pearson_correlation_coef()
@@ -89,7 +89,14 @@ def correct_OCR_mistakes(self, column: BoundaryDepthColumn) -> BoundaryDepthColu
         Loops through all values and corrects common OCR mistakes for the given entry. Then, the column with the
         highest pearson correlation coefficient is selected and checked for validity.
-        This is useful if one entry has an OCR mistake, and the column is not valid because of it.
+        This is useful if one or more entries have an OCR mistake, and the column is not valid because of it.
+        Currently, there is no limit on the number of corrections per depth column. Indeed, there are examples of depth
+        columns with multiple OCR errors on different depth values. On the other hand, allowing an unlimited number of
+        corrections increases the risk, that a random column of different values is incorrectly accepted as a depth
+        column after making the corrections, especially if the column has a low number of entries. A more robust
+        solution might be to allow corrections on less than 50% of all entries, or something similar. However, we
+        currently don't have enough examples to properly tune this parameter.
         Note: Common mistakes should be extended as needed.
@@ -99,34 +106,43 @@ def correct_OCR_mistakes(self, column: BoundaryDepthColumn) -> BoundaryDepthColu
         Returns:
             BoundaryDepthColumn | None: The corrected depth column, or None if no correction was possible.
         """
-        new_columns = []
-        for remove_index in range(len(column.entries)):
-            new_columns.append(
-                BoundaryDepthColumn(
-                    [
-                        entry if index != remove_index else _correct_entry(entry)
-                        for index, entry in enumerate(column.entries)
-                    ],
-                ),
-            )
-        best_column = max(new_columns, key=lambda column: column.pearson_correlation_coef())
-
-        # We require a higher correlation coefficient when we've already corrected a mistake.
-        if self.is_valid(best_column, corr_coef_threshold=0.999):
-            return best_column
-        else:
-            return None
-
-
-def _correct_entry(entry: DepthColumnEntry) -> DepthColumnEntry:
+        new_columns = [BoundaryDepthColumn()]
+        for entry in column.entries:
+            new_columns = [
+                BoundaryDepthColumn([*column.entries, DepthColumnEntry(entry.rect, new_value)])
+                for column in new_columns
+                for new_value in _value_alternatives(entry.value)
+            ]
+            # Immediately require strictly increasing values, to avoid exponential complexity when many implausible
+            # alternative values are suggested
+            new_columns = [column for column in new_columns if column.is_strictly_increasing()]
+
+        if len(new_columns):
+            best_column = max(new_columns, key=lambda column: column.pearson_correlation_coef())
+
+            # We require a higher correlation coefficient when we've already corrected a mistake.
+            if self.is_valid(best_column, corr_coef_threshold=0.999):
+                return best_column
+
+        return None
+
+
+def _value_alternatives(value: float) -> set[float]:
     """Corrects frequent OCR errors in depth column entries.
     Args:
-        entry (DepthColumnEntry): The depth column entry to correct.
+        value (float): The depth values to find plausible alternatives for
     Returns:
-        DepthColumnEntry: The corrected depth column entry.
+        set(float): all plausible values (including the original one)
     """
-    text_value = str(entry.value)
-    text_value = text_value.replace("4", "1")  # In older documents, OCR sometimes mistakes 1 for 4
-    return DepthColumnEntry(entry.rect, float(text_value))
+    alternatives = {value}
+    # In older documents, OCR sometimes mistakes 1 for 4
+    alternatives.add(float(str(value).replace("4", "1")))
+
+    # replace a pattern such as '.80' with '0.80'. These cases are already converted
+    # to '80.0' when depth entries are recognized.
+    if value.is_integer():
+        alternatives.add(value / 100)
+
+    return alternatives
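The candidate values produced by the new _value_alternatives logic can be seen with a small standalone sketch (illustration only, not part of the commit; the helper is copied here under a hypothetical name so the snippet runs without the repository):

def value_alternatives(value: float) -> set[float]:
    """Mirror of the new _value_alternatives helper, for illustration only."""
    alternatives = {value}
    # OCR in older documents sometimes reads '1' as '4'; try the reverse substitution.
    alternatives.add(float(str(value).replace("4", "1")))
    # An entry such as '.80' is recognized as 80.0; dividing by 100 restores the intended depth 0.8.
    if value.is_integer():
        alternatives.add(value / 100)
    return alternatives

print(sorted(value_alternatives(80.0)))  # [0.8, 80.0]         <- the '.80' case from the commit title
print(sorted(value_alternatives(14.0)))  # [0.14, 11.0, 14.0]  <- a '4' may really be a '1'
print(sorted(value_alternatives(2.4)))   # [2.1, 2.4]          <- non-integers only get the 4->1 variant

correct_OCR_mistakes then combines these per-entry alternatives into candidate columns, prunes any candidate that is not strictly increasing after each entry, and keeps the best-correlating candidate only if it passes validation at the stricter 0.999 threshold.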
5 changes: 4 additions & 1 deletion src/stratigraphy/util/depthcolumn.py
@@ -229,11 +229,14 @@ def valid_initial_segment(self, rect: fitz.Rect) -> BoundaryDepthColumn:
             return initial_segment
         return BoundaryDepthColumn()
 
-    def strictly_contains(self, other: BoundaryDepthColumn):
+    def strictly_contains(self, other: BoundaryDepthColumn) -> bool:
         return len(other.entries) < len(self.entries) and all(
             other_entry in self.entries for other_entry in other.entries
         )
 
+    def is_strictly_increasing(self) -> bool:
+        return all(i.value < j.value for i, j in zip(self.entries, self.entries[1:], strict=False))
+
     def depth_intervals(self) -> list[BoundaryInterval]:
         """Creates a list of depth intervals from the depth column entries.
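For reference, the pairwise comparison behind the new is_strictly_increasing method, sketched with plain floats in place of DepthColumnEntry objects (illustration only, not from the repository):

def is_strictly_increasing(values: list[float]) -> bool:
    # Each value must be smaller than its direct successor; zip pairs (v[0], v[1]), (v[1], v[2]), ...
    return all(a < b for a, b in zip(values, values[1:]))

print(is_strictly_increasing([0.8, 1.2, 2.5]))   # True
print(is_strictly_increasing([0.8, 80.0, 2.5]))  # False: a '.80' misread as 80.0 breaks the ordering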

1 comment on commit f686692

@github-actions (bot) commented:

Coverage Report
File                               Stmts   Miss   Cover   Missing
src/stratigraphy
  __init__.py                          8      1     88%   11
  extract.py                         188    188      0%   3–482
  get_files.py                        21     21      0%   3–48
  line_detection.py                   26     26      0%   3–76
  main.py                             94     94      0%   3–237
src/stratigraphy/util
  boundarydepthcolumnvalidator.py     41     20     51%   47, 57, 60, 81–84, 109–127, 139–148
  coordinate_extraction.py           127      7     94%   31, 62, 75–76, 80, 205, 328
  dataclasses.py                      32      3     91%   37–39
  depthcolumn.py                     194     64     67%   26, 30, 51, 57, 60–61, 85, 88, 95, 102, 110–111, 121, 138–154, 192, 229, 248–256, 267, 272, 279, 310, 315–322, 337–338, 381–423
  depthcolumnentry.py                 26      7     73%   12, 15, 29–30, 33, 45, 52
  description_block_splitter.py       70      2     97%   24, 139
  draw.py                             80     80      0%   3–244
  duplicate_detection.py              51     51      0%   3–146
  extract_text.py                     27      2     93%   38–39
  find_depth_columns.py               91      6     93%   42–43, 71, 83, 176–177
  find_description.py                 63     28     56%   27–35, 50–63, 79–95, 172–175
  geometric_line_utilities.py         86      2     98%   82, 132
  interval.py                        106     55     48%   24–27, 31–34, 39, 44, 47, 57–59, 99–145, 166, 171–187
  language_detection.py               18     18      0%   3–45
  layer_identifier_column.py          91     91      0%   3–227
  line.py                             49      4     92%   25, 42, 51, 98
  linesquadtree.py                    46      1     98%   76
  plot_utils.py                       43     43      0%   3–120
  predictions.py                     187    187      0%   3–387
  textblock.py                        74      8     89%   27, 51, 63, 75, 98, 119, 127, 155
  util.py                             40     18     55%   22, 40–47, 61–63, 87–88, 100–105
TOTAL                               1879   1027     45%

Tests: 61   Skipped: 0 💤   Failures: 0 ❌   Errors: 0 🔥   Time: 0.925s ⏱️
