Minor improvements.

swisstopo · Jun 20, 2024 · f45c853 · f45c853
1 parent a8e3002
commit f45c853
Showing 1 changed file with 40 additions and 23 deletions.
diff --git a/src/stratigraphy/util/depthcolumn.py b/src/stratigraphy/util/depthcolumn.py
@@ -262,7 +262,7 @@ def depth_intervals(self) -> list[BoundaryInterval]:
         return depth_intervals
 
     def significant_arithmetic_progression(self) -> bool:
-        if len(self.entries) < 6:
+        if len(self.entries) < 7:
             return self.is_arithmetic_progression()
         else:
             # to allow for OCR errors or gaps in the progression, we only require a segment of length 7 that is an
@@ -322,10 +322,7 @@ def is_valid(self, all_words: list[TextLine]) -> bool:
         ):
             return False
         # Check if the entries are strictly increasing.
-        if (
-            not all(i.value < j.value for i, j in zip(self.entries, self.entries[1:], strict=False))
-            and len(self.entries) < 15
-        ):
+        if not all(i.value < j.value for i, j in zip(self.entries, self.entries[1:], strict=False)):
             return False
 
         corr_coef = self.pearson_correlation_coef()
@@ -379,31 +376,43 @@ def reduce_until_valid(self, all_words: list[TextLine]) -> BoundaryDepthColumn:
         while current:
             if current.is_valid(all_words):
                 return current
-            elif current.correct_typos(all_words) is not None:
-                return current.correct_typos(all_words)
+            elif current.correct_OCR_mistakes(all_words) is not None:
+                return current.correct_OCR_mistakes(all_words)
             else:
                 current = current.remove_entry_by_correlation_gradient()
 
-    def correct_typos(self, all_words) -> BoundaryDepthColumn | None:
+    def correct_OCR_mistakes(self, all_words: list[TextLine]) -> BoundaryDepthColumn | None:
+        """Corrects OCR mistakes in the depth column entries.
+
+        Loops through all values and corrects common OCR mistakes for the given entry. Then, the column with the
+        highest Pearson correlation coefficient is selected and checked for validity.
+
+        This is useful if one entry has an OCR mistake, and the column is not valid because of it.
+
+        Note: Common mistakes should be extended as needed.
+
+        Args:
+            all_words (list[TextLine]): A list of all text lines on the page.
+
+        Returns:
+            BoundaryDepthColumn | None: The corrected depth column, or None if no correction was possible.
+        """
         new_columns = []
-        for remove_index, remove_entry in enumerate(self.entries):
+        for remove_index in range(len(self.entries)):
             new_columns.append(
-                (
-                    remove_entry,
-                    BoundaryDepthColumn(
-                        self.noise_count_threshold,
-                        self.noise_count_offset,
-                        [
-                            entry if index != remove_index else _correct_entry(entry)
-                            for index, entry in enumerate(self.entries)
-                        ],
-                    ),
-                )
+                BoundaryDepthColumn(
+                    self.noise_count_threshold,
+                    self.noise_count_offset,
+                    [
+                        entry if index != remove_index else _correct_entry(entry)
+                        for index, entry in enumerate(self.entries)
+                    ],
+                ),
             )
-        _removed_entry, best_columns = max(new_columns, key=lambda column: column[1].pearson_correlation_coef())
+        best_column = max(new_columns, key=lambda column: column.pearson_correlation_coef())
 
-        if best_columns.is_valid(all_words):
-            return best_columns
+        if best_column.is_valid(all_words):
+            return best_column
         else:
             return None
 
@@ -526,6 +535,14 @@ def identify_groups(
 
 
 def _correct_entry(entry: DepthColumnEntry) -> DepthColumnEntry:
+    """Corrects frequent OCR errors in depth column entries.
+
+    Args:
+        entry (DepthColumnEntry): The depth column entry to correct.
+
+    Returns:
+        DepthColumnEntry: The corrected depth column entry.
+    """
     text_value = str(entry.value)
     text_value = text_value.replace("4", "1")  # In older documents, OCR sometimes mistakes 1 for 4
     return DepthColumnEntry(entry.rect, float(text_value))