Merge pull request #52 from swisstopo/feat/improve_is_valid

Feat/improve is valid
swisstopo · May 31, 2024 · 71bfee2 · 71bfee2 · github-actions · May 31, 2024
2 parents f575b6d + 0f2fc22
commit 71bfee2
Show file tree

Hide file tree

Showing 6 changed files with 34 additions and 10 deletions.
diff --git a/config/matching_params.yml b/config/matching_params.yml
@@ -3,9 +3,9 @@ block_line_ratio: 0.20
 left_line_length_threshold: 7
 img_template_probability_threshold: 0.62
 
-depth_column_params:
+depth_column_params:  # these params should be optimized as soon as there is reliable evaluation data
   noise_count_threshold: 1.25
-  noise_count_offset: 0
+  noise_count_offset: 2.5
 
 material_description:
   de:

diff --git a/src/stratigraphy/benchmark/score.py b/src/stratigraphy/benchmark/score.py
@@ -248,7 +248,6 @@ def evaluate_layer_extraction(predictions: dict, number_of_truth_values: dict) -
                 _metrics[metric_type] = value
             else:
                 _metrics[f"{language}_{metric_type}"] = value
-    print(_metrics)
     return _metrics, document_level_metrics
 
 

diff --git a/src/stratigraphy/util/depthcolumn.py b/src/stratigraphy/util/depthcolumn.py
@@ -286,7 +286,10 @@ def is_arithmetic_progression(self) -> bool:
             return False
 
         scale_pearson_correlation_coef = np.corrcoef(entries, progression)[0, 1].item()
-        return abs(scale_pearson_correlation_coef) >= 0.999
+        if len(self.entries) < 6:  # It is more likely that fewer entries are accidentally very much correlated
+            return abs(scale_pearson_correlation_coef) >= 0.9999
+        else:
+            return abs(scale_pearson_correlation_coef) >= 0.999
 
     def is_valid(self, all_words: list[TextLine]) -> bool:
         """Checks whether the depth column is valid.
@@ -309,8 +312,14 @@ def is_valid(self, all_words: list[TextLine]) -> bool:
         if len(self.entries) < 3:
             return False
 
-        # When too much other text is in the column, then it is probably not valid
-        if self.noise_count(all_words) > self.noise_count_threshold * len(self.entries) - self.noise_count_offset:
+        # When too much other text is in the column, then it is probably not valid.
+        # The quadratic behavior of the noise count check makes the check stricter for columns with few entries
+        # than columns with more entries. The more entries we have, the less likely it is that we found them by chance.
+        # TODO: Once evaluation data is of good enough quality, we should optimize the parameters below.
+        if (
+            self.noise_count(all_words)
+            > self.noise_count_threshold * (len(self.entries) - self.noise_count_offset) ** 2
+        ):
             return False
 
         corr_coef = self.pearson_correlation_coef()

diff --git a/src/stratigraphy/util/find_depth_columns.py b/src/stratigraphy/util/find_depth_columns.py
@@ -25,7 +25,9 @@ def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> lis
     for word in sorted(all_words, key=lambda word: word.rect.y0):
         try:
             input_string = word.text.strip().replace(",", ".")
-            regex = re.compile(r"^-?([0-9]+(\.[0-9]+)?)[müMN\\.]*$")
+            regex = re.compile(r"^-?\.?([0-9]+(\.[0-9]+)?)[müMN\\.]*$")
+            # numbers such as '.40' are not supported. The reason is that sometimes the OCR
+            # recognizes a '-' as a '.' and we just omit the leading '.' to avoid this issue.
             match = regex.match(input_string)
             if match:
                 value = value_as_float(match.group(1))
@@ -191,7 +193,6 @@ def find_depth_columns(
         # that does not match the descriptions
         if not column.significant_arithmetic_progression()
     ]
-
     return sorted(
         [column for column in numeric_columns if column and column.is_valid(all_words)],
         key=lambda column: len(column.entries),

diff --git a/src/stratigraphy/util/language_detection.py b/src/stratigraphy/util/language_detection.py
@@ -36,8 +36,8 @@ def detect_language_of_document(doc: fitz.Document) -> str:
     try:
         language = detect(text)
     except LangDetectException:
-        language = "de"
+        language = "de"  # TODO: default language should be read from config
 
-    if language not in ["de", "fr"]:
+    if language not in ["de", "fr"]:  # TODO: This should be read from the config
         language = "de"
     return language
diff --git a/tests/test_find_depth_columns.py b/tests/test_find_depth_columns.py
@@ -35,6 +35,21 @@ def test_depth_column_entries_with_splits():  # noqa: D103
     assert entries[3].value == 40.0, "The fourth entry should have a value of 40.0"
 
 
+def test_depth_column_entries_with_leading_character():  # noqa: D103
+    all_words = [
+        TextWord(fitz.Rect(0, 0, 5, 1), "0.00m"),
+        TextWord(fitz.Rect(0, 2, 5, 3), ".2m"),  # this is a test for an ocr error from '-2m' to '.2m'
+        TextWord(fitz.Rect(0, 4, 5, 5), "-3.0m"),
+        TextWord(fitz.Rect(0, 6, 5, 7), ".4.2m"),
+    ]
+    entries = depth_column_entries(all_words, include_splits=True)
+    assert len(entries) == 4, "There should be 4 entries"
+    assert entries[0].value == 0.0, "The first entry should have a value of 0"
+    assert entries[1].value == 2.0, "The second entry should have a value of 2.0"
+    assert entries[2].value == 3.0, "The third entry should have a value of 3.0"
+    assert entries[3].value == 4.2, "The fourth entry should have a value of 4.2"
+
+
 all_words_find_depth_column = [
     TextWord(fitz.Rect(0, 0, 5, 1), "10.00m"),
     TextWord(fitz.Rect(20, 0, 30, 1), "Kies, Torf und Sand"),
File	Stmts	Miss	Cover	Missing
src/stratigraphy
__init__.py	8	1	88%	11
extract.py	210	210	0%	3–506
get_files.py	21	21	0%	3–48
line_detection.py	26	26	0%	3–76
main.py	91	91	0%	3–232
src/stratigraphy/util
coordinate_extraction.py	116	20	83%	25, 45, 49, 53, 57–65, 86, 171, 191, 280, 283–284, 288, 300
dataclasses.py	32	3	91%	37–39
depthcolumn.py	208	67	68%	26, 30, 51, 57, 60–61, 85, 88, 95, 102, 110–111, 121, 138–154, 199, 238, 254–262, 274, 279, 286, 313, 323, 352, 373, 376–387, 402–403, 448–490
depthcolumnentry.py	20	4	80%	12, 15, 27, 34
description_block_splitter.py	70	2	97%	24, 139
draw.py	73	73	0%	3–225
duplicate_detection.py	51	51	0%	3–146
find_depth_columns.py	89	6	93%	41–42, 70, 82, 175–176
find_description.py	63	28	56%	27–35, 50–63, 79–95, 172–175
geometric_line_utilities.py	86	2	98%	82, 132
interval.py	107	52	51%	25–28, 32–35, 40, 45, 48, 100–146, 167, 172–188
language_detection.py	18	18	0%	3–43
layer_identifier_column.py	91	91	0%	3–227
line.py	49	26	47%	25, 42, 51, 65–95, 98
linesquadtree.py	46	1	98%	76
plot_utils.py	43	43	0%	3–120
predictions.py	186	186	0%	3–386
textblock.py	74	8	89%	27, 51, 63, 75, 98, 119, 127, 155
util.py	40	22	45%	15–18, 22, 26, 40–47, 61–63, 87–88, 100–105
TOTAL	1818	1052	42%