Add comments.

swisstopo · May 31, 2024 · dc1798b · dc1798b
1 parent a68635c
commit dc1798b
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 3 deletions.
diff --git a/config/matching_params.yml b/config/matching_params.yml
@@ -3,7 +3,7 @@ block_line_ratio: 0.20
 left_line_length_threshold: 7
 img_template_probability_threshold: 0.62
 
-depth_column_params:
+depth_column_params:  # these params should be optimized as soon as there is reliable evaluation data
   noise_count_threshold: 1.25
   noise_count_offset: 2.5
 

diff --git a/src/stratigraphy/util/depthcolumn.py b/src/stratigraphy/util/depthcolumn.py
@@ -286,7 +286,7 @@ def is_arithmetic_progression(self) -> bool:
             return False
 
         scale_pearson_correlation_coef = np.corrcoef(entries, progression)[0, 1].item()
-        if len(self.entries) < 6:
+        if len(self.entries) < 6:  # Fewer entries are more likely to be highly correlated by accident
             return abs(scale_pearson_correlation_coef) >= 0.9999
         else:
             return abs(scale_pearson_correlation_coef) >= 0.999
@@ -312,7 +312,10 @@ def is_valid(self, all_words: list[TextLine]) -> bool:
         if len(self.entries) < 3:
             return False
 
-        # When too much other text is in the column, then it is probably not valid
+        # When too much other text is in the column, then it is probably not valid.
+        # The quadratic behavior of the noise count check makes the check stricter for columns with few entries
+        # than columns with more entries. The more entries we have, the less likely it is that we found them by chance.
+        # TODO: Once the evaluation data is of good enough quality, we should optimize the parameters below.
         if (
             self.noise_count(all_words)
             > self.noise_count_threshold * (len(self.entries) - self.noise_count_offset) ** 2