Skip to content

Commit

Permalink
Add comments.
Browse files Browse the repository at this point in the history
  • Loading branch information
redur committed May 31, 2024
1 parent a68635c commit dc1798b
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 3 deletions.
2 changes: 1 addition & 1 deletion config/matching_params.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ block_line_ratio: 0.20
left_line_length_threshold: 7
img_template_probability_threshold: 0.62

depth_column_params:
depth_column_params: # these params should be optimized as soon as there is reliable evaluation data
noise_count_threshold: 1.25
noise_count_offset: 2.5

Expand Down
7 changes: 5 additions & 2 deletions src/stratigraphy/util/depthcolumn.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def is_arithmetic_progression(self) -> bool:
return False

scale_pearson_correlation_coef = np.corrcoef(entries, progression)[0, 1].item()
if len(self.entries) < 6:
if len(self.entries) < 6: # With fewer entries, it is more likely that they are accidentally highly correlated
return abs(scale_pearson_correlation_coef) >= 0.9999
else:
return abs(scale_pearson_correlation_coef) >= 0.999
Expand All @@ -312,7 +312,10 @@ def is_valid(self, all_words: list[TextLine]) -> bool:
if len(self.entries) < 3:
return False

# When too much other text is in the column, then it is probably not valid
# When too much other text is in the column, then it is probably not valid.
# The quadratic behavior of the noise count check makes the check stricter for columns with few entries
# than for columns with more entries. The more entries we have, the less likely it is that we found them by chance.
# TODO: Once the evaluation data is of good enough quality, we should optimize the parameters below.
if (
self.noise_count(all_words)
> self.noise_count_threshold * (len(self.entries) - self.noise_count_offset) ** 2
Expand Down

0 comments on commit dc1798b

Please sign in to comment.