Skip to content

Commit

Permalink
Improve is_valid criterion.
Browse files Browse the repository at this point in the history
The noise check within the is_valid criterion is now adjusted to check
short depth columns (i.e. few entries) more strictly than longer depth columns (i.e. more entries).
This is achieved by applying a quadratic behavior onto the number of entries.
  • Loading branch information
redur committed May 31, 2024
1 parent 9f25f7d commit c9804e9
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 8 deletions.
4 changes: 2 additions & 2 deletions config/matching_params.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ block_line_ratio: 0.20
left_line_length_threshold: 7
img_template_probability_threshold: 0.62

depth_column_params:
depth_column_params: # these params should be optimized as soon as there is reliable evaluation data
noise_count_threshold: 1.25
noise_count_offset: 0
noise_count_offset: 2.5

material_description:
de:
Expand Down
1 change: 0 additions & 1 deletion src/stratigraphy/benchmark/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,6 @@ def evaluate_layer_extraction(predictions: dict, number_of_truth_values: dict) -
_metrics[metric_type] = value
else:
_metrics[f"{language}_{metric_type}"] = value
print(_metrics)
return _metrics, document_level_metrics


Expand Down
15 changes: 12 additions & 3 deletions src/stratigraphy/util/depthcolumn.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,10 @@ def is_arithmetic_progression(self) -> bool:
return False

scale_pearson_correlation_coef = np.corrcoef(entries, progression)[0, 1].item()
return abs(scale_pearson_correlation_coef) >= 0.999
if len(self.entries) < 6:  # Fewer entries are more likely to be highly correlated by accident
return abs(scale_pearson_correlation_coef) >= 0.9999
else:
return abs(scale_pearson_correlation_coef) >= 0.999

def is_valid(self, all_words: list[TextLine]) -> bool:
"""Checks whether the depth column is valid.
Expand All @@ -309,8 +312,14 @@ def is_valid(self, all_words: list[TextLine]) -> bool:
if len(self.entries) < 3:
return False

# When too much other text is in the column, then it is probably not valid
if self.noise_count(all_words) > self.noise_count_threshold * len(self.entries) - self.noise_count_offset:
# When too much other text is in the column, then it is probably not valid.
# The quadratic behavior of the noise count check makes the check stricter for columns with few entries
# than for columns with more entries. The more entries we have, the less likely it is that we found them by chance.
# TODO: Once the evaluation data is of good enough quality, we should optimize the parameters below.
if (
self.noise_count(all_words)
> self.noise_count_threshold * (len(self.entries) - self.noise_count_offset) ** 2
):
return False

corr_coef = self.pearson_correlation_coef()
Expand Down
3 changes: 1 addition & 2 deletions src/stratigraphy/util/find_depth_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> lis
for word in sorted(all_words, key=lambda word: word.rect.y0):
try:
input_string = word.text.strip().replace(",", ".")
regex = re.compile(r"^-?([0-9]+(\.[0-9]+)?)[müMN\\.]*$")
regex = re.compile(r"^-?\.?([0-9]+(\.[0-9]+)?)[müMN\\.]*$")
match = regex.match(input_string)
if match:
value = value_as_float(match.group(1))
Expand Down Expand Up @@ -191,7 +191,6 @@ def find_depth_columns(
# that does not match the descriptions
if not column.significant_arithmetic_progression()
]

return sorted(
[column for column in numeric_columns if column and column.is_valid(all_words)],
key=lambda column: len(column.entries),
Expand Down

0 comments on commit c9804e9

Please sign in to comment.