From 9f25f7df9c5081c5318e5a96dd71d80a5568e722 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Wed, 29 May 2024 10:55:44 +0200 Subject: [PATCH 1/3] Add TODO comments regarding language configuration. --- src/stratigraphy/util/language_detection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stratigraphy/util/language_detection.py b/src/stratigraphy/util/language_detection.py index 9f0a21b5..8a313e2d 100644 --- a/src/stratigraphy/util/language_detection.py +++ b/src/stratigraphy/util/language_detection.py @@ -36,8 +36,8 @@ def detect_language_of_document(doc: fitz.Document) -> str: try: language = detect(text) except LangDetectException: - language = "de" + language = "de" # TODO: default language should be read from config - if language not in ["de", "fr"]: + if language not in ["de", "fr"]: # TODO: This should be read from the config language = "de" return language From c9804e9e58c68506344ff08ba8b2d71ab0b5db07 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Thu, 30 May 2024 13:42:39 +0200 Subject: [PATCH 2/3] Improve is_valid criterion. The noise check within the is_valid criterion is now adjusted to check short depth columns (i.e. few entries) more strictly than longer depth columns (i.e. more entries). This is achieved by applying a quadratic behavior onto the number of entries. 
--- config/matching_params.yml | 4 ++-- src/stratigraphy/benchmark/score.py | 1 - src/stratigraphy/util/depthcolumn.py | 15 ++++++++++++--- src/stratigraphy/util/find_depth_columns.py | 3 +-- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/config/matching_params.yml b/config/matching_params.yml index 10455cf2..e3885dec 100644 --- a/config/matching_params.yml +++ b/config/matching_params.yml @@ -3,9 +3,9 @@ block_line_ratio: 0.20 left_line_length_threshold: 7 img_template_probability_threshold: 0.62 -depth_column_params: +depth_column_params: # these params should be optimized as soon as there is reliable evaluation data noise_count_threshold: 1.25 - noise_count_offset: 0 + noise_count_offset: 2.5 material_description: de: diff --git a/src/stratigraphy/benchmark/score.py b/src/stratigraphy/benchmark/score.py index b7d95229..9232f1d2 100644 --- a/src/stratigraphy/benchmark/score.py +++ b/src/stratigraphy/benchmark/score.py @@ -248,7 +248,6 @@ def evaluate_layer_extraction(predictions: dict, number_of_truth_values: dict) - _metrics[metric_type] = value else: _metrics[f"{language}_{metric_type}"] = value - print(_metrics) return _metrics, document_level_metrics diff --git a/src/stratigraphy/util/depthcolumn.py b/src/stratigraphy/util/depthcolumn.py index c426e89c..cc972a96 100644 --- a/src/stratigraphy/util/depthcolumn.py +++ b/src/stratigraphy/util/depthcolumn.py @@ -286,7 +286,10 @@ def is_arithmetic_progression(self) -> bool: return False scale_pearson_correlation_coef = np.corrcoef(entries, progression)[0, 1].item() - return abs(scale_pearson_correlation_coef) >= 0.999 + if len(self.entries) < 6: # Fewer entries are more likely to be highly correlated by accident + return abs(scale_pearson_correlation_coef) >= 0.9999 + else: + return abs(scale_pearson_correlation_coef) >= 0.999 def is_valid(self, all_words: list[TextLine]) -> bool: """Checks whether the depth column is valid. 
@@ -309,8 +312,14 @@ def is_valid(self, all_words: list[TextLine]) -> bool: if len(self.entries) < 3: return False - # When too much other text is in the column, then it is probably not valid - if self.noise_count(all_words) > self.noise_count_threshold * len(self.entries) - self.noise_count_offset: + # When too much other text is in the column, then it is probably not valid. + # The quadratic behavior of the noise count check makes the check stricter for columns with few entries + # than columns with more entries. The more entries we have, the less likely it is that we found them by chance. + # TODO: Once evaluation data is of good enough quality, we should optimize the parameters below. + if ( + self.noise_count(all_words) + > self.noise_count_threshold * (len(self.entries) - self.noise_count_offset) ** 2 + ): return False corr_coef = self.pearson_correlation_coef() diff --git a/src/stratigraphy/util/find_depth_columns.py b/src/stratigraphy/util/find_depth_columns.py index 5ba730a0..c5d18777 100644 --- a/src/stratigraphy/util/find_depth_columns.py +++ b/src/stratigraphy/util/find_depth_columns.py @@ -25,7 +25,7 @@ def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> lis for word in sorted(all_words, key=lambda word: word.rect.y0): try: input_string = word.text.strip().replace(",", ".") - regex = re.compile(r"^-?([0-9]+(\.[0-9]+)?)[müMN\\.]*$") + regex = re.compile(r"^-?\.?([0-9]+(\.[0-9]+)?)[müMN\\.]*$") match = regex.match(input_string) if match: value = value_as_float(match.group(1)) @@ -191,7 +191,6 @@ def find_depth_columns( # that does not match the descriptions if not column.significant_arithmetic_progression() ] - return sorted( [column for column in numeric_columns if column and column.is_valid(all_words)], key=lambda column: len(column.entries), From 0f2fc22ebcb2aa45c5b5fa029f99dc6e3680ff97 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Fri, 31 May 2024 15:29:58 +0200 Subject: [PATCH 3/3] Add documentation and tests for 
find_depth_column. --- src/stratigraphy/util/find_depth_columns.py | 2 ++ tests/test_find_depth_columns.py | 15 +++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/src/stratigraphy/util/find_depth_columns.py b/src/stratigraphy/util/find_depth_columns.py index c5d18777..432cc449 100644 --- a/src/stratigraphy/util/find_depth_columns.py +++ b/src/stratigraphy/util/find_depth_columns.py @@ -26,6 +26,8 @@ def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> lis try: input_string = word.text.strip().replace(",", ".") regex = re.compile(r"^-?\.?([0-9]+(\.[0-9]+)?)[müMN\\.]*$") + # numbers such as '.40' are not supported. The reason is that sometimes the OCR + # recognizes a '-' as a '.' and we just omit the leading '.' to avoid this issue. match = regex.match(input_string) if match: value = value_as_float(match.group(1)) diff --git a/tests/test_find_depth_columns.py b/tests/test_find_depth_columns.py index 25535497..4ff107bc 100644 --- a/tests/test_find_depth_columns.py +++ b/tests/test_find_depth_columns.py @@ -35,6 +35,21 @@ def test_depth_column_entries_with_splits(): # noqa: D103 assert entries[3].value == 40.0, "The fourth entry should have a value of 40.0" +def test_depth_column_entries_with_leading_character(): # noqa: D103 + all_words = [ + TextWord(fitz.Rect(0, 0, 5, 1), "0.00m"), + TextWord(fitz.Rect(0, 2, 5, 3), ".2m"), # this is a test for an ocr error from '-2m' to '.2m' + TextWord(fitz.Rect(0, 4, 5, 5), "-3.0m"), + TextWord(fitz.Rect(0, 6, 5, 7), ".4.2m"), + ] + entries = depth_column_entries(all_words, include_splits=True) + assert len(entries) == 4, "There should be 4 entries" + assert entries[0].value == 0.0, "The first entry should have a value of 0" + assert entries[1].value == 2.0, "The second entry should have a value of 2.0" + assert entries[2].value == 3.0, "The third entry should have a value of 3.0" + assert entries[3].value == 4.2, "The fourth entry should have a value of 4.2" + + all_words_find_depth_column = 
[ TextWord(fitz.Rect(0, 0, 5, 1), "10.00m"), TextWord(fitz.Rect(20, 0, 30, 1), "Kies, Torf und Sand"),