Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/improve is valid #52

Merged
merged 3 commits into from
May 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions config/matching_params.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ block_line_ratio: 0.20
left_line_length_threshold: 7
img_template_probability_threshold: 0.62

depth_column_params:
depth_column_params: # these params should be optimized as soon as there is reliable evaluation data
noise_count_threshold: 1.25
noise_count_offset: 0
noise_count_offset: 2.5

material_description:
de:
Expand Down
1 change: 0 additions & 1 deletion src/stratigraphy/benchmark/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,6 @@ def evaluate_layer_extraction(predictions: dict, number_of_truth_values: dict) -
_metrics[metric_type] = value
else:
_metrics[f"{language}_{metric_type}"] = value
print(_metrics)
return _metrics, document_level_metrics


Expand Down
15 changes: 12 additions & 3 deletions src/stratigraphy/util/depthcolumn.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,10 @@ def is_arithmetic_progression(self) -> bool:
return False

scale_pearson_correlation_coef = np.corrcoef(entries, progression)[0, 1].item()
return abs(scale_pearson_correlation_coef) >= 0.999
if len(self.entries) < 6: # It is more likely that fewer entries are accidently very much correlated
return abs(scale_pearson_correlation_coef) >= 0.9999
else:
return abs(scale_pearson_correlation_coef) >= 0.999

def is_valid(self, all_words: list[TextLine]) -> bool:
"""Checks whether the depth column is valid.
Expand All @@ -309,8 +312,14 @@ def is_valid(self, all_words: list[TextLine]) -> bool:
if len(self.entries) < 3:
return False

# When too much other text is in the column, then it is probably not valid
if self.noise_count(all_words) > self.noise_count_threshold * len(self.entries) - self.noise_count_offset:
# When too much other text is in the column, then it is probably not valid.
# The quadratic behavior of the noise count check makes the check stricter for columns with few entries
# than columns with more entries. The more entries we have, the less likely it is that we found them by chance.
# TODO: Once evaluation data is of good enough quality, we should optimize for the parameter below.
if (
self.noise_count(all_words)
> self.noise_count_threshold * (len(self.entries) - self.noise_count_offset) ** 2
):
return False

corr_coef = self.pearson_correlation_coef()
Expand Down
5 changes: 3 additions & 2 deletions src/stratigraphy/util/find_depth_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> lis
for word in sorted(all_words, key=lambda word: word.rect.y0):
try:
input_string = word.text.strip().replace(",", ".")
regex = re.compile(r"^-?([0-9]+(\.[0-9]+)?)[müMN\\.]*$")
regex = re.compile(r"^-?\.?([0-9]+(\.[0-9]+)?)[müMN\\.]*$")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We now support numbers such as .40 that sometimes occur.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The input .40 is now extracted with value 40, not as 0.40. Is that really what we want?

I would also suggest adding this as a test case in test_find_depth_columns.py.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I checked again, and actually it's rather that sometimes a '-' is recognized as a '.' in older borehole profiles that have this "handwritten style". Then the behavior is totally desired.

But I also found an occurrence of '.80'. See here: A531.pdf

For our dataset, it is for now better to use the current behaviour.

# Numbers such as '.40' are not read as 0.40. The reason is that sometimes the OCR
# recognizes a '-' as a '.', so we just omit the leading '.' (reading '.40' as 40) to avoid this issue.
match = regex.match(input_string)
if match:
value = value_as_float(match.group(1))
Expand Down Expand Up @@ -191,7 +193,6 @@ def find_depth_columns(
# that does not match the descriptions
if not column.significant_arithmetic_progression()
]

return sorted(
[column for column in numeric_columns if column and column.is_valid(all_words)],
key=lambda column: len(column.entries),
Expand Down
4 changes: 2 additions & 2 deletions src/stratigraphy/util/language_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ def detect_language_of_document(doc: fitz.Document) -> str:
try:
language = detect(text)
except LangDetectException:
language = "de"
language = "de" # TODO: default language should be read from config
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is bothering me for the moment. Right now you need to adjust the code to extend the extraction to other languages. This should be doable from the config files. I believe there are other places where the language is hard-coded in the form of keywords (e.g. coordinate extraction).

I will open an issue for it.


if language not in ["de", "fr"]:
if language not in ["de", "fr"]: # TODO: This should be read from the config
language = "de"
return language
15 changes: 15 additions & 0 deletions tests/test_find_depth_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,21 @@ def test_depth_column_entries_with_splits(): # noqa: D103
assert entries[3].value == 40.0, "The fourth entry should have a value of 40.0"


def test_depth_column_entries_with_leading_character():  # noqa: D103
    # Four words stacked from top to bottom. The second and fourth carry a
    # leading '.', which the extraction is expected to drop before parsing.
    ocr_words = [
        TextWord(fitz.Rect(0, 0, 5, 1), "0.00m"),
        TextWord(fitz.Rect(0, 2, 5, 3), ".2m"),  # this is a test for an ocr error from '-2m' to '.2m'
        TextWord(fitz.Rect(0, 4, 5, 5), "-3.0m"),
        TextWord(fitz.Rect(0, 6, 5, 7), ".4.2m"),
    ]

    extracted = depth_column_entries(ocr_words, include_splits=True)

    # Check the count first so that the unpacking below cannot fail on length.
    assert len(extracted) == 4, "There should be 4 entries"
    first, second, third, fourth = extracted
    assert first.value == 0.0, "The first entry should have a value of 0"
    assert second.value == 2.0, "The second entry should have a value of 2.0"
    assert third.value == 3.0, "The third entry should have a value of 3.0"
    assert fourth.value == 4.2, "The fourth entry should have a value of 4.2"


all_words_find_depth_column = [
TextWord(fitz.Rect(0, 0, 5, 1), "10.00m"),
TextWord(fitz.Rect(20, 0, 30, 1), "Kies, Torf und Sand"),
Expand Down
Loading