Skip to content

Commit

Permalink
Merge pull request #52 from swisstopo/feat/improve_is_valid
Browse files Browse the repository at this point in the history
Feat/improve is valid
  • Loading branch information
redur authored May 31, 2024
2 parents f575b6d + 0f2fc22 commit 71bfee2
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 10 deletions.
4 changes: 2 additions & 2 deletions config/matching_params.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ block_line_ratio: 0.20
left_line_length_threshold: 7
img_template_probability_threshold: 0.62

depth_column_params:
depth_column_params: # these params should be optimized as soon as there is reliable evaluation data
noise_count_threshold: 1.25
noise_count_offset: 0
noise_count_offset: 2.5

material_description:
de:
Expand Down
1 change: 0 additions & 1 deletion src/stratigraphy/benchmark/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,6 @@ def evaluate_layer_extraction(predictions: dict, number_of_truth_values: dict) -
_metrics[metric_type] = value
else:
_metrics[f"{language}_{metric_type}"] = value
print(_metrics)
return _metrics, document_level_metrics


Expand Down
15 changes: 12 additions & 3 deletions src/stratigraphy/util/depthcolumn.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,10 @@ def is_arithmetic_progression(self) -> bool:
return False

scale_pearson_correlation_coef = np.corrcoef(entries, progression)[0, 1].item()
return abs(scale_pearson_correlation_coef) >= 0.999
if len(self.entries) < 6: # With fewer entries, it is more likely that they are very highly correlated by accident
return abs(scale_pearson_correlation_coef) >= 0.9999
else:
return abs(scale_pearson_correlation_coef) >= 0.999

def is_valid(self, all_words: list[TextLine]) -> bool:
"""Checks whether the depth column is valid.
Expand All @@ -309,8 +312,14 @@ def is_valid(self, all_words: list[TextLine]) -> bool:
if len(self.entries) < 3:
return False

# When too much other text is in the column, then it is probably not valid
if self.noise_count(all_words) > self.noise_count_threshold * len(self.entries) - self.noise_count_offset:
# When too much other text is in the column, then it is probably not valid.
# The quadratic behavior of the noise count check makes the check stricter for columns with few entries
# than for columns with more entries. The more entries we have, the less likely it is that we found them by chance.
# TODO: Once the evaluation data is of good enough quality, we should optimize the parameters below.
if (
self.noise_count(all_words)
> self.noise_count_threshold * (len(self.entries) - self.noise_count_offset) ** 2
):
return False

corr_coef = self.pearson_correlation_coef()
Expand Down
5 changes: 3 additions & 2 deletions src/stratigraphy/util/find_depth_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> lis
for word in sorted(all_words, key=lambda word: word.rect.y0):
try:
input_string = word.text.strip().replace(",", ".")
regex = re.compile(r"^-?([0-9]+(\.[0-9]+)?)[müMN\\.]*$")
regex = re.compile(r"^-?\.?([0-9]+(\.[0-9]+)?)[müMN\\.]*$")
# Numbers such as '.40' are not supported as decimals (they are read as 40). The reason is that the OCR
# sometimes recognizes a '-' as a '.', so we just omit the leading '.' to avoid this issue.
match = regex.match(input_string)
if match:
value = value_as_float(match.group(1))
Expand Down Expand Up @@ -191,7 +193,6 @@ def find_depth_columns(
# that does not match the descriptions
if not column.significant_arithmetic_progression()
]

return sorted(
[column for column in numeric_columns if column and column.is_valid(all_words)],
key=lambda column: len(column.entries),
Expand Down
4 changes: 2 additions & 2 deletions src/stratigraphy/util/language_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ def detect_language_of_document(doc: fitz.Document) -> str:
try:
language = detect(text)
except LangDetectException:
language = "de"
language = "de" # TODO: default language should be read from config

if language not in ["de", "fr"]:
if language not in ["de", "fr"]: # TODO: This should be read from the config
language = "de"
return language
15 changes: 15 additions & 0 deletions tests/test_find_depth_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,21 @@ def test_depth_column_entries_with_splits(): # noqa: D103
assert entries[3].value == 40.0, "The fourth entry should have a value of 40.0"


def test_depth_column_entries_with_leading_character():
    """Check that depth entries are parsed when prefixed by '-' or by a stray '.'.

    The '.2m' word simulates an OCR error where '-2m' was recognized as '.2m'.
    """
    raw_texts = ["0.00m", ".2m", "-3.0m", ".4.2m"]
    # Stack the words vertically: word number `row` spans y from 2 * row to 2 * row + 1.
    all_words = [TextWord(fitz.Rect(0, 2 * row, 5, 2 * row + 1), text) for row, text in enumerate(raw_texts)]

    entries = depth_column_entries(all_words, include_splits=True)

    assert len(entries) == 4, "There should be 4 entries"

    expectations = [
        (0.0, "The first entry should have a value of 0"),
        (2.0, "The second entry should have a value of 2.0"),
        (3.0, "The third entry should have a value of 3.0"),
        (4.2, "The fourth entry should have a value of 4.2"),
    ]
    for entry, (expected_value, message) in zip(entries, expectations):
        assert entry.value == expected_value, message


all_words_find_depth_column = [
TextWord(fitz.Rect(0, 0, 5, 1), "10.00m"),
TextWord(fitz.Rect(20, 0, 30, 1), "Kies, Torf und Sand"),
Expand Down

1 comment on commit 71bfee2

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
FileStmtsMissCoverMissing
src/stratigraphy
   __init__.py8188%11
   extract.py2102100%3–506
   get_files.py21210%3–48
   line_detection.py26260%3–76
   main.py91910%3–232
src/stratigraphy/util
   coordinate_extraction.py1162083%25, 45, 49, 53, 57–65, 86, 171, 191, 280, 283–284, 288, 300
   dataclasses.py32391%37–39
   depthcolumn.py2086768%26, 30, 51, 57, 60–61, 85, 88, 95, 102, 110–111, 121, 138–154, 199, 238, 254–262, 274, 279, 286, 313, 323, 352, 373, 376–387, 402–403, 448–490
   depthcolumnentry.py20480%12, 15, 27, 34
   description_block_splitter.py70297%24, 139
   draw.py73730%3–225
   duplicate_detection.py51510%3–146
   find_depth_columns.py89693%41–42, 70, 82, 175–176
   find_description.py632856%27–35, 50–63, 79–95, 172–175
   geometric_line_utilities.py86298%82, 132
   interval.py1075251%25–28, 32–35, 40, 45, 48, 100–146, 167, 172–188
   language_detection.py18180%3–43
   layer_identifier_column.py91910%3–227
   line.py492647%25, 42, 51, 65–95, 98
   linesquadtree.py46198%76
   plot_utils.py43430%3–120
   predictions.py1861860%3–386
   textblock.py74889%27, 51, 63, 75, 98, 119, 127, 155
   util.py402245%15–18, 22, 26, 40–47, 61–63, 87–88, 100–105
TOTAL1818105242% 

Tests Skipped Failures Errors Time
58 0 💤 0 ❌ 0 🔥 0.608s ⏱️

Please sign in to comment.