Merge pull request #66 from swisstopo/feat/correct_.NN
Improve depth entry recognition for cases such as '.80'.
redur authored Jun 25, 2024
2 parents b96efa8 + 5407194 commit f686692
Showing 2 changed files with 47 additions and 28 deletions.
70 changes: 43 additions & 27 deletions src/stratigraphy/util/boundarydepthcolumnvalidator.py
@@ -56,7 +56,7 @@ def is_valid(self, column: BoundaryDepthColumn, corr_coef_threshold: float = 0.9
         ):
             return False
         # Check if the entries are strictly increasing.
-        if not all(i.value < j.value for i, j in zip(column.entries, column.entries[1:], strict=False)):
+        if not column.is_strictly_increasing():
             return False
 
         corr_coef = column.pearson_correlation_coef()
@@ -89,7 +89,14 @@ def correct_OCR_mistakes(self, column: BoundaryDepthColumn) -> BoundaryDepthColu
         Loops through all values and corrects common OCR mistakes for the given entry. Then, the column with the
         highest pearson correlation coefficient is selected and checked for validity.
-        This is useful if one entry has an OCR mistake, and the column is not valid because of it.
+        This is useful if one or more entries have an OCR mistake, and the column is not valid because of it.
+        Currently, there is no limit on the number of corrections per depth column. Indeed, there are examples of depth
+        columns with multiple OCR errors on different depth values. On the other hand, allowing an unlimited number of
+        corrections increases the risk, that a random column of different values is incorrectly accepted as a depth
+        column after making the corrections, especially if the column has a low number of entries. A more robust
+        solution might be to allow corrections on less than 50% of all entries, or something similar. However, we
+        currently don't have enough examples to properly tune this parameter.
         Note: Common mistakes should be extended as needed.
@@ -99,34 +106,43 @@ def correct_OCR_mistakes(self, column: BoundaryDepthColumn) -> BoundaryDepthColu
         Returns:
             BoundaryDepthColumn | None: The corrected depth column, or None if no correction was possible.
         """
-        new_columns = []
-        for remove_index in range(len(column.entries)):
-            new_columns.append(
-                BoundaryDepthColumn(
-                    [
-                        entry if index != remove_index else _correct_entry(entry)
-                        for index, entry in enumerate(column.entries)
-                    ],
-                ),
-            )
-        best_column = max(new_columns, key=lambda column: column.pearson_correlation_coef())
-
-        # We require a higher correlation coefficient when we've already corrected a mistake.
-        if self.is_valid(best_column, corr_coef_threshold=0.999):
-            return best_column
-        else:
-            return None
-
-
-def _correct_entry(entry: DepthColumnEntry) -> DepthColumnEntry:
+        new_columns = [BoundaryDepthColumn()]
+        for entry in column.entries:
+            new_columns = [
+                BoundaryDepthColumn([*column.entries, DepthColumnEntry(entry.rect, new_value)])
+                for column in new_columns
+                for new_value in _value_alternatives(entry.value)
+            ]
+            # Immediately require strictly increasing values, to avoid exponential complexity when many implausible
+            # alternative values are suggested
+            new_columns = [column for column in new_columns if column.is_strictly_increasing()]
+
+        if len(new_columns):
+            best_column = max(new_columns, key=lambda column: column.pearson_correlation_coef())
+
+            # We require a higher correlation coefficient when we've already corrected a mistake.
+            if self.is_valid(best_column, corr_coef_threshold=0.999):
+                return best_column
+
+        return None
+
+
+def _value_alternatives(value: float) -> set[float]:
     """Corrects frequent OCR errors in depth column entries.
     Args:
-        entry (DepthColumnEntry): The depth column entry to correct.
+        value (float): The depth values to find plausible alternatives for
     Returns:
-        DepthColumnEntry: The corrected depth column entry.
+        set(float): all plausible values (including the original one)
     """
-    text_value = str(entry.value)
-    text_value = text_value.replace("4", "1")  # In older documents, OCR sometimes mistakes 1 for 4
-    return DepthColumnEntry(entry.rect, float(text_value))
+    alternatives = {value}
+    # In older documents, OCR sometimes mistakes 1 for 4
+    alternatives.add(float(str(value).replace("4", "1")))
+
+    # replace a pattern such as '.80' with '0.80'. These cases are already converted
+    # to '80.0' when depth entries are recognized.
+    if value.is_integer():
+        alternatives.add(value / 100)
+
+    return alternatives
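The candidate values produced by the new _value_alternatives logic can be seen with a small standalone sketch (illustration only, not part of the commit; the helper is copied here under a hypothetical name so the snippet runs without the repository):

def value_alternatives(value: float) -> set[float]:
    """Mirror of the new _value_alternatives helper, for illustration only."""
    alternatives = {value}
    # OCR in older documents sometimes reads '1' as '4'; try the reverse substitution.
    alternatives.add(float(str(value).replace("4", "1")))
    # An entry such as '.80' is recognized as 80.0; dividing by 100 restores the intended depth 0.8.
    if value.is_integer():
        alternatives.add(value / 100)
    return alternatives

print(sorted(value_alternatives(80.0)))  # [0.8, 80.0]         <- the '.80' case from the commit title
print(sorted(value_alternatives(14.0)))  # [0.14, 11.0, 14.0]  <- a '4' may really be a '1'
print(sorted(value_alternatives(2.4)))   # [2.1, 2.4]          <- non-integers only get the 4->1 variant

correct_OCR_mistakes then combines these per-entry alternatives into candidate columns, prunes any candidate that is not strictly increasing after each entry, and keeps the best-correlating candidate only if it passes validation at the stricter 0.999 threshold.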
5 changes: 4 additions & 1 deletion src/stratigraphy/util/depthcolumn.py
@@ -229,11 +229,14 @@ def valid_initial_segment(self, rect: fitz.Rect) -> BoundaryDepthColumn:
             return initial_segment
         return BoundaryDepthColumn()
 
-    def strictly_contains(self, other: BoundaryDepthColumn):
+    def strictly_contains(self, other: BoundaryDepthColumn) -> bool:
         return len(other.entries) < len(self.entries) and all(
             other_entry in self.entries for other_entry in other.entries
         )
 
+    def is_strictly_increasing(self) -> bool:
+        return all(i.value < j.value for i, j in zip(self.entries, self.entries[1:], strict=False))
+
     def depth_intervals(self) -> list[BoundaryInterval]:
         """Creates a list of depth intervals from the depth column entries.
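For reference, the pairwise comparison behind the new is_strictly_increasing method, sketched with plain floats in place of DepthColumnEntry objects (illustration only, not from the repository):

def is_strictly_increasing(values: list[float]) -> bool:
    # Each value must be smaller than its direct successor; zip pairs (v[0], v[1]), (v[1], v[2]), ...
    return all(a < b for a, b in zip(values, values[1:]))

print(is_strictly_increasing([0.8, 1.2, 2.5]))   # True
print(is_strictly_increasing([0.8, 80.0, 2.5]))  # False: a '.80' misread as 80.0 breaks the ordering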

1 comment on commit f686692

@github-actions (bot) commented:

Coverage Report
File                               Stmts   Miss   Cover   Missing
src/stratigraphy
  __init__.py                          8      1     88%   11
  extract.py                         188    188      0%   3–482
  get_files.py                        21     21      0%   3–48
  line_detection.py                   26     26      0%   3–76
  main.py                             94     94      0%   3–237
src/stratigraphy/util
  boundarydepthcolumnvalidator.py     41     20     51%   47, 57, 60, 81–84, 109–127, 139–148
  coordinate_extraction.py           127      7     94%   31, 62, 75–76, 80, 205, 328
  dataclasses.py                      32      3     91%   37–39
  depthcolumn.py                     194     64     67%   26, 30, 51, 57, 60–61, 85, 88, 95, 102, 110–111, 121, 138–154, 192, 229, 248–256, 267, 272, 279, 310, 315–322, 337–338, 381–423
  depthcolumnentry.py                 26      7     73%   12, 15, 29–30, 33, 45, 52
  description_block_splitter.py       70      2     97%   24, 139
  draw.py                             80     80      0%   3–244
  duplicate_detection.py              51     51      0%   3–146
  extract_text.py                     27      2     93%   38–39
  find_depth_columns.py               91      6     93%   42–43, 71, 83, 176–177
  find_description.py                 63     28     56%   27–35, 50–63, 79–95, 172–175
  geometric_line_utilities.py         86      2     98%   82, 132
  interval.py                        106     55     48%   24–27, 31–34, 39, 44, 47, 57–59, 99–145, 166, 171–187
  language_detection.py               18     18      0%   3–45
  layer_identifier_column.py          91     91      0%   3–227
  line.py                             49      4     92%   25, 42, 51, 98
  linesquadtree.py                    46      1     98%   76
  plot_utils.py                       43     43      0%   3–120
  predictions.py                     187    187      0%   3–387
  textblock.py                        74      8     89%   27, 51, 63, 75, 98, 119, 127, 155
  util.py                             40     18     55%   22, 40–47, 61–63, 87–88, 100–105
TOTAL                               1879   1027     45%

Tests: 61   Skipped: 0 💤   Failures: 0 ❌   Errors: 0 🔥   Time: 0.925s ⏱️
