Skip to content

Commit

Permalink
Minor improvements.
Browse files Browse the repository at this point in the history
  • Loading branch information
redur committed Jun 20, 2024
1 parent a8e3002 commit f45c853
Showing 1 changed file with 40 additions and 23 deletions.
63 changes: 40 additions & 23 deletions src/stratigraphy/util/depthcolumn.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ def depth_intervals(self) -> list[BoundaryInterval]:
return depth_intervals

def significant_arithmetic_progression(self) -> bool:
if len(self.entries) < 6:
if len(self.entries) < 7:
return self.is_arithmetic_progression()
else:
# to allow for OCR errors or gaps in the progression, we only require a segment of length 7 that is an
Expand Down Expand Up @@ -322,10 +322,7 @@ def is_valid(self, all_words: list[TextLine]) -> bool:
):
return False
# Check if the entries are strictly increasing.
if (
not all(i.value < j.value for i, j in zip(self.entries, self.entries[1:], strict=False))
and len(self.entries) < 15
):
if not all(i.value < j.value for i, j in zip(self.entries, self.entries[1:], strict=False)):
return False

corr_coef = self.pearson_correlation_coef()
Expand Down Expand Up @@ -379,31 +376,43 @@ def reduce_until_valid(self, all_words: list[TextLine]) -> BoundaryDepthColumn:
while current:
if current.is_valid(all_words):
return current
elif current.correct_typos(all_words) is not None:
return current.correct_typos(all_words)
elif current.correct_OCR_mistakes(all_words) is not None:
return current.correct_OCR_mistakes(all_words)
else:
current = current.remove_entry_by_correlation_gradient()

def correct_typos(self, all_words) -> BoundaryDepthColumn | None:
def correct_OCR_mistakes(self, all_words: list[TextLine]) -> BoundaryDepthColumn | None:
"""Corrects OCR mistakes in the depth column entries.
Loops through all values and corrects common OCR mistakes for the given entry. Then, the column with the
highest Pearson correlation coefficient is selected and checked for validity.
This is useful if one entry has an OCR mistake, and the column is not valid because of it.
Note: Common mistakes should be extended as needed.
Args:
all_words (list[TextLine]): A list of all text lines on the page.
Returns:
BoundaryDepthColumn | None: The corrected depth column, or None if no correction was possible.
"""
new_columns = []
for remove_index, remove_entry in enumerate(self.entries):
for remove_index in range(len(self.entries)):
new_columns.append(
(
remove_entry,
BoundaryDepthColumn(
self.noise_count_threshold,
self.noise_count_offset,
[
entry if index != remove_index else _correct_entry(entry)
for index, entry in enumerate(self.entries)
],
),
)
BoundaryDepthColumn(
self.noise_count_threshold,
self.noise_count_offset,
[
entry if index != remove_index else _correct_entry(entry)
for index, entry in enumerate(self.entries)
],
),
)
_removed_entry, best_columns = max(new_columns, key=lambda column: column[1].pearson_correlation_coef())
best_column = max(new_columns, key=lambda column: column.pearson_correlation_coef())

if best_columns.is_valid(all_words):
return best_columns
if best_column.is_valid(all_words):
return best_column
else:
return None

Expand Down Expand Up @@ -526,6 +535,14 @@ def identify_groups(


def _correct_entry(entry: DepthColumnEntry) -> DepthColumnEntry:
    """Build a copy of a depth column entry with a frequent OCR misreading undone.

    The entry's value is rendered as text, every "4" in that text is swapped for a "1", and the result is
    parsed back into a float. The entry passed in is left untouched.

    Args:
        entry (DepthColumnEntry): The depth column entry whose value should be corrected.

    Returns:
        DepthColumnEntry: A new entry with the same rect and the corrected value.
    """
    # In older documents, OCR sometimes mistakes 1 for 4
    four_to_one = str.maketrans("4", "1")
    corrected_text = str(entry.value).translate(four_to_one)
    return DepthColumnEntry(entry.rect, float(corrected_text))

0 comments on commit f45c853

Please sign in to comment.