Skip to content

Commit

Permalink
Merge pull request #46 from swisstopo/feat/extend_geneva_layouts
Browse files Browse the repository at this point in the history
Feat/extend geneva layouts
  • Loading branch information
redur authored May 23, 2024
2 parents e156378 + 9bae4a1 commit d26fbf7
Show file tree
Hide file tree
Showing 3 changed files with 189 additions and 117 deletions.
141 changes: 63 additions & 78 deletions src/stratigraphy/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

import fitz

from stratigraphy import DATAPATH
from stratigraphy.util import find_depth_columns
from stratigraphy.util.dataclasses import Line
from stratigraphy.util.depthcolumn import DepthColumn
Expand All @@ -16,6 +15,7 @@
)
from stratigraphy.util.interval import BoundaryInterval, Interval
from stratigraphy.util.layer_identifier_column import (
LayerIdentifierColumn,
find_layer_identifier_column,
find_layer_identifier_column_entries,
)
Expand Down Expand Up @@ -71,86 +71,55 @@ def process_page(page: fitz.Page, geometric_lines, language: str, **params: dict
lines.append(TextLine(current_line_words))
current_line_words = []

depth_column_entries = find_depth_columns.depth_column_entries(words, include_splits=True)
layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words)

used_entry_rects = []
for column in layer_depth_columns:
for entry in column.entries:
used_entry_rects.extend([entry.start.rect, entry.end.rect])

depth_column_entries = [
entry
for entry in find_depth_columns.depth_column_entries(words, include_splits=False)
if entry.rect not in used_entry_rects
]
depth_columns: list[DepthColumn] = layer_depth_columns
depth_columns.extend(
find_depth_columns.find_depth_columns(
depth_column_entries, words, depth_column_params=params["depth_column_params"]
)
)

# Detect Layer Index Columns
layer_identifier_entries = find_layer_identifier_column_entries(words)
layer_identifier_entries = find_layer_identifier_column_entries(lines)
layer_identifier_columns = (
find_layer_identifier_column(layer_identifier_entries) if layer_identifier_entries else []
)
pairs = []
if layer_identifier_columns:
layer_identifier_pairs = []
for layer_identifier_column in layer_identifier_columns:
material_description_rect = find_material_description_column(
lines, layer_identifier_column, language, **params["material_description"]
)
if material_description_rect:
layer_identifier_pairs.append((layer_identifier_column, material_description_rect))

# Obtain the best pair. In contrast to depth columns, there only ever is one layer index column per page.
if layer_identifier_pairs:
layer_identifier_pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1]))
layer_identifier_column, material_description_rect = layer_identifier_pairs[-1]
# split the material description rect into blocks.
description_lines = get_description_lines(lines, material_description_rect)
blocks = get_description_blocks_from_layer_identifier(layer_identifier_column.entries, description_lines)

predictions = [{"material_description": block.to_json()} for block in blocks]
predictions = parse_and_remove_empty_predictions(predictions)

json_filtered_pairs = [
{
"depth_column": None,
"material_description_rect": [
material_description_rect.x0,
material_description_rect.y0,
material_description_rect.x1,
material_description_rect.y1,
],
}
]

# Visualization: To be dropped before merging to main.
for layer_identifier_column in layer_identifier_columns:
fitz.utils.draw_rect(
page, layer_identifier_column.rect() * page.derotation_matrix, color=fitz.utils.getColor("blue")
)
for block in blocks:
fitz.utils.draw_rect(page, block.rect * page.derotation_matrix, color=fitz.utils.getColor("red"))
fitz.utils.draw_rect(
page, material_description_rect * page.derotation_matrix, color=fitz.utils.getColor("blue")
pairs.append((layer_identifier_column, material_description_rect))

# Obtain the best pair. In contrast to depth columns, there only ever is one layer index column per page.
if pairs:
pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1]))

# If there is a layer identifier column, then we use this directly.
# Else, we search for depth columns. We could also think of some scoring mechanism to decide which one to use.
if not pairs:
depth_column_entries = find_depth_columns.depth_column_entries(words, include_splits=True)
layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words)

used_entry_rects = []
for column in layer_depth_columns:
for entry in column.entries:
used_entry_rects.extend([entry.start.rect, entry.end.rect])

depth_column_entries = [
entry
for entry in find_depth_columns.depth_column_entries(words, include_splits=False)
if entry.rect not in used_entry_rects
]
depth_columns: list[DepthColumn] = layer_depth_columns
depth_columns.extend(
find_depth_columns.find_depth_columns(
depth_column_entries, words, depth_column_params=params["depth_column_params"]
)
page.parent.save(DATAPATH / "_temp" / "output.pdf", garbage=4, deflate=True, clean=True)

return predictions, json_filtered_pairs

pairs = []
for depth_column in depth_columns:
material_description_rect = find_material_description_column(
lines, depth_column, language, **params["material_description"]
)
if material_description_rect:
pairs.append((depth_column, material_description_rect))
# lowest score first
pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1], words))

for depth_column in depth_columns:
material_description_rect = find_material_description_column(
lines, depth_column, language, **params["material_description"]
)
if material_description_rect:
pairs.append((depth_column, material_description_rect))
# lowest score first
pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1], words))

to_delete = []
for i, (_depth_column, material_description_rect) in enumerate(pairs):
Expand Down Expand Up @@ -257,7 +226,7 @@ def score_column_match(


def match_columns(
depth_column: DepthColumn,
depth_column: DepthColumn | LayerIdentifierColumn,
description_lines: list[TextLine],
geometric_lines: list[Line],
material_description_rect: fitz.Rect,
Expand All @@ -266,10 +235,11 @@ def match_columns(
"""Match the depth column entries with the description lines.
This function identifies groups of depth intervals and text blocks that are likely to match.
In this process, the number of text blocks is adjusted to match the number of depth intervals.
Makes a distinction between DepthColumn and LayerIdentifierColumn and obtains the corresponding text blocks
as well as their depth intervals where present.
Args:
depth_column (DepthColumn): The depth column.
depth_column (DepthColumn | LayerIdentifierColumn): The depth column.
description_lines (list[TextLine]): The description lines.
geometric_lines (list[Line]): The geometric lines.
material_description_rect (fitz.Rect): The material description rectangle.
Expand All @@ -278,13 +248,28 @@ def match_columns(
Returns:
list: The matched depth intervals and text blocks.
"""
return [
element
for group in depth_column.identify_groups(
description_lines, geometric_lines, material_description_rect, **params
if isinstance(depth_column, DepthColumn):
return [
element
for group in depth_column.identify_groups(
description_lines, geometric_lines, material_description_rect, **params
)
for element in transform_groups(group["depth_intervals"], group["blocks"], **params)
]
elif isinstance(depth_column, LayerIdentifierColumn):
blocks = get_description_blocks_from_layer_identifier(depth_column.entries, description_lines)
groups = []
for block in blocks:
depth_interval = depth_column.get_depth_interval(block)
if depth_interval:
groups.append({"depth_interval": depth_interval, "block": block})
else:
groups.append({"block": block})
return groups
else:
raise ValueError(
f"depth_column must be a DepthColumn or a LayerIdentifierColumn object. Got {type(depth_column)}."
)
for element in transform_groups(group["depth_intervals"], group["blocks"], **params)
]


def transform_groups(
Expand Down
64 changes: 43 additions & 21 deletions src/stratigraphy/util/find_depth_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,6 @@ def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> lis
Returns:
list[DepthColumnEntry]: The extracted depth column entries.
"""

def value_as_float(string_value: str) -> float: # noqa: D103
# OCR sometimes tends to miss the decimal comma
parsed_text = re.sub(r"^-?([0-9]+)([0-9]{2})", r"\1.\2", string_value)
return abs(float(parsed_text))

entries = []
for word in sorted(all_words, key=lambda word: word.rect.y0):
try:
Expand All @@ -38,26 +32,54 @@ def value_as_float(string_value: str) -> float: # noqa: D103
entries.append(DepthColumnEntry(word.rect, value))
elif include_splits:
# support for e.g. "1.10-1.60m" extracted as a single word
regex2 = re.compile(r"^-?([0-9]+(\.[0-9]+)?)[müMN\\.]*\W+([0-9]+(\.[0-9]+)?)[müMN\\.]*$")
match2 = regex2.match(input_string)

if match2:
value1 = value_as_float(match2.group(1))
first_half_rect = fitz.Rect(
word.rect.x0, word.rect.y0, word.rect.x1 - word.rect.width / 2, word.rect.y1
)
entries.append(DepthColumnEntry(first_half_rect, value1))

value2 = value_as_float(match2.group(3))
second_half_rect = fitz.Rect(
word.rect.x0 + word.rect.width / 2, word.rect.y0, word.rect.x1, word.rect.y1
)
entries.append(DepthColumnEntry(second_half_rect, value2))
layer_depth_column_entry = extract_layer_depth_interval(input_string, word.rect)
entries.extend(
[layer_depth_column_entry.start, layer_depth_column_entry.end] if layer_depth_column_entry else []
)
except ValueError:
pass
return entries


def value_as_float(string_value: str) -> float:
    """Convert a depth string to a non-negative float.

    OCR sometimes misses the decimal comma. When the string contains no decimal
    point and starts with three or more digits, a point is therefore inserted
    before the last two digits of that leading digit run (e.g. "110" -> 1.10).
    Strings that already contain a decimal point are parsed as they are, so a
    value such as "100.50" is read as 100.5 instead of being rewritten to the
    unparsable "1.00.50".

    Args:
        string_value (str): The text to convert, using "." as decimal separator.

    Returns:
        float: The absolute value of the parsed number.

    Raises:
        ValueError: If the (possibly repaired) text is not a valid float.
    """
    parsed_text = string_value
    if "." not in parsed_text:
        # No decimal separator present: assume it was lost and restore it.
        # A leading minus sign is consumed by the match and dropped, which is
        # harmless because the absolute value is returned anyway.
        parsed_text = re.sub(r"^-?([0-9]+)([0-9]{2})", r"\1.\2", parsed_text)
    return abs(float(parsed_text))


def extract_layer_depth_interval(
    text: str, rect: fitz.Rect, require_start_of_string: bool = True
) -> LayerDepthColumnEntry | None:
    """Extract a depth interval such as "1.10-1.60m" from a piece of text.

    The text is stripped and decimal commas are replaced by points before the
    pattern is applied. On a match, the rectangle of the text is split at its
    horizontal midpoint: the first half is attached to the start value and the
    second half to the end value.

    Args:
        text (str): The string to extract the depth interval from.
        rect (fitz.Rect): The rectangle of the text.
        require_start_of_string (bool, optional): Whether the interval has to begin
            at the start of the string. If False, any prefix that does not contain
            a line break may precede it. Defaults to True.

    Returns:
        LayerDepthColumnEntry | None: The extracted entry, or None if the text does
            not contain a depth interval.

    Raises:
        ValueError: Propagated from value_as_float if a matched number cannot be
            converted.
    """
    # NOTE(review): in the first character class the "\]" escapes the bracket, so
    # the class only ends at the "]" after "\s-". The separator is thus "one or more
    # of m, ü, M, N, ], *, [, whitespace, -" rather than "optional unit, then
    # whitespace/hyphen". Confirm whether this is intended before changing it:
    # inputs like "1.10 m - 1.60 m" currently match only because of this.
    interval_pattern = r"-?([0-9]+(\.[0-9]+)?)[müMN\]*[\s-]+([0-9]+(\.[0-9]+)?)[müMN\\.]*"
    prefix = "" if require_start_of_string else r".*?"

    normalized = text.strip().replace(",", ".")
    found = re.match(prefix + interval_pattern, normalized)
    if found is None:
        return None

    start_value = value_as_float(found.group(1))
    start_rect = fitz.Rect(rect.x0, rect.y0, rect.x1 - rect.width / 2, rect.y1)
    start_entry = DepthColumnEntry(start_rect, start_value)

    end_value = value_as_float(found.group(3))
    end_rect = fitz.Rect(rect.x0 + rect.width / 2, rect.y0, rect.x1, rect.y1)
    end_entry = DepthColumnEntry(end_rect, end_value)

    return LayerDepthColumnEntry(start_entry, end_entry)


def find_layer_depth_columns(entries: list[DepthColumnEntry], all_words: list[TextWord]) -> list[LayerDepthColumn]:
"""Finds all layer depth columns.
Expand Down
Loading

1 comment on commit d26fbf7

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
| File | Stmts | Miss | Cover | Missing |
| --- | --- | --- | --- | --- |
| **src/stratigraphy** | | | | |
| `__init__.py` | 8 | 1 | 88% | 11 |
| `extract.py` | 211 | 211 | 0% | 3–507 |
| `get_files.py` | 21 | 21 | 0% | 3–48 |
| `line_detection.py` | 26 | 26 | 0% | 3–76 |
| `main.py` | 91 | 91 | 0% | 3–232 |
| **src/stratigraphy/util** | | | | |
| `coordinate_extraction.py` | 128 | 31 | 76% | 30, 50, 54, 58–66, 143, 163, 235–241, 250–252, 268–282 |
| `dataclasses.py` | 32 | 3 | 91% | 37–39 |
| `depthcolumn.py` | 206 | 67 | 67% | 26, 30, 51, 57, 60–61, 85, 88, 95, 102, 110–111, 121, 138–154, 199, 238, 254–262, 274, 279, 286, 310, 314, 343, 364, 367–378, 393–394, 439–481 |
| `depthcolumnentry.py` | 20 | 4 | 80% | 12, 15, 27, 34 |
| `description_block_splitter.py` | 70 | 2 | 97% | 24, 139 |
| `draw.py` | 73 | 73 | 0% | 3–225 |
| `duplicate_detection.py` | 32 | 32 | 0% | 3–81 |
| `find_depth_columns.py` | 89 | 6 | 93% | 39–40, 68, 80, 173–174 |
| `find_description.py` | 63 | 28 | 56% | 27–35, 50–63, 79–95, 172–175 |
| `geometric_line_utilities.py` | 87 | 2 | 98% | 83, 133 |
| `interval.py` | 107 | 52 | 51% | 25–28, 32–35, 40, 45, 48, 100–146, 167, 172–188 |
| `language_detection.py` | 18 | 18 | 0% | 3–43 |
| `layer_identifier_column.py` | 91 | 91 | 0% | 3–227 |
| `line.py` | 49 | 26 | 47% | 25, 42, 51, 65–95, 98 |
| `linesquadtree.py` | 46 | 1 | 98% | 76 |
| `plot_utils.py` | 44 | 44 | 0% | 3–121 |
| `predictions.py` | 187 | 187 | 0% | 3–385 |
| `textblock.py` | 74 | 8 | 89% | 27, 51, 63, 75, 98, 119, 127, 155 |
| `util.py` | 40 | 22 | 45% | 15–18, 22, 26, 40–47, 61–63, 87–88, 100–105 |
| **TOTAL** | 1813 | 1047 | 42% | |

Tests Skipped Failures Errors Time
58 0 💤 0 ❌ 0 🔥 0.615s ⏱️

Please sign in to comment.