Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: Material Block recognition for Geneva Layout #45

Merged
merged 4 commits into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 18 additions & 5 deletions config/matching_params.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,31 +46,44 @@ material_description:
fr:
including_expressions:
- sol
- végétal
- végétal # remove accents generally; ocr might be wrong
- dallage
- terre
- bitume
- bitumineux
- grave d'infrastructure
- grave d'infrastructure # what happens if we remove this?
- sable
- limon
- gravier
- asphalte
- humus
- humus # hummus maybe?
- brun
- gris
- grise
- mou
- dur
- dure
- ferme
- racine
- revetement
- pierre
- beige
- beton
- craie
- marne
- materiau de base
- materiau
- matrice sableuse
- enrobé
- enrobé # accent --> check what happens if it's removed
- terrain
- remblais
- remblai
- molasse
- phase
- formations
- limoneuse
- argileuse
- argileux
- mousse
excluding_expressions:
- monsieur
- fin
Comment on lines 46 to 89
Copy link
Contributor Author

@redur redur May 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I checked our expressions with a native French speaker who also has experience with French-language NLP. We looked at borehole profiles together and extended the list. He also suggested removing all accents, because OCR frequently makes errors there.

Those are the reasons for the changes and comments here.

Expand Down
68 changes: 64 additions & 4 deletions src/stratigraphy/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,20 @@

import fitz

from stratigraphy import DATAPATH
from stratigraphy.util import find_depth_columns
from stratigraphy.util.dataclasses import Line
from stratigraphy.util.depthcolumn import DepthColumn
from stratigraphy.util.find_description import get_description_blocks, get_description_lines
from stratigraphy.util.find_description import (
get_description_blocks,
get_description_blocks_from_layer_identifier,
get_description_lines,
)
from stratigraphy.util.interval import BoundaryInterval, Interval
from stratigraphy.util.layer_identifier_column import (
find_layer_identifier_column,
find_layer_identifier_column_entries,
)
from stratigraphy.util.line import TextLine, TextWord
from stratigraphy.util.textblock import TextBlock, block_distance
from stratigraphy.util.util import (
Expand Down Expand Up @@ -41,7 +50,7 @@ def process_page(page: fitz.Page, geometric_lines, language: str, **params: dict
for x0, y0, x1, y1, word, block_no, line_no, _word_no in fitz.utils.get_text(page, "words"):
rect = fitz.Rect(x0, y0, x1, y1) * page.rotation_matrix
text_word = TextWord(rect, word)
words.append(TextLine([text_word]))
words.append(text_word)
key = f"{block_no}_{line_no}"
if key not in words_by_line:
words_by_line[key] = []
Expand Down Expand Up @@ -81,14 +90,65 @@ def process_page(page: fitz.Page, geometric_lines, language: str, **params: dict
depth_column_entries, words, depth_column_params=params["depth_column_params"]
)
)

# Detect Layer Index Columns
layer_identifier_entries = find_layer_identifier_column_entries(words)
layer_identifier_columns = (
find_layer_identifier_column(layer_identifier_entries) if layer_identifier_entries else []
)
if layer_identifier_columns:
layer_identifier_pairs = []
for layer_identifier_column in layer_identifier_columns:
material_description_rect = find_material_description_column(
lines, layer_identifier_column, language, **params["material_description"]
)
if material_description_rect:
layer_identifier_pairs.append((layer_identifier_column, material_description_rect))

# Obtain the best pair. In contrast to depth columns, there only ever is one layer index column per page.
if layer_identifier_pairs:
layer_identifier_pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1]))
layer_identifier_column, material_description_rect = layer_identifier_pairs[-1]
# split the material description rect into blocks.
description_lines = get_description_lines(lines, material_description_rect)
blocks = get_description_blocks_from_layer_identifier(layer_identifier_column.entries, description_lines)

predictions = [{"material_description": block.to_json()} for block in blocks]
predictions = parse_and_remove_empty_predictions(predictions)

json_filtered_pairs = [
{
"depth_column": None,
"material_description_rect": [
material_description_rect.x0,
material_description_rect.y0,
material_description_rect.x1,
material_description_rect.y1,
],
}
]

# Visualization: To be dropped before merging to main.
for layer_identifier_column in layer_identifier_columns:
fitz.utils.draw_rect(
page, layer_identifier_column.rect() * page.derotation_matrix, color=fitz.utils.getColor("blue")
)
for block in blocks:
fitz.utils.draw_rect(page, block.rect * page.derotation_matrix, color=fitz.utils.getColor("red"))
fitz.utils.draw_rect(
page, material_description_rect * page.derotation_matrix, color=fitz.utils.getColor("blue")
)
page.parent.save(DATAPATH / "_temp" / "output.pdf", garbage=4, deflate=True, clean=True)

return predictions, json_filtered_pairs
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For now, I just exit here if there's a layer index column.

Some thoughts:
I believe it should be possible to treat a layer index column as a special case of a depth column and then continue the "normal" way. I think that would be the cleaner approach going forward.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed


pairs = []
for depth_column in depth_columns:
material_description_rect = find_material_description_column(
lines, depth_column, language, **params["material_description"]
)
if material_description_rect:
pairs.append((depth_column, material_description_rect))

# lowest score first
pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1], words))

Expand All @@ -101,7 +161,7 @@ def process_page(page: fitz.Page, geometric_lines, language: str, **params: dict
filtered_pairs = [item for index, item in enumerate(pairs) if index not in to_delete]

groups = [] # list of matched depth intervals and text blocks
# groups is of the form: ["depth_interval": BoundaryInterval, "block": TextBlock]
# groups is of the form: [{"depth_interval": BoundaryInterval, "block": TextBlock}]
if len(filtered_pairs): # match depth column items with material description
for depth_column, material_description_rect in filtered_pairs:
description_lines = get_description_lines(lines, material_description_rect)
Expand Down
26 changes: 13 additions & 13 deletions src/stratigraphy/util/find_depth_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,16 @@

from stratigraphy.util.depthcolumn import BoundaryDepthColumn, LayerDepthColumn
from stratigraphy.util.depthcolumnentry import DepthColumnEntry, LayerDepthColumnEntry
from stratigraphy.util.line import TextLine
from stratigraphy.util.line import TextWord


def depth_column_entries(all_words: list[TextLine], include_splits: bool) -> list[DepthColumnEntry]:
def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> list[DepthColumnEntry]:
"""Find all depth column entries given a list of TextLine objects.

Note: Only depths up to two digits before the decimal point are supported.

Args:
all_words (list[TextLine]): List of Text lines to extract depth column entries from.
all_words (list[TextWord]): List of text words to extract depth column entries from.
include_splits (bool): Whether to include split entries.

Returns:
Expand All @@ -28,14 +28,14 @@ def value_as_float(string_value: str) -> float: # noqa: D103
return abs(float(parsed_text))

entries = []
for line in sorted(all_words, key=lambda line: line.rect.y0):
for word in sorted(all_words, key=lambda word: word.rect.y0):
try:
input_string = line.text.strip().replace(",", ".")
input_string = word.text.strip().replace(",", ".")
regex = re.compile(r"^-?([0-9]+(\.[0-9]+)?)[müMN\\.]*$")
match = regex.match(input_string)
if match:
value = value_as_float(match.group(1))
entries.append(DepthColumnEntry(line.rect, value))
entries.append(DepthColumnEntry(word.rect, value))
elif include_splits:
# support for e.g. "1.10-1.60m" extracted as a single word
regex2 = re.compile(r"^-?([0-9]+(\.[0-9]+)?)[müMN\\.]*\W+([0-9]+(\.[0-9]+)?)[müMN\\.]*$")
Expand All @@ -44,33 +44,33 @@ def value_as_float(string_value: str) -> float: # noqa: D103
if match2:
value1 = value_as_float(match2.group(1))
first_half_rect = fitz.Rect(
line.rect.x0, line.rect.y0, line.rect.x1 - line.rect.width / 2, line.rect.y1
word.rect.x0, word.rect.y0, word.rect.x1 - word.rect.width / 2, word.rect.y1
)
entries.append(DepthColumnEntry(first_half_rect, value1))

value2 = value_as_float(match2.group(3))
second_half_rect = fitz.Rect(
line.rect.x0 + line.rect.width / 2, line.rect.y0, line.rect.x1, line.rect.y1
word.rect.x0 + word.rect.width / 2, word.rect.y0, word.rect.x1, word.rect.y1
)
entries.append(DepthColumnEntry(second_half_rect, value2))
except ValueError:
pass
return entries


def find_layer_depth_columns(entries: list[DepthColumnEntry], all_words: list[TextLine]) -> list[LayerDepthColumn]:
def find_layer_depth_columns(entries: list[DepthColumnEntry], all_words: list[TextWord]) -> list[LayerDepthColumn]:
"""Finds all layer depth columns.

Generates a list of LayerDepthColumnEntry objects by finding conseucutive pairs of DepthColumnEntry objects.
Different columns are grouped together in LayerDepthColumn objects. Finally a list of LayerDepthColumn objects,
Generates a list of LayerDepthColumnEntry objects by finding consecutive pairs of DepthColumnEntry objects.
Different columns are grouped together in LayerDepthColumn objects. Finally, a list of LayerDepthColumn objects,
one for each column, is returned.

A layer corresponds to a material layer. The layer is defined using a start and end point (e.g. 1.10-1.60m).
The start and end points are represented as DepthColumnEntry objects.

Args:
entries (list[DepthColumnEntry]): List of depth column entries.
all_words (list[TextLine]): List of all TextLine objects.
all_words (list[TextWord]): List of all TextWord objects.

Returns:
list[LayerDepthColumn]: List of all layer depth columns identified.
Expand Down Expand Up @@ -125,7 +125,7 @@ def find_pair(entry: DepthColumnEntry) -> DepthColumnEntry | None: # noqa: D103


def find_depth_columns(
entries: list[DepthColumnEntry], all_words: list[TextLine], depth_column_params: dict
entries: list[DepthColumnEntry], all_words: list[TextWord], depth_column_params: dict
) -> list[BoundaryDepthColumn]:
"""Construct all possible BoundaryDepthColumn objects from the given DepthColumnEntry objects.

Expand Down
63 changes: 62 additions & 1 deletion src/stratigraphy/util/find_description.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,77 @@ def get_description_lines(lines: list[TextLine], material_description_rect: fitz
Returns:
list[TextLine]: The filtered lines.
"""
if not lines:
return []
filtered_lines = [
line
for line in lines
if line.rect.x0 < material_description_rect.x1 - 0.4 * material_description_rect.width
if material_description_rect.contains(line.rect)
]

return sorted([line for line in filtered_lines if line], key=lambda line: line.rect.y0)


def get_description_blocks_from_layer_identifier(
    layer_identifier_entries: list[TextLine], description_lines: list[TextLine]
) -> list[TextBlock]:
    """Split the material description lines into blocks, one per layer identifier.

    The description lines are consumed strictly in the order given. For every layer identifier, the lines that
    have not been assigned yet are handed to ``matching_blocks`` together with the identifier that follows it;
    the final identifier has no successor and is paired with ``None``. Identifiers for which no line is matched
    contribute no block, so the result can be shorter than ``layer_identifier_entries``.

    Args:
        layer_identifier_entries (list[TextLine]): The layer identifier entries.
        description_lines (list[TextLine]): All lines constituting the material description.

    Returns:
        list[TextBlock]: The blocks of the material description, in the order they were built.
    """
    # Pair each identifier with its successor. zip() stops with the identifiers, so an empty
    # identifier list yields no iterations (and therefore no blocks).
    successors = [*layer_identifier_entries[1:], None]

    collected = []
    consumed = 0  # number of description lines already placed into a block
    for _current, upcoming in zip(layer_identifier_entries, successors):
        new_blocks = matching_blocks(description_lines, consumed, upcoming)
        for new_block in new_blocks:
            consumed += len(new_block.lines)
        collected.extend(new_blocks)

    return collected


def matching_blocks(
    all_lines: list[TextLine], line_index: int, next_layer_identifier: TextLine | None
) -> list[TextBlock]:
    """Collect the not-yet-assigned description lines that precede the next layer identifier.

    Starting at ``line_index``, lines are taken in the order given until one of them has its bottom edge
    (``rect.y1``) at or below the vertical midpoint of the next layer identifier. When there is no next
    identifier, all remaining lines are taken.

    Args:
        all_lines (list[TextLine]): All TextLine objects constituting the material description.
        line_index (int): The index of the first line that is not yet assigned to a block.
        next_layer_identifier (TextLine | None): The next layer identifier, or None if the current layer
            is the last one.

    Returns:
        list[TextBlock]: A single-element list holding the new block, or an empty list if no lines are added.
    """
    # Compare against None explicitly: a present-but-falsy identifier must still bound the block,
    # otherwise every remaining line would be swallowed by the current layer.
    if next_layer_identifier is not None:
        next_rect = next_layer_identifier.rect
        # Midpoint of the next identifier: lines ending above it are taken to belong to the current layer.
        y1_threshold = next_rect.y0 + next_rect.height / 2
    else:
        y1_threshold = None

    matched_lines = []
    for current_line in all_lines[line_index:]:
        if y1_threshold is not None and current_line.rect.y1 >= y1_threshold:
            # Stop at the first line that reaches the next layer; later lines are left for later blocks.
            break
        matched_lines.append(current_line)

    return [TextBlock(matched_lines)] if matched_lines else []


def get_description_blocks(
description_lines: list[TextLine],
geometric_lines: list[Line],
Expand Down
Loading
Loading