Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: Material Block recognition for Geneva Layout #45

Merged
merged 4 commits into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 18 additions & 5 deletions config/matching_params.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,31 +46,44 @@ material_description:
fr:
including_expressions:
- sol
- végétal
- végétal # remove accents generally; ocr might be wrong
- dallage
- terre
- bitume
- bitumineux
- grave d'infrastructure
- grave d'infrastructure # what happens if we remove this?
- sable
- limon
- gravier
- asphalte
- humus
- humus # hummus maybe?
- brun
- gris
- grise
- mou
- dur
- dure
- ferme
- racine
- revetement
- pierre
- beige
- beton
- craie
- marne
- materiau de base
- materiau
- matrice sableuse
- enrobé
- enrobé # accent --> check what happens if it's removed
- terrain
- remblais
- remblai
- molasse
- phase
- formations
- limoneuse
- argileuse
- argileux
- mousse
excluding_expressions:
- monsieur
- fin
Comment on lines 46 to 89
Copy link
Contributor Author

@redur redur May 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I checked our expressions with a native French speaker who also has experience with NLP for the French language. We looked at borehole profiles together and extended the list. He also suggested removing all accents, because OCR frequently makes errors there.

Those are the reasons for the changes and comments here.

Expand Down
61 changes: 58 additions & 3 deletions src/stratigraphy/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,17 @@

import fitz

from stratigraphy import DATAPATH
from stratigraphy.util import find_depth_columns
from stratigraphy.util.dataclasses import Line
from stratigraphy.util.depthcolumn import DepthColumn
from stratigraphy.util.find_description import get_description_blocks, get_description_lines
from stratigraphy.util.find_description import (
get_description_blocks,
get_description_blocks_from_layer_index,
get_description_lines,
)
from stratigraphy.util.interval import BoundaryInterval, Interval
from stratigraphy.util.layer_index_column import find_layer_index_column, find_layer_index_column_entries
from stratigraphy.util.line import TextLine, TextWord
from stratigraphy.util.textblock import TextBlock, block_distance
from stratigraphy.util.util import (
Expand Down Expand Up @@ -81,14 +87,63 @@ def process_page(page: fitz.Page, geometric_lines, language: str, **params: dict
depth_column_entries, words, depth_column_params=params["depth_column_params"]
)
)

# Detect Layer Index Columns
layer_index_entries = find_layer_index_column_entries(words)
layer_index_columns = find_layer_index_column(layer_index_entries) if layer_index_entries else []
if layer_index_columns:
layer_index_pairs = []
for layer_index_column in layer_index_columns:
material_description_rect = find_material_description_column(
lines, layer_index_column, language, **params["material_description"]
)
if material_description_rect:
layer_index_pairs.append((layer_index_column, material_description_rect))

# Obtain the best pair. In contrast to depth columns, there is only ever one layer index column per page.
if layer_index_pairs:
layer_index_pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1]))
layer_index_column, material_description_rect = layer_index_pairs[-1]
# split the material description rect into blocks.
description_lines = get_description_lines(lines, material_description_rect)
blocks = get_description_blocks_from_layer_index(layer_index_column.entries, description_lines)

predictions = [{"material_description": block.to_json()} for block in blocks]
predictions = parse_and_remove_empty_predictions(predictions)

json_filtered_pairs = [
{
"depth_column": None,
"material_description_rect": [
material_description_rect.x0,
material_description_rect.y0,
material_description_rect.x1,
material_description_rect.y1,
],
}
]

# Visualization: To be dropped before merging to main.
for layer_index_column in layer_index_columns:
fitz.utils.draw_rect(
page, layer_index_column.rect() * page.derotation_matrix, color=fitz.utils.getColor("blue")
)
for block in blocks:
fitz.utils.draw_rect(page, block.rect * page.derotation_matrix, color=fitz.utils.getColor("red"))
fitz.utils.draw_rect(
page, material_description_rect * page.derotation_matrix, color=fitz.utils.getColor("blue")
)
page.parent.save(DATAPATH / "_temp" / "output.pdf", garbage=4, deflate=True, clean=True)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will need to be dropped before merging. It is useful for inspecting the recognized layer index columns as well as the material description rect.


return predictions, json_filtered_pairs
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For now, I just exit here if there's a layer index column.

Some thoughts:
I believe it should be possible to treat a layer index column as a special case of a depth column and continue the "normal" way. I believe that would be a "cleaner" way to move forward.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed


pairs = []
for depth_column in depth_columns:
material_description_rect = find_material_description_column(
lines, depth_column, language, **params["material_description"]
)
if material_description_rect:
pairs.append((depth_column, material_description_rect))

# lowest score first
pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1], words))

Expand All @@ -101,7 +156,7 @@ def process_page(page: fitz.Page, geometric_lines, language: str, **params: dict
filtered_pairs = [item for index, item in enumerate(pairs) if index not in to_delete]

groups = [] # list of matched depth intervals and text blocks
# groups is of the form: ["depth_interval": BoundaryInterval, "block": TextBlock]
# groups is of the form: [{"depth_interval": BoundaryInterval, "block": TextBlock}]
if len(filtered_pairs): # match depth column items with material description
for depth_column, material_description_rect in filtered_pairs:
description_lines = get_description_lines(lines, material_description_rect)
Expand Down
61 changes: 60 additions & 1 deletion src/stratigraphy/util/find_description.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,75 @@ def get_description_lines(lines: list[TextLine], material_description_rect: fitz
Returns:
list[TextLine]: The filtered lines.
"""
if not lines:
return []
filtered_lines = [
line
for line in lines
if line.rect.x0 < material_description_rect.x1 - 0.4 * material_description_rect.width
if material_description_rect.contains(line.rect)
]

return sorted([line for line in filtered_lines if line], key=lambda line: line.rect.y0)


def get_description_blocks_from_layer_index(
    layer_index_entries: list[TextLine], description_lines: list[TextLine]
) -> list[TextBlock]:
    """Split the material description lines into blocks, guided by the layer index entries.

    The entries are visited in order. For each entry, the description lines that have not been consumed yet are
    passed to ``matching_blocks`` together with the entry that follows it (``None`` for the last entry). The
    blocks returned by that call are collected, and their lines count as consumed for the next entry.

    Args:
        layer_index_entries (list[TextLine]): The layer index entries.
        description_lines (list[TextLine]): All lines constituting the material description.

    Returns:
        list[TextBlock]: The blocks of the material description.
    """
    collected_blocks = []
    consumed_lines = 0  # number of description lines that are already part of a block
    entry_count = len(layer_index_entries)

    for following_position, _ in enumerate(layer_index_entries, start=1):
        # The entry after the current one bounds the current block from below; the last entry has no bound.
        following_entry = layer_index_entries[following_position] if following_position < entry_count else None

        new_blocks = matching_blocks(description_lines, consumed_lines, following_entry)
        consumed_lines += sum(len(block.lines) for block in new_blocks)
        collected_blocks.extend(new_blocks)

    return collected_blocks


def matching_blocks(all_lines: list[TextLine], line_index: int, next_layer_index: TextLine | None) -> list[TextBlock]:
    """Adds lines to a block until the next layer index is reached.

    Starting at ``line_index``, consecutive lines are collected for as long as their bottom edge (``rect.y1``)
    lies above the vertical midpoint of the next layer index entry. If there is no next entry, all remaining
    lines are collected.

    NOTE(review): This could become a method of LayerIndexColumn. ``next_layer_index`` is the next layer index
    *entry*, not an integer index: layer index entries are plain TextLine objects, as there is no dedicated
    LayerIndexEntry class (yet).

    Args:
        all_lines (list[TextLine]): All TextLine objects constituting the material description.
        line_index (int): The index of the first line that is not yet assigned to a block.
        next_layer_index (TextLine | None): The next layer index entry, or None if there is none.

    Returns:
        list[TextBlock]: A list holding the next block, or an empty list if no lines are added.
    """
    y1_threshold = None
    if next_layer_index:
        next_entry_rect = next_layer_index.rect
        # Lines belong to the current layer as long as they end above the middle of the next entry.
        y1_threshold = next_entry_rect.y0 + next_entry_rect.height / 2

    matched_lines = []
    for current_line in all_lines[line_index:]:
        if y1_threshold is not None and current_line.rect.y1 >= y1_threshold:
            break
        matched_lines.append(current_line)

    return [TextBlock(matched_lines)] if matched_lines else []


def get_description_blocks(
description_lines: list[TextLine],
geometric_lines: list[Line],
Expand Down
162 changes: 162 additions & 0 deletions src/stratigraphy/util/layer_index_column.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
"""Module for the LayerIndexColumn class."""

import re

import fitz

from stratigraphy.util.line import TextLine


class LayerIndexColumn:
    """Class for a layer index column.

    A layer index column groups the text lines that act as layer indices (see
    find_layer_index_column_entries for what qualifies as such an entry).

    NOTE(review): Already now, this is quite similar to a depth column.
    """

    def __init__(self, entries: list[TextLine]):
        """Initialize the LayerIndexColumn object.

        Args:
            entries (list[TextLine]): The entries corresponding to the layer indices.
        """
        self.entries = entries

    @property
    def max_x0(self) -> float:
        """The largest left edge (x0) among the entry rectangles."""
        return max(rect.x0 for rect in self.rects())

    @property
    def min_x1(self) -> float:
        """The smallest right edge (x1) among the entry rectangles."""
        return min(rect.x1 for rect in self.rects())

    def rect(self) -> fitz.Rect:
        """Get the rectangle of the layer index column.

        Returns:
            fitz.Rect: The rectangle of the layer index column, i.e. the bounding box of all entry rectangles.
        """
        rects = self.rects()  # build the list once instead of once per coordinate
        x0 = min(rect.x0 for rect in rects)
        x1 = max(rect.x1 for rect in rects)
        y0 = min(rect.y0 for rect in rects)
        y1 = max(rect.y1 for rect in rects)
        return fitz.Rect(x0, y0, x1, y1)

    def rects(self) -> list[fitz.Rect]:
        """Get the rectangles of all entries, in entry order.

        Returns:
            list[fitz.Rect]: The rectangle of each entry.
        """
        return [entry.rect for entry in self.entries]

    def add_entry(self, entry: TextLine):
        """Add a new layer index column entry to the layer index column.

        Args:
            entry (TextLine): The layer index column entry to be added.
        """
        self.entries.append(entry)

    def can_be_appended(self, rect: fitz.Rect) -> bool:
        """Checks if a new layer index column entry can be appended to the current layer index column.

        Both of the following must hold:
            1. The new rectangle is wider than the current layer index column, or the horizontal middle of the
               new rectangle lies strictly within the horizontal boundaries of the current layer index column.
            2. The new rectangle overlaps horizontally with every existing entry, i.e. it starts at or before
               the smallest x1 and ends at or after the largest x0 of the current entries.

        Args:
            rect (fitz.Rect): Rect of the layer index column entry to be appended.

        Returns:
            bool: True if the new layer index column entry can be appended, False otherwise.
        """
        new_middle = (rect.x0 + rect.x1) / 2
        column_rect = self.rect()  # computed once; every call to rect() rebuilds the bounding box
        return (column_rect.width < rect.width or column_rect.x0 < new_middle < column_rect.x1) and (
            rect.x0 <= self.min_x1 and self.max_x0 <= rect.x1
        )

    def strictly_contains(self, other: "LayerIndexColumn") -> bool:
        """Check if this column holds all entries of another column plus at least one more.

        Args:
            other (LayerIndexColumn): The column whose entries are looked up in this column.

        Returns:
            bool: True if other has fewer entries than this column and all of them are entries of this column.
        """
        return len(other.entries) < len(self.entries) and all(
            other_entry in self.entries for other_entry in other.entries
        )

    def is_contained(self, rect: fitz.Rect) -> bool:
        """Check if the layer index column is contained in another rectangle.

        Args:
            rect (fitz.Rect): The rectangle to check if it contains the layer index column.

        Returns:
            bool: True if the layer index column is contained in the rectangle, False otherwise.
        """
        column_rect = self.rect()  # computed once; every call to rect() rebuilds the bounding box
        return (
            rect.x0 <= column_rect.x0
            and column_rect.x1 <= rect.x1
            and rect.y0 <= column_rect.y0
            and column_rect.y1 <= rect.y1
        )

    def noise_count(self, words) -> int:
        """Return the noise count of the column, which is always 0 for a layer index column.

        NOTE(review): presumably present so that the column can be scored like a depth column - confirm
        against score_column_match.

        Args:
            words: Not used.

        Returns:
            int: Always 0.
        """
        return 0
stijnvermeeren-swisstopo marked this conversation as resolved.
Show resolved Hide resolved


def find_layer_index_column_entries(all_words: list[TextLine]) -> list[TextLine]:
    r"""Find the layer index column entries.

    A line is an entry if its text is shorter than 7 characters and starts with a match of the regular
    expression below. The lines are returned sorted by the top edge (y0) of their rectangle.

    Regex explanation:
        - \b is a word boundary. This ensures that the match must start at the beginning of a word.
        - [\da-z-]+ matches one or more (+) digits (\d), lowercase letters (a-z) or hyphens (-).
        - \) matches a closing parenthesis. The backslash is necessary because parentheses are special characters
          in regular expressions, so we need to escape it to match a literal parenthesis.
    This regular expression will match strings like "1)", "2)", "a)", "b)", "1a4)", "6de)", "1-2)", etc.

    Args:
        all_words (list[TextLine]): The words to search for layer index columns.

    Returns:
        list[TextLine]: The layer index column entries.
    """
    # Compiled once, outside the loop; Pattern.match only matches at the start of the text.
    layer_index_pattern = re.compile(r"\b[\da-z-]+\)")
    return [
        line
        for line in sorted(all_words, key=lambda line: line.rect.y0)
        if layer_index_pattern.match(line.text) and len(line.text) < 7
    ]


def find_layer_index_column(entries: list[TextLine]) -> list[LayerIndexColumn]:
    """Find the layer index column given the index column entries.

    The entries are grouped into columns, columns whose entries are all part of a larger column are dropped,
    columns whose rectangle lies within the rectangle of another column are merged with that column, and
    finally only columns with more than two entries are kept.

    Note: Similar to find_depth_columns.find_depth_columns. Refactoring may be desired.

    Args:
        entries (list[TextLine]): The layer index column entries.

    Returns:
        list[LayerIndexColumn]: The found layer index columns. Empty if there are no entries.
    """
    if not entries:
        return []

    layer_index_columns = [LayerIndexColumn([entries[0]])]
    for entry in entries[1:]:
        has_match = False
        # An entry is added to every column it fits into, not only to the first one.
        for column in layer_index_columns:
            if column.can_be_appended(entry.rect):
                column.add_entry(entry)
                has_match = True
        if not has_match:
            layer_index_columns.append(LayerIndexColumn([entry]))

    # only keep columns whose entries are not fully contained in a different column
    layer_index_columns = [
        column
        for column in layer_index_columns
        if all(not other.strictly_contains(column) for other in layer_index_columns)
    ]

    # check if the column rect is a subset of another column rect. If so, merge the entries and sort them by y0.
    # The outer loop runs over a snapshot, because columns are removed from layer_index_columns along the way;
    # removing from the list that is being iterated would silently skip columns.
    absorbed_ids = set()
    for column in list(layer_index_columns):
        if id(column) in absorbed_ids:
            continue  # this column has already been merged into another one
        for other in layer_index_columns:
            if column != other and column.is_contained(other.rect()):
                for entry in other.entries:
                    if entry not in column.entries:
                        column.entries.append(entry)
                column.entries.sort(key=lambda entry: entry.rect.y0)
                layer_index_columns.remove(other)
                absorbed_ids.add(id(other))
                break

    return [column for column in layer_index_columns if len(column.entries) > 2]
Loading