-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feat: Material Block recognition for Geneva Layout #45
Changes from 1 commit
14aab1f
7c161d6
1a60ddc
d4dac85
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,11 +5,17 @@ | |
|
||
import fitz | ||
|
||
from stratigraphy import DATAPATH | ||
from stratigraphy.util import find_depth_columns | ||
from stratigraphy.util.dataclasses import Line | ||
from stratigraphy.util.depthcolumn import DepthColumn | ||
from stratigraphy.util.find_description import get_description_blocks, get_description_lines | ||
from stratigraphy.util.find_description import ( | ||
get_description_blocks, | ||
get_description_blocks_from_layer_index, | ||
get_description_lines, | ||
) | ||
from stratigraphy.util.interval import BoundaryInterval, Interval | ||
from stratigraphy.util.layer_index_column import find_layer_index_column, find_layer_index_column_entries | ||
from stratigraphy.util.line import TextLine, TextWord | ||
from stratigraphy.util.textblock import TextBlock, block_distance | ||
from stratigraphy.util.util import ( | ||
|
@@ -81,14 +87,63 @@ def process_page(page: fitz.Page, geometric_lines, language: str, **params: dict | |
depth_column_entries, words, depth_column_params=params["depth_column_params"] | ||
) | ||
) | ||
|
||
# Detect Layer Index Columns | ||
layer_index_entries = find_layer_index_column_entries(words) | ||
layer_index_columns = find_layer_index_column(layer_index_entries) if layer_index_entries else [] | ||
if layer_index_columns: | ||
layer_index_pairs = [] | ||
for layer_index_column in layer_index_columns: | ||
material_description_rect = find_material_description_column( | ||
lines, layer_index_column, language, **params["material_description"] | ||
) | ||
if material_description_rect: | ||
layer_index_pairs.append((layer_index_column, material_description_rect)) | ||
|
||
# Obtain the best pair. In contrast to depth columns, there is only ever one layer index column per page. | ||
if layer_index_pairs: | ||
layer_index_pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1])) | ||
layer_index_column, material_description_rect = layer_index_pairs[-1] | ||
# split the material description rect into blocks. | ||
description_lines = get_description_lines(lines, material_description_rect) | ||
blocks = get_description_blocks_from_layer_index(layer_index_column.entries, description_lines) | ||
|
||
predictions = [{"material_description": block.to_json()} for block in blocks] | ||
predictions = parse_and_remove_empty_predictions(predictions) | ||
|
||
json_filtered_pairs = [ | ||
{ | ||
"depth_column": None, | ||
"material_description_rect": [ | ||
material_description_rect.x0, | ||
material_description_rect.y0, | ||
material_description_rect.x1, | ||
material_description_rect.y1, | ||
], | ||
} | ||
] | ||
|
||
# Visualization: To be dropped before merging to main. | ||
for layer_index_column in layer_index_columns: | ||
fitz.utils.draw_rect( | ||
page, layer_index_column.rect() * page.derotation_matrix, color=fitz.utils.getColor("blue") | ||
) | ||
for block in blocks: | ||
fitz.utils.draw_rect(page, block.rect * page.derotation_matrix, color=fitz.utils.getColor("red")) | ||
fitz.utils.draw_rect( | ||
page, material_description_rect * page.derotation_matrix, color=fitz.utils.getColor("blue") | ||
) | ||
page.parent.save(DATAPATH / "_temp" / "output.pdf", garbage=4, deflate=True, clean=True) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This will need to be dropped. It is useful for inspecting the recognized layer index columns as well as the material description rect. |
||
|
||
return predictions, json_filtered_pairs | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For now, I just exit here if there's a layer index column. Some thoughts: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed |
||
|
||
pairs = [] | ||
for depth_column in depth_columns: | ||
material_description_rect = find_material_description_column( | ||
lines, depth_column, language, **params["material_description"] | ||
) | ||
if material_description_rect: | ||
pairs.append((depth_column, material_description_rect)) | ||
|
||
# lowest score first | ||
pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1], words)) | ||
|
||
|
@@ -101,7 +156,7 @@ def process_page(page: fitz.Page, geometric_lines, language: str, **params: dict | |
filtered_pairs = [item for index, item in enumerate(pairs) if index not in to_delete] | ||
|
||
groups = [] # list of matched depth intervals and text blocks | ||
# groups is of the form: ["depth_interval": BoundaryInterval, "block": TextBlock] | ||
# groups is of the form: [{"depth_interval": BoundaryInterval, "block": TextBlock}] | ||
if len(filtered_pairs): # match depth column items with material description | ||
for depth_column, material_description_rect in filtered_pairs: | ||
description_lines = get_description_lines(lines, material_description_rect) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,16 +24,75 @@ def get_description_lines(lines: list[TextLine], material_description_rect: fitz | |
Returns: | ||
list[TextLine]: The filtered lines. | ||
""" | ||
if not lines: | ||
return [] | ||
filtered_lines = [ | ||
line | ||
for line in lines | ||
if line.rect.x0 < material_description_rect.x1 - 0.4 * material_description_rect.width | ||
if material_description_rect.contains(line.rect) | ||
] | ||
|
||
return sorted([line for line in filtered_lines if line], key=lambda line: line.rect.y0) | ||
|
||
|
||
def get_description_blocks_from_layer_index(
    layer_index_entries: list[TextLine], description_lines: list[TextLine]
) -> list[TextBlock]:
    """Divide the description lines into blocks based on the layer index entries.

    For every layer index entry, the not yet assigned description lines are handed to `matching_blocks`
    together with the following layer index entry (or None for the last entry). The lines returned in a
    block are considered assigned, and the next entry continues with the line after them. Entries for which
    no line is matched do not contribute a block.

    Args:
        layer_index_entries (list[TextLine]): The layer index entries.
        description_lines (list[TextLine]): All lines constituting the material description. They are
            consumed in the given order (presumably sorted from top to bottom - verify against the caller).

    Returns:
        list[TextBlock]: The blocks of the material description.
    """
    if not layer_index_entries:
        return []

    blocks = []
    line_index = 0  # index of the first description line that is not yet part of a block
    # Every entry is delimited by its successor; the last entry has no successor, hence the trailing None.
    for next_layer_index in [*layer_index_entries[1:], None]:
        matched_blocks = matching_blocks(description_lines, line_index, next_layer_index)
        line_index += sum(len(block.lines) for block in matched_blocks)
        blocks.extend(matched_blocks)

    return blocks
|
||
|
||
def matching_blocks(all_lines: list[TextLine], line_index: int, next_layer_index: TextLine | None) -> list[TextBlock]:
    """Collect the lines from line_index onwards into one block until the next layer index entry is reached.

    A line is added as long as its lower edge (rect.y1) lies above the vertical middle of the next layer
    index entry. Collection stops at the first line for which this no longer holds. If there is no next
    layer index entry, all remaining lines are added.

    Note (from review): `next_layer_index` denotes the next layer index *entry* (a TextLine), not a numeric
    index. It was discussed that this function could become a method and that a dedicated "LayerIndexEntry"
    object might be introduced later.

    Args:
        all_lines (list[TextLine]): All TextLine objects constituting the material description.
        line_index (int): The index of the first line that is not yet assigned to a block.
        next_layer_index (TextLine | None): The next layer index entry, or None if there is none.

    Returns:
        list[TextBlock]: A list holding the next block, or an empty list if no lines are added.
    """
    y1_threshold = None
    if next_layer_index:
        next_entry_rect = next_layer_index.rect
        # Lines must end above the vertical middle of the next layer index entry.
        y1_threshold = next_entry_rect.y0 + next_entry_rect.height / 2

    matched_lines = []
    for current_line in all_lines[line_index:]:
        if y1_threshold is None or current_line.rect.y1 < y1_threshold:
            matched_lines.append(current_line)
        else:
            # The first line reaching the next entry ends the block; later lines belong to later blocks.
            break

    return [TextBlock(matched_lines)] if matched_lines else []
|
||
|
||
def get_description_blocks( | ||
description_lines: list[TextLine], | ||
geometric_lines: list[Line], | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
"""Module for the LayerIndexColumn class.""" | ||
|
||
import re | ||
|
||
import fitz | ||
|
||
from stratigraphy.util.line import TextLine | ||
|
||
|
||
class LayerIndexColumn:
    """Class for a layer index column.

    A layer index column is a collection of text lines (the entries), each one holding a layer index.

    Note (from review): this class is already quite similar to a depth column; a shared abstraction may be
    worthwhile in the future.
    """

    def __init__(self, entries: list[TextLine]):
        """Initialize the LayerIndexColumn object.

        Args:
            entries (list[TextLine]): The entries corresponding to the layer indices. The list is stored as
                is (not copied), so `add_entry` appends to the list passed in here.
        """
        self.entries = entries

    @property
    def max_x0(self) -> float:
        """The largest left edge (x0) among the rectangles of all entries."""
        return max(rect.x0 for rect in self.rects())

    @property
    def min_x1(self) -> float:
        """The smallest right edge (x1) among the rectangles of all entries."""
        return min(rect.x1 for rect in self.rects())

    def rect(self) -> fitz.Rect:
        """Get the rectangle of the layer index column.

        Returns:
            fitz.Rect: The bounding rectangle of the rectangles of all entries.
        """
        # Collect the entry rectangles once instead of once per coordinate.
        rects = self.rects()
        x0 = min(rect.x0 for rect in rects)
        x1 = max(rect.x1 for rect in rects)
        y0 = min(rect.y0 for rect in rects)
        y1 = max(rect.y1 for rect in rects)
        return fitz.Rect(x0, y0, x1, y1)

    def rects(self) -> list[fitz.Rect]:
        """Get the rectangles of all entries, in the order of the entries."""
        return [entry.rect for entry in self.entries]

    def add_entry(self, entry: TextLine):
        """Add a new layer index column entry to the layer index column.

        Args:
            entry (TextLine): The layer index column entry to be added.
        """
        self.entries.append(entry)

    def can_be_appended(self, rect: fitz.Rect) -> bool:
        """Checks if a new layer index column entry can be appended to the current layer index column.

        The new entry can be appended if both of the following hold:
            - The new rectangle is wider than the current layer index column, or the horizontal middle of
              the new rectangle lies strictly within the horizontal boundaries of the current column.
            - The new rectangle horizontally overlaps the span shared by all current entries, i.e. it starts
              no later than the smallest x1 and ends no earlier than the largest x0 of the entries.

        Args:
            rect (fitz.Rect): Rect of the layer index column entry to be appended.

        Returns:
            bool: True if the new layer index column entry can be appended, False otherwise.
        """
        column_rect = self.rect()  # computed once; it does not change during this check
        new_middle = (rect.x0 + rect.x1) / 2
        wider_or_centered = column_rect.width < rect.width or column_rect.x0 < new_middle < column_rect.x1
        overlaps_shared_span = rect.x0 <= self.min_x1 and self.max_x0 <= rect.x1
        return wider_or_centered and overlaps_shared_span

    def strictly_contains(self, other: "LayerIndexColumn") -> bool:
        """Check if this column holds all entries of another column and at least one more.

        Args:
            other (LayerIndexColumn): The column whose entries are looked up in this column.

        Returns:
            bool: True if other has fewer entries than this column and each of them is an entry of this
                column, False otherwise.
        """
        return len(other.entries) < len(self.entries) and all(
            other_entry in self.entries for other_entry in other.entries
        )

    def is_contained(self, rect: fitz.Rect) -> bool:
        """Check if the layer index column is contained in another rectangle.

        Args:
            rect (fitz.Rect): The rectangle to check if it contains the layer index column.

        Returns:
            bool: True if the layer index column is contained in the rectangle, False otherwise.
        """
        own_rect = self.rect()
        return rect.x0 <= own_rect.x0 and own_rect.x1 <= rect.x1 and rect.y0 <= own_rect.y0 and own_rect.y1 <= rect.y1

    def noise_count(self, words):
        """Return the noise count of the column, which currently is always 0.

        The words argument is ignored. NOTE(review): presumably this method only exists to offer the same
        interface as a depth column - confirm against the callers.
        """
        return 0
stijnvermeeren-swisstopo marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
def find_layer_index_column_entries(all_words: list[TextLine]) -> list:
    r"""Find the layer index column entries.

    A line is an entry if its text starts with a match of the regular expression below and is shorter than
    7 characters.

    Regex explanation:
        - \b is a word boundary. This ensures that the match must start at the beginning of a word.
        - [\da-z-]+ matches one or more (+) digits (\d), lowercase letters (a-z) or hyphens (-).
        - \) matches a closing parenthesis. The backslash is necessary because parentheses are special
          characters in regular expressions, so we need to escape it to match a literal parenthesis.
    This regular expression will match strings like "1)", "2)", "a)", "b)", "1a4)", "6de)", "a-b)", etc.

    Args:
        all_words (list[TextLine]): The lines to search for layer index column entries.

    Returns:
        list: The layer index column entries, sorted by the y0 coordinate of their rectangle.
    """
    # Compile once, outside the loop over the lines.
    regex = re.compile(r"\b[\da-z-]+\)")
    lines_top_to_bottom = sorted(all_words, key=lambda line: line.rect.y0)
    return [line for line in lines_top_to_bottom if regex.match(line.text) and len(line.text) < 7]
|
||
|
||
def find_layer_index_column(entries: list[TextLine]) -> list[LayerIndexColumn]:
    """Find the layer index column given the index column entries.

    The entries are grouped into columns: an entry is added to every existing column it can be appended to,
    and opens a new column if there is none. Afterwards, columns whose entries are all part of a larger
    column are dropped, columns whose rectangle lies within the rectangle of another column are merged with
    that column, and only columns with more than two entries are kept.

    Note: Similar to find_depth_columns.find_depth_columns. Refactoring may be desired.

    Args:
        entries (list[TextLine]): The layer index column entries.

    Returns:
        list[LayerIndexColumn]: The found layer index columns. Empty if there are no entries.
    """
    if not entries:
        return []

    layer_index_columns = [LayerIndexColumn([entries[0]])]
    for entry in entries[1:]:
        has_match = False
        for column in layer_index_columns:
            if column.can_be_appended(entry.rect):
                column.add_entry(entry)
                has_match = True
        if not has_match:
            layer_index_columns.append(LayerIndexColumn([entry]))

    # only keep columns whose entries are not fully contained in a different column
    layer_index_columns = [
        column
        for column in layer_index_columns
        if not any(other.strictly_contains(column) for other in layer_index_columns)
    ]

    # check if the column rect is a subset of another column rect. If so, merge the entries and sort them by y0.
    # Merged-away columns are only marked here and filtered out at the end: removing them from the list while
    # it is being iterated would make the loops skip columns.
    absorbed_ids = set()
    for column in layer_index_columns:
        if id(column) in absorbed_ids:
            continue
        for other in layer_index_columns:
            if other is column or id(other) in absorbed_ids:
                continue
            if column.is_contained(other.rect()):
                for entry in other.entries:
                    if entry not in column.entries:
                        column.entries.append(entry)
                column.entries.sort(key=lambda entry: entry.rect.y0)
                absorbed_ids.add(id(other))
                break

    return [
        column for column in layer_index_columns if id(column) not in absorbed_ids and len(column.entries) > 2
    ]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I checked our expressions with a native French speaker who also has experience with NLP for the French language. We looked at borehole profiles together and extended the list. He also suggested removing all accents, because OCR frequently makes errors there.
Those are the reasons for the changes and comments here.