Skip to content

Commit

Permalink
LGVISIUM-102: create Extractor for LayerIdentifierSidebar
Browse files Browse the repository at this point in the history
  • Loading branch information
stijnvermeeren-swisstopo committed Nov 12, 2024
1 parent eadd42e commit f331f12
Show file tree
Hide file tree
Showing 10 changed files with 163 additions and 132 deletions.
36 changes: 17 additions & 19 deletions src/stratigraphy/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@
)
from stratigraphy.layer.layer import IntervalBlockPair, Layer
from stratigraphy.lines.line import TextLine
from stratigraphy.sidebar import AAboveBSidebarExtractor, AToBSidebarExtractor, Sidebar
from stratigraphy.sidebar.layer_identifier_sidebar import (
find_layer_identifier_sidebar_entries,
find_layer_identifier_sidebars,
from stratigraphy.sidebar import (
AAboveBSidebarExtractor,
AToBSidebarExtractor,
LayerIdentifierSidebarExtractor,
Sidebar,
)
from stratigraphy.text.find_description import (
get_description_blocks,
Expand Down Expand Up @@ -60,24 +61,21 @@ def process_page(
Returns:
list[dict]: A list of the text of all description blocks.
"""
# Detect Layer Index Columns
layer_identifier_entries = find_layer_identifier_sidebar_entries(lines)
layer_identifier_sidebars = (
find_layer_identifier_sidebars(layer_identifier_entries) if layer_identifier_entries else []
)
# Detect Layer Identifier Sidebars

layer_identifier_sidebars = LayerIdentifierSidebarExtractor.from_lines(lines)
material_descriptions_sidebar_pairs = []
if layer_identifier_sidebars:
for layer_identifier_sidebar in layer_identifier_sidebars:
material_description_rect = find_material_description_column(
lines, layer_identifier_sidebar, language, **params["material_description"]
for layer_identifier_sidebar in layer_identifier_sidebars:
material_description_rect = find_material_description_column(
lines, layer_identifier_sidebar, language, **params["material_description"]
)
if material_description_rect:
material_descriptions_sidebar_pairs.append(
MaterialDescriptionRectWithSidebar(layer_identifier_sidebar, material_description_rect)
)
if material_description_rect:
material_descriptions_sidebar_pairs.append(
MaterialDescriptionRectWithSidebar(layer_identifier_sidebar, material_description_rect)
)

if material_descriptions_sidebar_pairs:
material_descriptions_sidebar_pairs.sort(key=lambda pair: pair.score_match())
if material_descriptions_sidebar_pairs:
material_descriptions_sidebar_pairs.sort(key=lambda pair: pair.score_match())

# If there is a layer identifier sidebar, then we use this directly.
# Else, we search for sidebars with depths.
Expand Down
2 changes: 2 additions & 0 deletions src/stratigraphy/sidebar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .a_to_b_sidebar import AToBSidebar
from .a_to_b_sidebar_extractor import AToBSidebarExtractor
from .layer_identifier_sidebar import LayerIdentifierSidebar
from .layer_identifier_sidebar_extractor import LayerIdentifierSidebarExtractor
from .sidebar import Sidebar

__all__ = [
Expand All @@ -16,4 +17,5 @@
"AToBSidebar",
"AToBSidebarExtractor",
"LayerIdentifierSidebar",
"LayerIdentifierSidebarExtractor",
]
2 changes: 1 addition & 1 deletion src/stratigraphy/sidebar/a_above_b_sidebar.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@
import numpy as np

from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry
from stratigraphy.layer.layer import IntervalBlockGroup
from stratigraphy.lines.line import TextLine
from stratigraphy.text.find_description import get_description_blocks
from stratigraphy.util.dataclasses import Line
from stratigraphy.util.interval import AAboveBInterval

from .interval_block_group import IntervalBlockGroup
from .sidebar import Sidebar


Expand Down
2 changes: 1 addition & 1 deletion src/stratigraphy/sidebar/a_to_b_sidebar.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@
import fitz

from stratigraphy.depthcolumn.depthcolumnentry import AToBDepthColumnEntry
from stratigraphy.layer.layer import IntervalBlockGroup
from stratigraphy.lines.line import TextLine
from stratigraphy.util.dataclasses import Line
from stratigraphy.util.interval import AToBInterval

from .interval_block_group import IntervalBlockGroup
from .sidebar import Sidebar


Expand Down
1 change: 0 additions & 1 deletion src/stratigraphy/sidebar/find_sidebars.py

This file was deleted.

18 changes: 18 additions & 0 deletions src/stratigraphy/sidebar/interval_block_group.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""Module that contains a helper class for associating depth intervals and text blocks."""

from dataclasses import dataclass

from stratigraphy.text.textblock import TextBlock
from stratigraphy.util.interval import Interval


@dataclass
class IntervalBlockGroup:
    """Helper class to represent a group of depth intervals and an associated group of text blocks.

    The class is used to simplify the code for obtaining an appropriate one-to-one correspondence between depth
    intervals and material descriptions.

    Attributes:
        depth_intervals (list[Interval]): The depth intervals that belong to this group.
        blocks (list[TextBlock]): The text blocks that are associated with the depth intervals of this group.
    """

    depth_intervals: list[Interval]
    blocks: list[TextBlock]
111 changes: 2 additions & 109 deletions src/stratigraphy/sidebar/layer_identifier_sidebar.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
"""Module for the layer identifier sidebars."""

import re
from dataclasses import dataclass

import fitz

from stratigraphy.layer.layer import IntervalBlockGroup
from stratigraphy.lines.line import TextLine
from stratigraphy.text.textblock import TextBlock
from stratigraphy.util.dataclasses import Line

from ..depthcolumn.depthcolumnentry import AToBDepthColumnEntry
from ..util.interval import AToBInterval
from .interval_block_group import IntervalBlockGroup
from .sidebar import Sidebar


Expand Down Expand Up @@ -72,7 +70,7 @@ def identify_groups(
result = []
for block in blocks:
depth_intervals = []
depth_interval = get_depth_interval_from_textblock(block)
depth_interval = AToBInterval.get_depth_interval_from_textblock(block)
if depth_interval:
depth_intervals.append(depth_interval)
result.append(IntervalBlockGroup(depth_intervals=depth_intervals, blocks=[block]))
Expand Down Expand Up @@ -140,108 +138,3 @@ def is_contained(self, rect: fitz.Rect) -> bool:
and rect.y0 <= self.rect().y0
and self.rect().y1 <= rect.y1
)


# Pattern for a layer identifier such as "1)", "a)" or "2-3)". Compiled once at import time instead of once per
# text line.
_LAYER_IDENTIFIER_REGEX = re.compile(r"\b[\da-z-]+\)")


def find_layer_identifier_sidebar_entries(lines: list[TextLine]) -> list[LayerIdentifierEntry]:
    r"""Find the layer identifier sidebar entries.

    Regex explanation:
    - \b is a word boundary. This ensures that the match must start at the beginning of a word.
    - [\da-z-]+ matches one or more (+) digits (\d), lowercase letters (a-z) or hyphens (-).
    - \) matches a closing parenthesis. The backslash is necessary because parentheses are special characters
      in regular expressions, so we need to escape it to match a literal parenthesis.

    This regular expression will match strings like "1)", "2)", "a)", "b)", "1a4)", "6de)", "2-3)", etc.

    Only the first word of each line is considered, the pattern has to match at the start of that word, and the
    word must be shorter than 7 characters.

    Args:
        lines (list[TextLine]): The lines to search for layer identifier entries.

    Returns:
        list[LayerIdentifierEntry]: The layer identifier sidebar entries, ordered from top to bottom (by the y0
            coordinate of the line they were found in).
    """
    entries = []
    for line in sorted(lines, key=lambda line: line.rect.y0):
        if not line.words:
            continue
        # Only match in the first word of every line, to avoid e.g. matching with "cm)" in a material description
        # containing an expression like "(diameter max 6 cm)".
        first_word = line.words[0]
        if _LAYER_IDENTIFIER_REGEX.match(first_word.text) and len(first_word.text) < 7:
            entries.append(LayerIdentifierEntry(first_word.rect, first_word.text))
    return entries


def find_layer_identifier_sidebars(entries: list[LayerIdentifierEntry]) -> list[LayerIdentifierSidebar]:
    """Find the layer identifier column given the index column entries.

    Note: Similar to find_depth_columns.find_depth_columns. Refactoring may be desired.

    Args:
        entries (list[LayerIdentifierEntry]): The layer identifier column entries. May be empty.

    Returns:
        list[LayerIdentifierSidebar]: The found layer identifier sidebars. Only sidebars with more than two entries
            are returned; the list is empty if there are no entries.
    """
    if not entries:
        # Nothing to group; also avoids an IndexError on entries[0] below.
        return []

    layer_identifier_sidebars = [LayerIdentifierSidebar([entries[0]])]
    for entry in entries[1:]:
        has_match = False
        # An entry is added to every existing column that accepts it, not only to the first one.
        for column in layer_identifier_sidebars:
            if column.can_be_appended(entry.rect):
                column.entries.append(entry)
                has_match = True
        if not has_match:
            layer_identifier_sidebars.append(LayerIdentifierSidebar([entry]))

    # only keep columns whose entries are not fully contained in a different column
    layer_identifier_sidebars = [
        column
        for column in layer_identifier_sidebars
        if not any(other.strictly_contains(column) for other in layer_identifier_sidebars)
    ]

    # check if the column rect is a subset of another column rect. If so, merge the entries and sort them by y0.
    # Absorbed columns are only marked here and filtered out afterwards: removing them from the list while it is
    # being iterated would make the loop skip the element that follows the removed one.
    absorbed_ids = set()
    for column in layer_identifier_sidebars:
        if id(column) in absorbed_ids:
            continue
        for other in layer_identifier_sidebars:
            if id(other) in absorbed_ids:
                continue
            if column != other and column.is_contained(other.rect()):
                for entry in other.entries:
                    if entry not in column.entries:
                        column.entries.append(entry)
                column.entries.sort(key=lambda entry: entry.rect.y0)
                absorbed_ids.add(id(other))
                break

    return [
        column
        for column in layer_identifier_sidebars
        if id(column) not in absorbed_ids and len(column.entries) > 2
    ]


def get_depth_interval_from_textblock(block: TextBlock) -> AToBInterval | None:
    """Extract the overall depth interval from a material description block.

    For borehole profiles in the Deriaz layout, the depth interval is usually found in the text description
    of the material. Often, these text descriptions contain a further separation into multiple sub layers.
    These sub layers have their own depth intervals. This function extracts the overall depth interval,
    spanning across all mentioned sub layers.

    Args:
        block (TextBlock): The block to calculate the depth interval for.

    Returns:
        AToBInterval | None: The interval from the smallest start to the largest end over all depth entries found
            in the lines of the block, or None if no line yields a depth entry.
    """
    found_entries = []
    for text_line in block.lines:
        try:
            # require_start_of_string = False because the depth interval may not always start at the beginning
            # of the line e.g. "Remblais Heterogene: 0.00 - 0.5m"
            candidate = AToBDepthColumnEntry.from_text(text_line.text, text_line.rect, require_start_of_string=False)
            if candidate:
                found_entries.append(candidate)
        except ValueError:
            # Lines for which a ValueError is raised are skipped.
            continue

    if not found_entries:
        return None

    # Merge the sub layers into one depth interval.
    overall_start = min((entry.start for entry in found_entries), key=lambda start_entry: start_entry.value)
    overall_end = max((entry.end for entry in found_entries), key=lambda end_entry: end_entry.value)
    return AToBInterval(AToBDepthColumnEntry(overall_start, overall_end))
86 changes: 86 additions & 0 deletions src/stratigraphy/sidebar/layer_identifier_sidebar_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""Module for finding LayerIdentifierSidebar instances in a borehole profile."""

import re

from stratigraphy.lines.line import TextLine
from stratigraphy.sidebar import LayerIdentifierSidebar
from stratigraphy.sidebar.layer_identifier_sidebar import LayerIdentifierEntry


class LayerIdentifierSidebarExtractor:
    """Class that finds LayerIdentifierSidebar instances in a borehole profile."""

    # Pattern for a layer identifier such as "1)", "a)" or "2-3)". Compiled once for the class instead of once per
    # text line.
    _ENTRY_REGEX = re.compile(r"\b[\da-z-]+\)")

    @classmethod
    def find_layer_identifier_sidebar_entries(cls, lines: list[TextLine]) -> list[LayerIdentifierEntry]:
        r"""Find the layer identifier sidebar entries.

        Regex explanation:
        - \b is a word boundary. This ensures that the match must start at the beginning of a word.
        - [\da-z-]+ matches one or more (+) digits (\d), lowercase letters (a-z) or hyphens (-).
        - \) matches a closing parenthesis. The backslash is necessary because parentheses are special characters
          in regular expressions, so we need to escape it to match a literal parenthesis.

        This regular expression will match strings like "1)", "2)", "a)", "b)", "1a4)", "6de)", "2-3)", etc.

        Only the first word of each line is considered, the pattern has to match at the start of that word, and
        the word must be shorter than 7 characters.

        Args:
            lines (list[TextLine]): The lines to search for layer identifier entries.

        Returns:
            list[LayerIdentifierEntry]: The layer identifier sidebar entries, ordered from top to bottom (by the
                y0 coordinate of the line they were found in).
        """
        entries = []
        for line in sorted(lines, key=lambda line: line.rect.y0):
            if not line.words:
                continue
            # Only match in the first word of every line, to avoid e.g. matching with "cm)" in a material
            # description containing an expression like "(diameter max 6 cm)".
            first_word = line.words[0]
            if cls._ENTRY_REGEX.match(first_word.text) and len(first_word.text) < 7:
                entries.append(LayerIdentifierEntry(first_word.rect, first_word.text))
        return entries

    @classmethod
    def from_lines(cls, lines: list[TextLine]) -> list[LayerIdentifierSidebar]:
        """Find layer identifier sidebars from text lines.

        TODO: Similar to AToBSidebarExtractor.find_in_words(). Refactoring may be desired.

        Args:
            lines (list[TextLine]): The text lines in the document

        Returns:
            list[LayerIdentifierSidebar]: The found layer identifier sidebars. Only sidebars with more than two
                entries are returned; the list is empty if no layer identifier entries are found.
        """
        entries = cls.find_layer_identifier_sidebar_entries(lines)
        if not entries:
            return []

        layer_identifier_sidebars = [LayerIdentifierSidebar([entries[0]])]
        for entry in entries[1:]:
            has_match = False
            # An entry is added to every existing column that accepts it, not only to the first one.
            for column in layer_identifier_sidebars:
                if column.can_be_appended(entry.rect):
                    column.entries.append(entry)
                    has_match = True
            if not has_match:
                layer_identifier_sidebars.append(LayerIdentifierSidebar([entry]))

        # only keep columns whose entries are not fully contained in a different column
        layer_identifier_sidebars = [
            column
            for column in layer_identifier_sidebars
            if not any(other.strictly_contains(column) for other in layer_identifier_sidebars)
        ]

        # check if the column rect is a subset of another column rect. If so, merge the entries and sort them by
        # y0. Absorbed columns are only marked here and filtered out afterwards: removing them from the list while
        # it is being iterated would make the loop skip the element that follows the removed one.
        absorbed_ids = set()
        for column in layer_identifier_sidebars:
            if id(column) in absorbed_ids:
                continue
            for other in layer_identifier_sidebars:
                if id(other) in absorbed_ids:
                    continue
                if column != other and column.is_contained(other.rect()):
                    for entry in other.entries:
                        if entry not in column.entries:
                            column.entries.append(entry)
                    column.entries.sort(key=lambda entry: entry.rect.y0)
                    absorbed_ids.add(id(other))
                    break

        return [
            column
            for column in layer_identifier_sidebars
            if id(column) not in absorbed_ids and len(column.entries) > 2
        ]
2 changes: 1 addition & 1 deletion src/stratigraphy/sidebar/sidebar.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
import fitz

from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry
from stratigraphy.layer.layer import IntervalBlockGroup
from stratigraphy.lines.line import TextLine, TextWord
from stratigraphy.sidebar.interval_block_group import IntervalBlockGroup
from stratigraphy.util.dataclasses import Line

EntryT = TypeVar("EntryT", bound=DepthColumnEntry)
Expand Down
35 changes: 35 additions & 0 deletions src/stratigraphy/util/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,3 +177,38 @@ def matching_blocks(
return [TextBlock(matched_lines)]
else:
return []

@classmethod
def get_depth_interval_from_textblock(cls, block: TextBlock) -> AToBInterval | None:
    """Extract the overall depth interval from a material description block.

    For borehole profiles in the Deriaz layout, the depth interval is usually found in the text description
    of the material. Often, these text descriptions contain a further separation into multiple sub layers.
    These sub layers have their own depth intervals. This function extracts the overall depth interval,
    spanning across all mentioned sub layers.

    Args:
        block (TextBlock): The block to calculate the depth interval for.

    Returns:
        AToBInterval | None: The interval from the smallest start to the largest end over all depth entries found
            in the lines of the block, or None if no line yields a depth entry.
    """
    found_entries = []
    for text_line in block.lines:
        try:
            # require_start_of_string = False because the depth interval may not always start at the beginning
            # of the line e.g. "Remblais Heterogene: 0.00 - 0.5m"
            candidate = AToBDepthColumnEntry.from_text(text_line.text, text_line.rect, require_start_of_string=False)
            if candidate:
                found_entries.append(candidate)
        except ValueError:
            # Lines for which a ValueError is raised are skipped.
            continue

    if not found_entries:
        return None

    # Merge the sub layers into one depth interval.
    overall_start = min((entry.start for entry in found_entries), key=lambda start_entry: start_entry.value)
    overall_end = max((entry.end for entry in found_entries), key=lambda end_entry: end_entry.value)
    return AToBInterval(AToBDepthColumnEntry(overall_start, overall_end))

0 comments on commit f331f12

Please sign in to comment.