Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LGVISIUM-102: merge AToBInterval and AToBDepthColumnEntry #106

15 changes: 15 additions & 0 deletions src/stratigraphy/depth/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""Modules for extracting values indicating some measured depth below the surface."""

from .a_to_b_interval_extractor import AToBIntervalExtractor
from .depthcolumnentry import DepthColumnEntry
from .depthcolumnentry_extractor import DepthColumnEntryExtractor
from .interval import AAboveBInterval, AToBInterval, Interval

__all__ = [
"AAboveBInterval",
"AToBInterval",
"AToBIntervalExtractor",
"DepthColumnEntry",
"DepthColumnEntryExtractor",
"Interval",
]
83 changes: 83 additions & 0 deletions src/stratigraphy/depth/a_to_b_interval_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""Contains logic for finding AToBInterval instances in a text."""

import re

import fitz

from stratigraphy.lines.line import TextLine

from .depthcolumnentry import DepthColumnEntry
from .interval import AToBInterval
from .util import value_as_float


class AToBIntervalExtractor:
    """Methods for finding AToBInterval instances (e.g. "0.5m - 1.8m") in a text."""

    @classmethod
    def from_lines(cls, lines: list[TextLine]) -> AToBInterval | None:
        """Extract depth interval from text lines.

        For borehole profiles in the Deriaz layout, the depth interval is usually found in the text of the material
        description. Often, these text descriptions contain a further separation into multiple sub layers.
        These sub layers have their own depth intervals. This function extracts the overall depth interval,
        spanning across all mentioned sub layers.

        Args:
            lines (list[TextLine]): The lines to extract the depth interval from.

        Returns:
            AToBInterval | None: The depth interval (if any) or None (if no depth interval was found).
        """
        depth_entries = []
        for line in lines:
            try:
                # require_start_of_string = False because the depth interval may not always start at the beginning
                # of the line e.g. "Remblais Heterogene: 0.00 - 0.5m"
                a_to_b_depth_entry = cls.from_text(line.text, line.rect, require_start_of_string=False)
            except ValueError:
                # The matched text could not be converted to a float; ignore this line.
                continue
            if a_to_b_depth_entry:
                depth_entries.append(a_to_b_depth_entry)

        if not depth_entries:
            return None

        # Merge the sub layers into one depth interval: smallest start value to largest end value.
        start = min((entry.start for entry in depth_entries), key=lambda start_entry: start_entry.value)
        end = max((entry.end for entry in depth_entries), key=lambda end_entry: end_entry.value)
        return AToBInterval(start, end)

    @classmethod
    def from_text(cls, text: str, rect: fitz.Rect, require_start_of_string: bool = True) -> AToBInterval | None:
        """Attempts to extract an AToBInterval from a string.

        Commas are treated as decimal points. Each of the two numbers may be followed by unit / OCR-noise
        characters, and the two numbers must be separated by at least one whitespace or dash character.

        Args:
            text (str): The string to extract the depth interval from.
            rect (fitz.Rect): The rectangle of the text.
            require_start_of_string (bool, optional): Whether the number to extract needs to be
                                                      at the start of a string. Defaults to True.

        Returns:
            AToBInterval | None: The extracted AToBInterval or None if none is found.

        Raises:
            ValueError: If a matched number cannot be converted to a float.
        """
        input_string = text.strip().replace(",", ".")

        # Note the doubled backslash in the first character class: a single backslash would escape the closing
        # bracket and silently merge the suffix class with the separator class that follows it.
        query = r"-?([0-9]+(\.[0-9]+)?)[müMN\\]*[\s-]+([0-9]+(\.[0-9]+)?)[müMN\\.]*"
        if not require_start_of_string:
            query = r".*?" + query
        match = re.compile(query).match(input_string)
        if not match:
            return None

        # Only the rectangle of the whole text is known, so the start entry is assigned its left half
        # and the end entry its right half.
        value1 = value_as_float(match.group(1))
        first_half_rect = fitz.Rect(rect.x0, rect.y0, rect.x1 - rect.width / 2, rect.y1)

        value2 = value_as_float(match.group(3))
        second_half_rect = fitz.Rect(rect.x0 + rect.width / 2, rect.y0, rect.x1, rect.y1)

        return AToBInterval(
            DepthColumnEntry(first_half_rect, value1),
            DepthColumnEntry(second_half_rect, value2),
        )
35 changes: 35 additions & 0 deletions src/stratigraphy/depth/depthcolumnentry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Contains a dataclass for depth column entries, which indicate the measured depth of an interface between layers."""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any

import fitz


@dataclass
class DepthColumnEntry:  # noqa: D101
    """A depth column entry: a depth value together with the rectangle associated with it."""

    rect: fitz.Rect
    value: float

    def __repr__(self) -> str:
        # Only the numeric value is shown; the rectangle is left out.
        return f"{self.value!s}"

    def to_json(self) -> dict[str, Any]:
        """Convert the depth column entry to a JSON serializable format.

        Returns:
            dict[str, Any]: A dictionary with the keys "value" and "rect", where "rect" is the list
                [x0, y0, x1, y1] of the rectangle's coordinates.
        """
        bounds = self.rect
        coordinates = [bounds.x0, bounds.y0, bounds.x1, bounds.y1]
        return {"value": self.value, "rect": coordinates}

    @classmethod
    def from_json(cls, data: dict) -> DepthColumnEntry:
        """Build a depth column entry from its dictionary representation.

        Args:
            data (dict): A dictionary with the keys "rect" (passed unchanged to fitz.Rect) and "value".

        Returns:
            DepthColumnEntry: The depth column entry object.
        """
        rect = fitz.Rect(data["rect"])
        return cls(rect=rect, value=data["value"])
46 changes: 46 additions & 0 deletions src/stratigraphy/depth/depthcolumnentry_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""Contains logic for finding depth column entries in text."""

import re

from stratigraphy.depth import DepthColumnEntry
from stratigraphy.depth.util import value_as_float
from stratigraphy.lines.line import TextWord

from .a_to_b_interval_extractor import AToBIntervalExtractor


class DepthColumnEntryExtractor:
    """Methods for finding depth column entries in a text."""

    @classmethod
    def find_in_words(cls, all_words: list[TextWord], include_splits: bool) -> list[DepthColumnEntry]:
        """Find all depth column entries given a list of TextWord objects.

        The words are processed from top to bottom (ascending rect.y0). Words for which the number
        conversion raises a ValueError are skipped.

        Note: Only depths up to two digits before the decimal point are supported.

        Args:
            all_words (list[TextWord]): List of text words to extract depth column entries from.
            include_splits (bool): Whether to include split entries, i.e. words such as "1.10-1.60m" that
                contain a whole interval; both the start and the end of the interval are then added.

        Returns:
            list[DepthColumnEntry]: The extracted depth column entries.
        """
        # The pattern is constant, so it is compiled once instead of once per word.
        # A leading '.' is matched outside of the capture group and therefore dropped: the OCR sometimes
        # recognizes a '-' as a '.', so a word such as '.40' is read as '40' and not as 0.40.
        regex = re.compile(r"^-?\.?([0-9]+(\.[0-9]+)?)[müMN\\.]*$")

        entries = []
        for word in sorted(all_words, key=lambda word: word.rect.y0):
            input_string = word.text.strip().replace(",", ".")
            match = regex.match(input_string)
            try:
                if match:
                    value = value_as_float(match.group(1))
                    entries.append(DepthColumnEntry(word.rect, value))
                elif include_splits:
                    # support for e.g. "1.10-1.60m" extracted as a single word
                    a_to_b_interval = AToBIntervalExtractor.from_text(input_string, word.rect)
                    if a_to_b_interval:
                        entries.extend([a_to_b_interval.start, a_to_b_interval.end])
            except ValueError:
                # The matched text could not be converted to a float; ignore this word.
                continue
        return entries
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,11 @@

import fitz

from stratigraphy.depthcolumn.depthcolumnentry import (
AToBDepthColumnEntry,
DepthColumnEntry,
)
from stratigraphy.lines.line import TextLine
from stratigraphy.text.textblock import TextBlock

from .depthcolumnentry import DepthColumnEntry


class Interval(metaclass=abc.ABCMeta):
"""Abstract class for (depth) intervals."""
Expand Down Expand Up @@ -143,9 +141,13 @@ def matching_blocks(
class AToBInterval(Interval):
"""Class for intervals that are defined in a single line like "1.00 - 2.30m"."""

def __init__(self, layer_depth_column_entry: AToBDepthColumnEntry):
self.entry = layer_depth_column_entry
super().__init__(layer_depth_column_entry.start, layer_depth_column_entry.end)
def __init__(self, start: DepthColumnEntry, end: DepthColumnEntry):
super().__init__(start, end)

@property
def rect(self) -> fitz.Rect:
"""Get the rectangle surrounding the interval."""
return fitz.Rect(self.start.rect).include_rect(self.end.rect)

@property
def line_anchor(self) -> fitz.Point | None:
Expand Down Expand Up @@ -177,38 +179,3 @@ def matching_blocks(
return [TextBlock(matched_lines)]
else:
return []

@classmethod
def get_depth_interval_from_lines(cls, lines: list[TextLine]) -> AToBInterval | None:
"""Extract depth interval from text lines.

For borehole profiles in the Deriaz layout, the depth interval is usually found in the text of the material
description. Often, these text descriptions contain a further separation into multiple sub layers.
These sub layers have their own depth intervals. This function extracts the overall depth interval,
spanning across all mentioned sub layers.

Args:
lines (list[TextLine]): The lines to extract the depth interval from.

Returns:
AToBInterval | None: The depth interval (if any) or None (if no depth interval was found).
"""
depth_entries = []
for line in lines:
try:
layer_depth_entry = AToBDepthColumnEntry.from_text(line.text, line.rect, require_start_of_string=False)
# require_start_of_string = False because the depth interval may not always start at the beginning
# of the line e.g. "Remblais Heterogene: 0.00 - 0.5m"
if layer_depth_entry:
depth_entries.append(layer_depth_entry)
except ValueError:
pass

if depth_entries:
# Merge the sub layers into one depth interval.
start = min([entry.start for entry in depth_entries], key=lambda start_entry: start_entry.value)
end = max([entry.end for entry in depth_entries], key=lambda end_entry: end_entry.value)

return AToBInterval(AToBDepthColumnEntry(start, end))
else:
return None
10 changes: 10 additions & 0 deletions src/stratigraphy/depth/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""Contains utility functions for depth column entries."""

import re


def value_as_float(string_value: str) -> float:  # noqa: D103
    """Convert the textual representation of a depth to a non-negative float.

    If the string starts with three or more consecutive digits (optionally preceded by a minus sign),
    a decimal point is inserted before the last two digits of that run, e.g. "150" becomes 1.5.
    The sign is discarded, the absolute value is returned.

    Args:
        string_value (str): The text to convert; a decimal point (not a comma) is expected as separator.

    Returns:
        float: The absolute value of the parsed number.

    Raises:
        ValueError: If the text is not a valid number after the correction, which includes numbers with
            three or more digits before an existing decimal point (e.g. "100.5").
    """
    # OCR sometimes tends to miss the decimal comma
    missing_separator = re.compile(r"^-?([0-9]+)([0-9]{2})")
    corrected_text = missing_separator.sub(r"\1.\2", string_value)
    number = float(corrected_text)
    return abs(number)
Loading
Loading