From 203e11dfc147e2e18cde524e98798d41b5ab417c Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Mon, 11 Nov 2024 14:20:21 +0100 Subject: [PATCH 01/20] LGVISIUM-102: LayerIdentifierColumn extends from DepthColumn --- pyproject.toml | 1 - src/stratigraphy/annotations/draw.py | 6 +- .../boundarydepthcolumnvalidator.py | 14 +- src/stratigraphy/depthcolumn/depthcolumn.py | 238 +++++------------- .../depthcolumn/depthcolumnentry.py | 36 +-- .../depthcolumn/find_depth_columns.py | 40 +-- .../depths_materials_column_pairs.py | 47 ++-- src/stratigraphy/extract.py | 130 +++------- src/stratigraphy/layer/layer.py | 4 +- .../layer/layer_identifier_column.py | 146 ++++++----- src/stratigraphy/text/find_description.py | 61 ----- src/stratigraphy/util/interval.py | 10 - src/stratigraphy/util/predictions.py | 8 +- tests/test_depthcolumn.py | 23 +- tests/test_find_depth_columns.py | 106 ++++---- 15 files changed, 302 insertions(+), 568 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4766964c..fe9026d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,6 @@ dependencies = [ "boto3", "pandas", "levenshtein", - "pathlib", "python-dotenv", "setuptools", "tqdm", diff --git a/src/stratigraphy/annotations/draw.py b/src/stratigraphy/annotations/draw.py index e387d1e5..6045209f 100644 --- a/src/stratigraphy/annotations/draw.py +++ b/src/stratigraphy/annotations/draw.py @@ -9,7 +9,7 @@ from dotenv import load_dotenv from stratigraphy.data_extractor.data_extractor import FeatureOnPage from stratigraphy.depthcolumn.depthcolumn import DepthColumn -from stratigraphy.depths_materials_column_pairs.depths_materials_column_pairs import DepthsMaterialsColumnPairs +from stratigraphy.depths_materials_column_pairs.depths_materials_column_pairs import DepthsMaterialsColumnPair from stratigraphy.groundwater.groundwater_extraction import Groundwater from stratigraphy.layer.layer import Layer from stratigraphy.metadata.coordinate_extraction import Coordinate @@ -98,7 +98,7 @@ 
def draw_predictions( draw_depth_columns_and_material_rect( shape, page.derotation_matrix, - [pair for pair in depths_materials_column_pairs if pair.page == page_number], + [pair for pair in depths_materials_column_pairs if pair.depth_column.page == page_number], ) draw_material_descriptions( shape, @@ -245,7 +245,7 @@ def draw_material_descriptions(shape: fitz.Shape, derotation_matrix: fitz.Matrix def draw_depth_columns_and_material_rect( - shape: fitz.Shape, derotation_matrix: fitz.Matrix, depths_materials_column_pairs: list[DepthsMaterialsColumnPairs] + shape: fitz.Shape, derotation_matrix: fitz.Matrix, depths_materials_column_pairs: list[DepthsMaterialsColumnPair] ): """Draw depth columns as well as the material rects on a pdf page. diff --git a/src/stratigraphy/depthcolumn/boundarydepthcolumnvalidator.py b/src/stratigraphy/depthcolumn/boundarydepthcolumnvalidator.py index 477007fa..f2222486 100644 --- a/src/stratigraphy/depthcolumn/boundarydepthcolumnvalidator.py +++ b/src/stratigraphy/depthcolumn/boundarydepthcolumnvalidator.py @@ -63,7 +63,7 @@ def is_valid(self, column: BoundaryDepthColumn, corr_coef_threshold: float = 0.9 return corr_coef and corr_coef > corr_coef_threshold - def reduce_until_valid(self, column: BoundaryDepthColumn, page_number: int) -> BoundaryDepthColumn: + def reduce_until_valid(self, column: BoundaryDepthColumn) -> BoundaryDepthColumn: """Removes entries from the depth column until it fulfills the is_valid condition. is_valid checks whether there is too much noise (i.e. other text) in the column and whether the entries are @@ -71,19 +71,18 @@ def reduce_until_valid(self, column: BoundaryDepthColumn, page_number: int) -> B Args: column (BoundaryDepthColumn): The depth column to validate - page_number (int): The page number of the depth column Returns: BoundaryDepthColumn: The current depth column with entries removed until it is valid. 
""" while column: if self.is_valid(column): return column - elif self.correct_OCR_mistakes(column, page_number) is not None: - return self.correct_OCR_mistakes(column, page_number) + elif self.correct_OCR_mistakes(column) is not None: + return self.correct_OCR_mistakes(column) else: column = column.remove_entry_by_correlation_gradient() - def correct_OCR_mistakes(self, column: BoundaryDepthColumn, page_number: int) -> BoundaryDepthColumn | None: + def correct_OCR_mistakes(self, column: BoundaryDepthColumn) -> BoundaryDepthColumn | None: """Corrects OCR mistakes in the depth column entries. Loops through all values and corrects common OCR mistakes for the given entry. Then, the column with the @@ -102,15 +101,14 @@ def correct_OCR_mistakes(self, column: BoundaryDepthColumn, page_number: int) -> Args: column (BoundaryDepthColumn): The depth column to validate - page_number (int): The page number of the depth column Returns: BoundaryDepthColumn | None: The corrected depth column, or None if no correction was possible. 
""" - new_columns = [BoundaryDepthColumn()] + new_columns = [BoundaryDepthColumn(entries=[], page=column.page)] for entry in column.entries: new_columns = [ - BoundaryDepthColumn([*column.entries, DepthColumnEntry(entry.rect, new_value, page_number)]) + BoundaryDepthColumn([*column.entries, DepthColumnEntry(entry.rect, new_value)], page=column.page) for column in new_columns for new_value in _value_alternatives(entry.value) ] diff --git a/src/stratigraphy/depthcolumn/depthcolumn.py b/src/stratigraphy/depthcolumn/depthcolumn.py index e8e4acc3..aaaf6ad4 100644 --- a/src/stratigraphy/depthcolumn/depthcolumn.py +++ b/src/stratigraphy/depthcolumn/depthcolumn.py @@ -3,36 +3,31 @@ from __future__ import annotations import abc +from dataclasses import dataclass +from typing import Generic, TypeVar import fitz import numpy as np from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry, LayerDepthColumnEntry from stratigraphy.layer.layer import IntervalBlockGroup -from stratigraphy.layer.layer_identifier_column import LayerIdentifierColumn from stratigraphy.lines.line import TextLine, TextWord from stratigraphy.text.find_description import get_description_blocks from stratigraphy.util.dataclasses import Line -from stratigraphy.util.interval import BoundaryInterval, Interval, LayerInterval +from stratigraphy.util.interval import BoundaryInterval, LayerInterval +EntryT = TypeVar("EntryT", bound=DepthColumnEntry) -class DepthColumn(metaclass=abc.ABCMeta): - """Abstract DepthColumn class.""" - @abc.abstractmethod - def __init__(self): # noqa: D107 - pass +@dataclass +class DepthColumn(abc.ABC, Generic[EntryT]): + """Abstract DepthColumn class.""" - @abc.abstractmethod - def depth_intervals(self) -> list[Interval]: - """Get the depth intervals of the depth column.""" - pass + entries: list[EntryT] + page: int - @abc.abstractmethod def rects(self) -> list[fitz.Rect]: """Get the rectangles of the depth column entries.""" - pass - - """Used for scoring how well a depth 
column corresponds to a material description bbox.""" + return [entry.rect for entry in self.entries] def rect(self) -> fitz.Rect: """Get the bounding box of the depth column entries.""" @@ -52,9 +47,10 @@ def min_x1(self) -> float: """Get the minimum x1 value of the depth column entries.""" return min([rect.x1 for rect in self.rects()]) - @abc.abstractmethod def noise_count(self, all_words: list[TextWord]) -> int: - """Count the number of words that intersect with the depth column entries. + """Counts the number of words that intersect with the depth column entries. + + Returns the number of words that intersect with the depth column entries, but are not part of the depth column. Args: all_words (list[TextWord]): A list of all text lines on the page. @@ -62,11 +58,20 @@ def noise_count(self, all_words: list[TextWord]) -> int: Returns: int: The number of words that intersect with the depth column entries but are not part of it. """ - pass + + def significant_intersection(other_rect): + intersection = fitz.Rect(other_rect).intersect(self.rect()) + return intersection.is_valid and intersection.width > 0.25 * self.rect().width + + return len([word for word in all_words if significant_intersection(word.rect)]) - len(self.entries) @abc.abstractmethod def identify_groups( - self, description_lines: list[TextLine], geometric_lines: list[Line], material_description_rect: fitz.Rect + self, + description_lines: list[TextLine], + geometric_lines: list[Line], + material_description_rect: fitz.Rect, + **params, ) -> list[IntervalBlockGroup]: """Identifies groups of description blocks that correspond to depth intervals. @@ -74,6 +79,7 @@ def identify_groups( description_lines (list[TextLine]): A list of text lines that are part of the description. geometric_lines (list[Line]): A list of geometric lines that are part of the description. material_description_rect (fitz.Rect): The bounding box of the material description. + params (dict): A dictionary of relevant parameters. 
Returns: list[IntervalBlockGroup]: A list of groups, where each group is a IntervalBlockGroup. @@ -85,38 +91,33 @@ def to_json(self): """Converts the object to a dictionary.""" pass - @classmethod - @abc.abstractmethod - def from_json(cls, json_depth_column: dict) -> DepthColumn: - """Converts a dictionary to an object.""" - pass - + def can_be_appended(self, rect: fitz.Rect) -> bool: + """Checks if a new depth column entry can be appended to the current depth column. -class DepthColumnFactory: - """Factory class for creating DepthColumn objects.""" + Check if the middle of the new rect is between the outer horizontal boundaries of the column, and if there is + an intersection with the minimal horizontal boundaries of the column. - @staticmethod - def create(data: dict) -> DepthColumn: - """Creates a DepthColumn object from a dictionary. + The checks are: + - The width of the new rectangle is greater than the width of the current depth column. Or; + - The middle of the new rectangle is within the horizontal boundaries of the current depth column. + - The new rectangle intersects with the minimal horizontal boundaries of the current depth column. Args: - data (dict): A dictionary representing the depth column. + rect (fitz.Rect): Rect of the depth column entry to be appended. Returns: - DepthColumn: The depth column object. + bool: True if the new depth column entry can be appended, False otherwise. 
""" - column_type = data.get("type") - if column_type == "BoundaryDepthColumn": - return BoundaryDepthColumn.from_json(data) - elif column_type == "LayerDepthColumn": - return LayerDepthColumn.from_json(data) - elif column_type == "LayerIdentifierColumn": - return LayerIdentifierColumn.from_json(data) - else: - raise ValueError(f"Unknown depth column type: {column_type}") + new_middle = (rect.x0 + rect.x1) / 2 + if (self.rect().width < rect.width or self.rect().x0 < new_middle < self.rect().x1) and ( + rect.x0 <= self.min_x1 and self.max_x0 <= rect.x1 + ): + return True + return False -class LayerDepthColumn(DepthColumn): +@dataclass +class LayerDepthColumn(DepthColumn[LayerDepthColumnEntry]): """Represents a depth column where the upper and lower depths of each layer are explicitly specified. Example:: @@ -128,14 +129,6 @@ class LayerDepthColumn(DepthColumn): entries: list[LayerDepthColumnEntry] - def __init__(self, entries=None): - super().__init__() - - if entries is not None: - self.entries = entries - else: - self.entries = [] - def __repr__(self): """Converts the object to a string. @@ -157,42 +150,9 @@ def to_json(self) -> dict: "type": "LayerDepthColumn", } - @classmethod - def from_json(cls, json_depth_column: dict) -> LayerDepthColumn: - """Converts a dictionary to an object. - - Args: - json_depth_column (dict): A dictionary representing the depth column. - - Returns: - LayerDepthColumn: The depth column object. - """ - entries_data = json_depth_column.get("entries", []) - entries = [LayerDepthColumnEntry.from_json(entry) for entry in entries_data] - return LayerDepthColumn(entries) - - def add_entry(self, entry: LayerDepthColumnEntry) -> LayerDepthColumn: - """Adds a depth column entry to the depth column. - - Args: - entry (LayerDepthColumnEntry): The depth column entry to add. - - Returns: - LayerDepthColumn: The depth column with the new entry. 
- """ - self.entries.append(entry) - return self - - def depth_intervals(self) -> list[Interval]: + def depth_intervals(self) -> list[LayerInterval]: return [LayerInterval(entry) for entry in self.entries] - def rects(self) -> list[fitz.Rect]: - return [entry.rect for entry in self.entries] - - def noise_count(self, all_words: list[TextWord]) -> int: - # currently, we don't count noise for layer columns - return 0 - def break_on_mismatch(self) -> list[LayerDepthColumn]: """Breaks the depth column into segments where the depth intervals are not in an arithmetic progression. @@ -211,7 +171,7 @@ def break_on_mismatch(self) -> list[LayerDepthColumn]: if final_segment: segments.append(final_segment) - return [LayerDepthColumn(segment) for segment in segments] + return [LayerDepthColumn(segment, page=self.page) for segment in segments] def is_valid(self) -> bool: """Checks if the depth column is valid. @@ -245,7 +205,7 @@ def identify_groups( description_lines (list[TextLine]): A list of text lines that are part of the description. geometric_lines (list[Line]): A list of geometric lines that are part of the description. material_description_rect (fitz.Rect): The bounding box of the material description. - params (dict): A dictionary of parameters used for line detection. + params (dict): A dictionary of relevant parameters. Returns: list[IntervalBlockGroup]: A list of groups, where each group is a IntervalBlockGroup. @@ -268,7 +228,8 @@ def identify_groups( return groups -class BoundaryDepthColumn(DepthColumn): +@dataclass +class BoundaryDepthColumn(DepthColumn[DepthColumnEntry]): """Represents a depth column. The depths of the boundaries between layers are labels, at a vertical position on @@ -286,22 +247,6 @@ class BoundaryDepthColumn(DepthColumn): entries: list[DepthColumnEntry] - def __init__(self, entries: list = None): - """Initializes a BoundaryDepthColumn object. - - Args: - entries (list, optional): Depth Column Entries for the depth column. Defaults to None. 
- """ - super().__init__() - - if entries is not None: - self.entries = entries - else: - self.entries = [] - - def rects(self) -> list[fitz.Rect]: - return [entry.rect for entry in self.entries] - def __repr__(self): return "DepthColumn({})".format(", ".join([str(entry) for entry in self.entries])) @@ -318,65 +263,12 @@ def to_json(self) -> dict: "type": "BoundaryDepthColumn", } - @classmethod - def from_json(cls, json_depth_column: dict) -> BoundaryDepthColumn: - """Converts a dictionary to an object. - - Args: - json_depth_column (dict): A dictionary representing the depth column. - - Returns: - BoundaryDepthColumn: The depth column object. - """ - entries_data = json_depth_column.get("entries", []) - entries = [DepthColumnEntry.from_json(entry) for entry in entries_data] - return BoundaryDepthColumn(entries) - - def add_entry(self, entry: DepthColumnEntry) -> BoundaryDepthColumn: - """Adds a depth column entry to the depth column. - - Args: - entry (DepthColumnEntry): The depth column entry to add. - - Returns: - BoundaryDepthColumn: The depth column with the new entry. - """ - self.entries.append(entry) - return self - - """ - Check if the middle of the new rect is between the outer horizontal boundaries of the column, and if there is an - intersection with the minimal horizontal boundaries of the column. - """ - - def can_be_appended(self, rect: fitz.Rect) -> bool: - """Checks if a new depth column entry can be appended to the current depth column. - - The checks are: - - The width of the new rectangle is greater than the width of the current depth column. Or; - - The middle of the new rectangle is within the horizontal boundaries of the current depth column. - - The new rectangle intersects with the minimal horizontal boundaries of the current depth column. - - - Args: - rect (fitz.Rect): Rect of the depth column entry to be appended. - - Returns: - bool: True if the new depth column entry can be appended, False otherwise. 
- """ - new_middle = (rect.x0 + rect.x1) / 2 - if (self.rect().width < rect.width or self.rect().x0 < new_middle < self.rect().x1) and ( - rect.x0 <= self.min_x1 and self.max_x0 <= rect.x1 - ): - return True - return False - def valid_initial_segment(self, rect: fitz.Rect) -> BoundaryDepthColumn: for i in range(len(self.entries) - 1): - initial_segment = BoundaryDepthColumn(self.entries[: -i - 1]) + initial_segment = BoundaryDepthColumn(self.entries[: -i - 1], page=self.page) if initial_segment.can_be_appended(rect): return initial_segment - return BoundaryDepthColumn() + return BoundaryDepthColumn(entries=[], page=self.page) def strictly_contains(self, other: BoundaryDepthColumn) -> bool: return len(other.entries) < len(self.entries) and all( @@ -412,7 +304,9 @@ def significant_arithmetic_progression(self) -> bool: return self.is_arithmetic_progression() else: for i in range(len(self.entries) - segment_length + 1): - if BoundaryDepthColumn(self.entries[i : i + segment_length]).is_arithmetic_progression(): + if BoundaryDepthColumn( + self.entries[i : i + segment_length], page=self.page + ).is_arithmetic_progression(): return True return False @@ -430,24 +324,6 @@ def is_arithmetic_progression(self) -> bool: scale_pearson_correlation_coef = np.corrcoef(entries, progression)[0, 1].item() return abs(scale_pearson_correlation_coef) >= 0.9999 - def noise_count(self, all_words: list[TextWord]) -> int: - """Counts the number of words that intersect with the depth column entries. - - Returns the number of words that intersect with the depth column entries, but are not part of the depth column. - - Args: - all_words (list[TextWord]): A list of all text lines on the page. - - Returns: - int: The number of words that intersect with the depth column entries but are not part of it. 
- """ - - def significant_intersection(other_rect): - intersection = fitz.Rect(other_rect).intersect(self.rect()) - return intersection.is_valid and intersection.width > 0.25 * self.rect().width - - return len([word for word in all_words if significant_intersection(word.rect)]) - len(self.entries) - def pearson_correlation_coef(self) -> float: # We look at the lower y coordinate, because most often the baseline of the depth value text is aligned with # the line of the corresponding layer boundary. @@ -465,7 +341,9 @@ def remove_entry_by_correlation_gradient(self) -> BoundaryDepthColumn | None: return None new_columns = [ - BoundaryDepthColumn([entry for index, entry in enumerate(self.entries) if index != remove_index]) + BoundaryDepthColumn( + [entry for index, entry in enumerate(self.entries) if index != remove_index], page=self.page + ) for remove_index in range(len(self.entries)) ] return max(new_columns, key=lambda column: column.pearson_correlation_coef()) @@ -490,7 +368,7 @@ def break_on_double_descending(self) -> list[BoundaryDepthColumn]: if final_segment: segments.append(final_segment) - return [BoundaryDepthColumn(segment) for segment in segments] + return [BoundaryDepthColumn(segment, page=self.page) for segment in segments] def identify_groups( self, @@ -508,7 +386,7 @@ def identify_groups( description_lines (list[TextLine]): A list of text lines that are part of the description. geometric_lines (list[Line]): A list of geometric lines that are part of the description. material_description_rect (fitz.Rect): The bounding box of the material description. - params (dict): A dictionary of parameters used for line detection. + params (dict): A dictionary of relevant parameters. Returns: list[IntervalBlockGroup]: A list of groups, where each group is a IntervalBlockGroup. 
diff --git a/src/stratigraphy/depthcolumn/depthcolumnentry.py b/src/stratigraphy/depthcolumn/depthcolumnentry.py index 0a9faac4..529fd1fe 100644 --- a/src/stratigraphy/depthcolumn/depthcolumnentry.py +++ b/src/stratigraphy/depthcolumn/depthcolumnentry.py @@ -8,21 +8,16 @@ class DepthColumnEntry: # noqa: D101 """Class to represent a depth column entry.""" - def __init__(self, rect: fitz.Rect, value: float, page_number: int): + def __init__(self, rect: fitz.Rect, value: float): self.rect = rect self.value = value - self.page_number = page_number def __repr__(self) -> str: return str(self.value) def to_json(self) -> dict[str, Any]: """Convert the depth column entry to a JSON serializable format.""" - return { - "value": self.value, - "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1], - "page": self.page_number, - } + return {"value": self.value, "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1]} @classmethod def from_json(cls, json_depth_column_entry: dict) -> "DepthColumnEntry": @@ -34,29 +29,7 @@ def from_json(cls, json_depth_column_entry: dict) -> "DepthColumnEntry": Returns: DepthColumnEntry: The depth column entry object. """ - return cls( - rect=fitz.Rect(json_depth_column_entry["rect"]), - value=json_depth_column_entry["value"], - page_number=json_depth_column_entry["page"], - ) - - -class AnnotatedDepthColumnEntry(DepthColumnEntry): # noqa: D101 - """Class to represent a depth column entry obtained from LabelStudio. - - The annotation process in label studio does not come with rectangles for depth column entries. - Therefore, we set them to None. 
- """ - - def __init__(self, value): - super().__init__(None, value, None) - - def to_json(self) -> dict[str, Any]: - return { - "value": self.value, - "rect": self.rect, - "page": self.page_number, - } + return cls(rect=fitz.Rect(json_depth_column_entry["rect"]), value=json_depth_column_entry["value"]) class LayerDepthColumnEntry: # noqa: D101 @@ -66,8 +39,6 @@ def __init__(self, start: DepthColumnEntry, end: DepthColumnEntry): self.start = start self.end = end - assert start.page_number == end.page_number, "Start and end entries are on different pages." - def __repr__(self) -> str: return f"{self.start.value}-{self.end.value}" @@ -82,7 +53,6 @@ def to_json(self) -> dict[str, Any]: "start": self.start.to_json(), "end": self.end.to_json(), "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1], - "page": self.start.page_number, } @classmethod diff --git a/src/stratigraphy/depthcolumn/find_depth_columns.py b/src/stratigraphy/depthcolumn/find_depth_columns.py index d7aa8321..ccadae35 100644 --- a/src/stratigraphy/depthcolumn/find_depth_columns.py +++ b/src/stratigraphy/depthcolumn/find_depth_columns.py @@ -8,6 +8,7 @@ from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry, LayerDepthColumnEntry from stratigraphy.lines.line import TextWord from stratigraphy.text.textblock import TextBlock +from stratigraphy.util.interval import LayerInterval def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> list[DepthColumnEntry]: @@ -32,10 +33,10 @@ def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> lis match = regex.match(input_string) if match: value = value_as_float(match.group(1)) - entries.append(DepthColumnEntry(word.rect, value, word.page_number)) + entries.append(DepthColumnEntry(word.rect, value)) elif include_splits: # support for e.g. 
"1.10-1.60m" extracted as a single word - layer_depth_column_entry = extract_layer_depth_interval(input_string, word.rect, word.page_number) + layer_depth_column_entry = extract_layer_depth_interval(input_string, word.rect) entries.extend( [layer_depth_column_entry.start, layer_depth_column_entry.end] if layer_depth_column_entry else [] ) @@ -52,14 +53,13 @@ def value_as_float(string_value: str) -> float: # noqa: D103 def extract_layer_depth_interval( - text: str, rect: fitz.Rect, page_number: int, require_start_of_string: bool = True + text: str, rect: fitz.Rect, require_start_of_string: bool = True ) -> LayerDepthColumnEntry | None: """Extracts a LayerDepthColumnEntry from a string. Args: text (str): The string to extract the depth interval from. rect (fitz.Rect): The rectangle of the text. - page_number (int): The page number of the text. require_start_of_string (bool, optional): Whether the number to extract needs to be at the start of a string. Defaults to True. @@ -80,13 +80,15 @@ def extract_layer_depth_interval( value2 = value_as_float(match.group(3)) second_half_rect = fitz.Rect(rect.x0 + rect.width / 2, rect.y0, rect.x1, rect.y1) return LayerDepthColumnEntry( - DepthColumnEntry(first_half_rect, value1, page_number), - DepthColumnEntry(second_half_rect, value2, page_number), + DepthColumnEntry(first_half_rect, value1), + DepthColumnEntry(second_half_rect, value2), ) return None -def find_layer_depth_columns(entries: list[DepthColumnEntry], all_words: list[TextWord]) -> list[LayerDepthColumn]: +def find_layer_depth_columns( + entries: list[DepthColumnEntry], all_words: list[TextWord], page_number: int +) -> list[LayerDepthColumn]: """Finds all layer depth columns. Generates a list of LayerDepthColumnEntry objects by finding consecutive pairs of DepthColumnEntry objects. @@ -99,6 +101,7 @@ def find_layer_depth_columns(entries: list[DepthColumnEntry], all_words: list[Te Args: entries (list[DepthColumnEntry]): List of depth column entries. 
all_words (list[TextWord]): List of all TextWord objects. + page_number (int): The number of the page. Returns: list[LayerDepthColumn]: List of all layer depth columns identified. @@ -139,10 +142,10 @@ def find_pair(entry: DepthColumnEntry) -> DepthColumnEntry | None: # noqa: D103 new_start_middle = (entry.start.rect.x0 + entry.start.rect.x1) / 2 if column_rect.x0 < new_start_middle < column_rect.x1: is_matched = True - column.add_entry(entry) + column.entries.append(entry) if not is_matched: - columns.append(LayerDepthColumn([entry])) + columns.append(LayerDepthColumn([entry], page=page_number)) return [ column_segment @@ -173,16 +176,17 @@ def find_depth_columns( for column in numeric_columns: if column.can_be_appended(entry.rect): has_match = True - column.add_entry(entry) + column.entries.append(entry) else: valid_initial_segment = column.valid_initial_segment(entry.rect) if len(valid_initial_segment.entries) > 0: has_match = True - additional_columns.append(valid_initial_segment.add_entry(entry)) + valid_initial_segment.entries.append(entry) + additional_columns.append(valid_initial_segment) numeric_columns.extend(additional_columns) if not has_match: - numeric_columns.append(BoundaryDepthColumn(entries=[entry])) + numeric_columns.append(BoundaryDepthColumn(entries=[entry], page=page_number)) # only keep columns that are not contained in a different column numeric_columns = [ @@ -194,7 +198,7 @@ def find_depth_columns( boundary_depth_column_validator = BoundaryDepthColumnValidator(all_words, **depth_column_params) numeric_columns = [ - boundary_depth_column_validator.reduce_until_valid(column, page_number) + boundary_depth_column_validator.reduce_until_valid(column) for numeric_column in numeric_columns for column in numeric_column.break_on_double_descending() # when we have a perfect arithmetic progression, this is usually just a scale @@ -208,7 +212,7 @@ def find_depth_columns( ) -def get_depth_interval_from_textblock(block: TextBlock) -> 
LayerDepthColumnEntry | None: +def get_depth_interval_from_textblock(block: TextBlock) -> LayerInterval | None: """Extract depth interval from a material description block. For borehole profiles in the Deriaz layout, the depth interval is usually found in the text description @@ -220,14 +224,12 @@ def get_depth_interval_from_textblock(block: TextBlock) -> LayerDepthColumnEntry block (TextBlock): The block to calculate the depth interval for. Returns: - LayerDepthColumnEntry | None: The depth interval. + LayerInterval | None: The depth interval. """ depth_entries = [] for line in block.lines: try: - layer_depth_entry = extract_layer_depth_interval( - line.text, line.rect, line.page_number, require_start_of_string=False - ) + layer_depth_entry = extract_layer_depth_interval(line.text, line.rect, require_start_of_string=False) # require_start_of_string = False because the depth interval may not always start at the beginning # of the line e.g. "Remblais Heterogene: 0.00 - 0.5m" if layer_depth_entry: @@ -240,6 +242,6 @@ def get_depth_interval_from_textblock(block: TextBlock) -> LayerDepthColumnEntry start = min([entry.start for entry in depth_entries], key=lambda start_entry: start_entry.value) end = max([entry.end for entry in depth_entries], key=lambda end_entry: end_entry.value) - return LayerDepthColumnEntry(start, end) + return LayerInterval(LayerDepthColumnEntry(start, end)) else: return None diff --git a/src/stratigraphy/depths_materials_column_pairs/depths_materials_column_pairs.py b/src/stratigraphy/depths_materials_column_pairs/depths_materials_column_pairs.py index 4afbbef1..35f8f575 100644 --- a/src/stratigraphy/depths_materials_column_pairs/depths_materials_column_pairs.py +++ b/src/stratigraphy/depths_materials_column_pairs/depths_materials_column_pairs.py @@ -1,29 +1,19 @@ """Definition of the DepthsMaterialsColumnPairs class.""" +import math from dataclasses import dataclass import fitz -from stratigraphy.depthcolumn.depthcolumn import DepthColumn, 
DepthColumnFactory +from stratigraphy.depthcolumn.depthcolumn import DepthColumn +from stratigraphy.lines.line import TextWord @dataclass -class DepthsMaterialsColumnPairs: +class DepthsMaterialsColumnPair: """A class to represent pairs of depth columns and material descriptions.""" depth_column: DepthColumn | None material_description_rect: fitz.Rect - page: int - - def __str__(self) -> str: - """Converts the object to a string. - - Returns: - str: The object as a string. - """ - return ( - f"DepthsMaterialsColumnPairs(depth_column={self.depth_column}," - f"material_description_rect={self.material_description_rect}, page={self.page})" - ) def to_json(self) -> dict: """Converts the object to a dictionary. @@ -39,22 +29,29 @@ def to_json(self) -> dict: self.material_description_rect.x1, self.material_description_rect.y1, ], - "page": self.page, } - @classmethod - def from_json(cls, json_depths_materials_column_pairs: dict) -> "DepthsMaterialsColumnPairs": - """Converts a dictionary to an object. + def score_column_match(self, all_words: list[TextWord] | None = None) -> float: + """Scores the match between a depth column and a material description. Args: - json_depths_materials_column_pairs (dict): A dictionary representing the depths materials column pairs. + all_words (list[TextWord] | None, optional): List of the available text words. Defaults to None. Returns: - DepthsMaterialsColumnPairs: The depths materials column pairs object. + float: The score of the match. 
""" - depth_column_entry = json_depths_materials_column_pairs["depth_column"] - depth_column = DepthColumnFactory.create(depth_column_entry) if depth_column_entry else None - material_description_rect = fitz.Rect(json_depths_materials_column_pairs["material_description_rect"]) - page = json_depths_materials_column_pairs["page"] + rect = self.depth_column.rect() + top = rect.y0 + bottom = rect.y1 + right = rect.x1 + distance = ( + abs(top - self.material_description_rect.y0) + + abs(bottom - self.material_description_rect.y1) + + abs(right - self.material_description_rect.x0) + ) + + height = bottom - top + + noise_count = self.depth_column.noise_count(all_words) if all_words else 0 - return cls(depth_column, material_description_rect, page) + return (height - distance) * math.pow(0.8, noise_count) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index cdc7dcd8..a18b94e2 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -1,7 +1,6 @@ """Contains the main extraction pipeline for stratigraphy.""" import logging -import math from dataclasses import dataclass import fitz @@ -9,17 +8,15 @@ from stratigraphy.data_extractor.data_extractor import FeatureOnPage from stratigraphy.depthcolumn import find_depth_columns from stratigraphy.depthcolumn.depthcolumn import DepthColumn -from stratigraphy.depths_materials_column_pairs.depths_materials_column_pairs import DepthsMaterialsColumnPairs +from stratigraphy.depths_materials_column_pairs.depths_materials_column_pairs import DepthsMaterialsColumnPair from stratigraphy.layer.layer import IntervalBlockPair, Layer from stratigraphy.layer.layer_identifier_column import ( - LayerIdentifierColumn, find_layer_identifier_column, find_layer_identifier_column_entries, ) -from stratigraphy.lines.line import TextLine, TextWord +from stratigraphy.lines.line import TextLine from stratigraphy.text.find_description import ( get_description_blocks, - get_description_blocks_from_layer_identifier, 
get_description_lines, ) from stratigraphy.text.textblock import MaterialDescription, MaterialDescriptionLine, TextBlock, block_distance @@ -38,7 +35,7 @@ class ProcessPageResult: """The result of processing a single page of a pdf.""" predictions: list[Layer] - depth_material_pairs: list[DepthsMaterialsColumnPairs] + depth_material_pairs: list[DepthsMaterialsColumnPair] def process_page( @@ -64,28 +61,29 @@ def process_page( # Detect Layer Index Columns layer_identifier_entries = find_layer_identifier_column_entries(lines) layer_identifier_columns = ( - find_layer_identifier_column(layer_identifier_entries) if layer_identifier_entries else [] + find_layer_identifier_column(layer_identifier_entries, page_number) if layer_identifier_entries else [] ) - pairs = [] + depths_materials_column_pairs = [] if layer_identifier_columns: for layer_identifier_column in layer_identifier_columns: material_description_rect = find_material_description_column( lines, layer_identifier_column, language, **params["material_description"] ) if material_description_rect: - pairs.append((layer_identifier_column, material_description_rect)) + depths_materials_column_pairs.append( + DepthsMaterialsColumnPair(layer_identifier_column, material_description_rect) + ) # Obtain the best pair. In contrast to depth columns, there only ever is one layer index column per page. - if pairs: - pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1])) - - words = [word for line in lines for word in line.words] + if depths_materials_column_pairs: + depths_materials_column_pairs.sort(key=lambda pair: pair.score_column_match()) # If there is a layer identifier column, then we use this directly. # Else, we search for depth columns. We could also think of some scoring mechanism to decide which one to use. 
- if not pairs: + if not depths_materials_column_pairs: + words = [word for line in lines for word in line.words] depth_column_entries = find_depth_columns.depth_column_entries(words, include_splits=True) - layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words) + layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words, page_number) used_entry_rects = [] for column in layer_depth_columns: @@ -109,32 +107,33 @@ def process_page( lines, depth_column, language, **params["material_description"] ) if material_description_rect: - pairs.append((depth_column, material_description_rect)) + depths_materials_column_pairs.append( + DepthsMaterialsColumnPair(depth_column, material_description_rect) + ) # lowest score first - pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1], words)) + depths_materials_column_pairs.sort(key=lambda pair: pair.score_column_match(words)) to_delete = [] - for i, (_depth_column, material_description_rect) in enumerate(pairs): - if any(material_description_rect.intersects(other_rect) for _, other_rect in pairs[i + 1 :]): + for i, pair in enumerate(depths_materials_column_pairs): + if any( + pair.material_description_rect.intersects(other_pair.material_description_rect) + for other_pair in depths_materials_column_pairs[i + 1 :] + ): to_delete.append(i) - filtered_pairs = [item for index, item in enumerate(pairs) if index not in to_delete] + filtered_depth_material_column_pairs = [ + item for index, item in enumerate(depths_materials_column_pairs) if index not in to_delete + ] pairs: list[IntervalBlockPair] = [] # list of matched depth intervals and text blocks # groups is of the form: [{"depth_interval": BoundaryInterval, "block": TextBlock}] - if filtered_pairs: # match depth column items with material description - for depth_column, material_description_rect in filtered_pairs: - description_lines = get_description_lines(lines, material_description_rect) + if 
filtered_depth_material_column_pairs: # match depth column items with material description + for pair in filtered_depth_material_column_pairs: + description_lines = get_description_lines(lines, pair.material_description_rect) if len(description_lines) > 1: new_pairs = match_columns( - depth_column, description_lines, geometric_lines, material_description_rect, **params + pair.depth_column, description_lines, geometric_lines, pair.material_description_rect, **params ) pairs.extend(new_pairs) - filtered_depth_material_column_pairs = [ - DepthsMaterialsColumnPairs( - depth_column=depth_column, material_description_rect=material_description_rect, page=page_number - ) - for depth_column, material_description_rect in filtered_pairs - ] else: filtered_depth_material_column_pairs = [] # Fallback when no depth column was found @@ -152,11 +151,7 @@ def process_page( ) pairs.extend([IntervalBlockPair(block=block, depth_interval=None) for block in description_blocks]) filtered_depth_material_column_pairs.extend( - [ - DepthsMaterialsColumnPairs( - depth_column=None, material_description_rect=material_description_rect, page=page_number - ) - ] + [DepthsMaterialsColumnPair(depth_column=None, material_description_rect=material_description_rect)] ) layer_predictions = [ @@ -186,38 +181,8 @@ def process_page( return ProcessPageResult(layer_predictions, filtered_depth_material_column_pairs) -def score_column_match( - depth_column: DepthColumn, material_description_rect: fitz.Rect, all_words: list[TextWord] | None = None -) -> float: - """Scores the match between a depth column and a material description. - - Args: - depth_column (DepthColumn): The depth column. - material_description_rect (fitz.Rect): The material description rectangle. - all_words (list[TextWord] | None, optional): List of the available text words. Defaults to None. - - Returns: - float: The score of the match. 
- """ - rect = depth_column.rect() - top = rect.y0 - bottom = rect.y1 - right = rect.x1 - distance = ( - abs(top - material_description_rect.y0) - + abs(bottom - material_description_rect.y1) - + abs(right - material_description_rect.x0) - ) - - height = bottom - top - - noise_count = depth_column.noise_count(all_words) if all_words else 0 - - return (height - distance) * math.pow(0.8, noise_count) - - def match_columns( - depth_column: DepthColumn | LayerIdentifierColumn, + depth_column: DepthColumn, description_lines: list[TextLine], geometric_lines: list[Line], material_description_rect: fitz.Rect, @@ -230,7 +195,7 @@ def match_columns( as well as their depth intervals where present. Args: - depth_column (DepthColumn | LayerIdentifierColumn): The depth column. + depth_column (DepthColumn): The depth column. description_lines (list[TextLine]): The description lines. geometric_lines (list[Line]): The geometric lines. material_description_rect (fitz.Rect): The material description rectangle. @@ -239,28 +204,13 @@ def match_columns( Returns: list[IntervalBlockPair]: The matched depth intervals and text blocks. """ - if isinstance(depth_column, DepthColumn): - return [ - element - for group in depth_column.identify_groups( - description_lines, geometric_lines, material_description_rect, **params - ) - for element in transform_groups(group.depth_intervals, group.blocks, **params) - ] - elif isinstance(depth_column, LayerIdentifierColumn): - blocks = get_description_blocks_from_layer_identifier(depth_column.entries, description_lines) - pairs: list[IntervalBlockPair] = [] - for block in blocks: - depth_interval = find_depth_columns.get_depth_interval_from_textblock(block) - if depth_interval: - pairs.append(IntervalBlockPair(depth_interval=depth_interval, block=block)) - else: - pairs.append(IntervalBlockPair(depth_interval=None, block=block)) - return pairs - else: - raise ValueError( - f"depth_column must be a DepthColumn or a LayerIdentifierColumn object. 
Got {type(depth_column)}." + return [ + element + for group in depth_column.identify_groups( + description_lines, geometric_lines, material_description_rect, **params ) + for element in transform_groups(group.depth_intervals, group.blocks, **params) + ] def transform_groups( @@ -493,6 +443,8 @@ def is_below(best_x0, best_y1, line): if len(candidate_rects) == 0: return None if depth_column: - return max(candidate_rects, key=lambda rect: score_column_match(depth_column, rect)) + return max( + candidate_rects, key=lambda rect: DepthsMaterialsColumnPair(depth_column, rect).score_column_match() + ) else: return candidate_rects[0] diff --git a/src/stratigraphy/layer/layer.py b/src/stratigraphy/layer/layer.py index 853ae8f1..0d9001f9 100644 --- a/src/stratigraphy/layer/layer.py +++ b/src/stratigraphy/layer/layer.py @@ -7,7 +7,7 @@ from stratigraphy.data_extractor.data_extractor import ExtractedFeature, FeatureOnPage from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry from stratigraphy.text.textblock import MaterialDescription, TextBlock -from stratigraphy.util.interval import AnnotatedInterval, BoundaryInterval, Interval +from stratigraphy.util.interval import BoundaryInterval, Interval from stratigraphy.util.util import parse_text @@ -16,7 +16,7 @@ class Layer(ExtractedFeature): """A class to represent predictions for a single layer.""" material_description: FeatureOnPage[MaterialDescription] - depth_interval: BoundaryInterval | AnnotatedInterval | None + depth_interval: BoundaryInterval | None id: uuid.UUID = field(default_factory=uuid.uuid4) def __str__(self) -> str: diff --git a/src/stratigraphy/layer/layer_identifier_column.py b/src/stratigraphy/layer/layer_identifier_column.py index 518d2aaf..f8a07bd1 100644 --- a/src/stratigraphy/layer/layer_identifier_column.py +++ b/src/stratigraphy/layer/layer_identifier_column.py @@ -1,9 +1,15 @@ """Module for the LayerIdentifierColumn class.""" import re +from dataclasses import dataclass import fitz +from 
stratigraphy.depthcolumn.depthcolumn import DepthColumn +from stratigraphy.depthcolumn.find_depth_columns import get_depth_interval_from_textblock +from stratigraphy.layer.layer import IntervalBlockGroup from stratigraphy.lines.line import TextLine +from stratigraphy.text.textblock import TextBlock +from stratigraphy.util.dataclasses import Line class LayerIdentifierEntry: @@ -31,84 +37,87 @@ def to_json(self): } -class LayerIdentifierColumn: - """Class for a layer identifier column.""" +@dataclass +class LayerIdentifierColumn(DepthColumn[LayerIdentifierEntry]): + """Class for a layer identifier column. - def __init__(self, entries: list[LayerIdentifierEntry]): - """Initialize the LayerIdentifierColumn object. - - Args: - entries (list[LayerIdentifierEntry]): The entries corresponding to the layer indices. - """ - self.entries: list[LayerIdentifierEntry] = entries - - @property - def max_x0(self) -> float: - """Get the maximum x0 value of the layer identifier column entries. - - Returns: - float: The maximum x0 value of the layer identifier column entries. - """ - return max([rect.x0 for rect in self.rects()]) + Layer identifiers are labels that are particularly common in Deriaz layout borehole profiles. They can be + sequential such as in 1007.pdf - a), b), c), etc. - or contain some semantic meaning such as in 10781.pdf - + 5c12), 4a), etc. + """ - @property - def min_x1(self) -> float: - """Get the minimum x1 value of the layer identifier column entries. + entries: list[LayerIdentifierEntry] - Returns: - float: The minimum x1 value of the layer identifier column entries. - """ - return min([rect.x1 for rect in self.rects()]) + def identify_groups( + self, + description_lines: list[TextLine], + geometric_lines: list[Line], + material_description_rect: fitz.Rect, + **params, + ) -> list[IntervalBlockGroup]: + """Divide the description lines into blocks based on the layer identifier entries. 
- def rect(self) -> fitz.Rect: - """Get the rectangle of the layer identifier column. + Args: + description_lines (list[TextLine]): A list of text lines that are part of the description. + geometric_lines (list[Line]): A list of geometric lines that are part of the description. + material_description_rect (fitz.Rect): The bounding box of the material description. + params (dict): A dictionary of relevant parameters. Returns: - fitz.Rect: The rectangle of the layer identifier column. + list[IntervalBlockGroup]: A list of groups, where each group is a IntervalBlockGroup. """ - x0 = min([rect.x0 for rect in self.rects()]) - x1 = max([rect.x1 for rect in self.rects()]) - y0 = min([rect.y0 for rect in self.rects()]) - y1 = max([rect.y1 for rect in self.rects()]) - return fitz.Rect(x0, y0, x1, y1) + blocks = [] + line_index = 0 + for layer_identifier_idx, _layer_index in enumerate(self.entries): + next_layer_identifier = ( + self.entries[layer_identifier_idx + 1] if layer_identifier_idx + 1 < len(self.entries) else None + ) + + matched_block = self.matching_blocks(description_lines, line_index, next_layer_identifier) + line_index += sum([len(block.lines) for block in matched_block]) + blocks.extend(matched_block) + + result = [] + for block in blocks: + depth_intervals = [] + depth_interval = get_depth_interval_from_textblock(block) + if depth_interval: + depth_intervals.append(depth_interval) + result.append(IntervalBlockGroup(depth_intervals=depth_intervals, blocks=[block])) + + return result + + @staticmethod + def matching_blocks( + all_lines: list[TextLine], line_index: int, next_layer_identifier: LayerIdentifierEntry | None + ) -> list[TextBlock]: + """Adds lines to a block until the next layer identifier is reached. - def rects(self) -> list[fitz.Rect]: - """Get the rectangles of the layer identifier column entries. + Args: + all_lines (list[TextLine]): All TextLine objects constituting the material description. 
+ line_index (int): The index of the last line that is already assigned to a block. + next_layer_identifier (TextLine | None): The next layer identifier. Returns: - list[fitz.Rect]: The rectangles of the layer identifier column entries. - """ - return [entry.rect for entry in self.entries] - - def add_entry(self, entry: LayerIdentifierEntry): - """Add a new layer identifier column entry to the layer identifier column. - - Args: - entry (LayerIdentifierEntry): The layer identifier column entry to be added. + list[TextBlock]: The next block or an empty list if no lines are added. """ - self.entries.append(entry) + y1_threshold = None + if next_layer_identifier: + next_interval_start_rect = next_layer_identifier.rect + y1_threshold = next_interval_start_rect.y0 + next_interval_start_rect.height / 2 - def can_be_appended(self, rect: fitz.Rect) -> bool: - """Checks if a new layer identifier column entry can be appended to the current layer identifier column. + matched_lines = [] - The checks are: - - The width of the new rectangle is greater than the width of the current layer identifier column. Or; - - The middle of the new rectangle is within the horizontal boundaries of the current layer identifier column. - - The new rectangle intersects with the minimal horizontal boundaries of the current layer identifier column. + for current_line in all_lines[line_index:]: + if y1_threshold is None or current_line.rect.y1 < y1_threshold: + matched_lines.append(current_line) + else: + break - - Args: - rect (fitz.Rect): Rect of the layer identifier column entry to be appended. - - Returns: - bool: True if the new layer identifier column entry can be appended, False otherwise. 
- """ - new_middle = (rect.x0 + rect.x1) / 2 - if (self.rect().width < rect.width or self.rect().x0 < new_middle < self.rect().x1) and ( - rect.x0 <= self.min_x1 and self.max_x0 <= rect.x1 - ): - return True - return False + if matched_lines: + return [TextBlock(matched_lines)] + else: + return [] def strictly_contains(self, other: "LayerIdentifierColumn") -> bool: """Check if the layer identifier column strictly contains another layer identifier column. @@ -205,26 +214,27 @@ def find_layer_identifier_column_entries(lines: list[TextLine]) -> list[LayerIde return entries -def find_layer_identifier_column(entries: list[LayerIdentifierEntry]) -> list[LayerIdentifierColumn]: +def find_layer_identifier_column(entries: list[LayerIdentifierEntry], page_number: int) -> list[LayerIdentifierColumn]: """Find the layer identifier column given the index column entries. Note: Similar to find_depth_columns.find_depth_columns. Refactoring may be desired. Args: entries (list[LayerIdentifierEntry]): The layer identifier column entries. + page_number (int): The number of the page. Returns: list[LayerIdentifierColumn]: The found layer identifier columns. 
""" - layer_identifier_columns = [LayerIdentifierColumn([entries[0]])] + layer_identifier_columns = [LayerIdentifierColumn([entries[0]], page=page_number)] for entry in entries[1:]: has_match = False for column in layer_identifier_columns: if column.can_be_appended(entry.rect): - column.add_entry(entry) + column.entries.append(entry) has_match = True if not has_match: - layer_identifier_columns.append(LayerIdentifierColumn([entry])) + layer_identifier_columns.append(LayerIdentifierColumn([entry], page=page_number)) # only keep columns whose entries are not fully contained in a different column layer_identifier_columns = [ diff --git a/src/stratigraphy/text/find_description.py b/src/stratigraphy/text/find_description.py index 48902ab4..57b3114f 100644 --- a/src/stratigraphy/text/find_description.py +++ b/src/stratigraphy/text/find_description.py @@ -1,7 +1,6 @@ """This module contains functions to find the description (blocks) of a material in a pdf page.""" import fitz -from stratigraphy.layer.layer_identifier_column import LayerIdentifierEntry from stratigraphy.lines.line import TextLine from stratigraphy.text.description_block_splitter import ( SplitDescriptionBlockByLeftHandSideSeparator, @@ -35,66 +34,6 @@ def get_description_lines(lines: list[TextLine], material_description_rect: fitz return sorted([line for line in filtered_lines if line], key=lambda line: line.rect.y0) -def get_description_blocks_from_layer_identifier( - layer_identifier_entries: list[LayerIdentifierEntry], description_lines: list[TextLine] -) -> list[TextBlock]: - """Divide the description lines into blocks based on the layer identifier entries. - - Args: - layer_identifier_entries (list[LayerIdentifierEntry]): The layer identifier entries. - description_lines (list[TextLine]): All lines constituting the material description. - - Returns: - list[TextBlock]: The blocks of the material description. 
- """ - blocks = [] - line_index = 0 - for layer_identifier_idx, _layer_index in enumerate(layer_identifier_entries): - next_layer_identifier = ( - layer_identifier_entries[layer_identifier_idx + 1] - if layer_identifier_idx + 1 < len(layer_identifier_entries) - else None - ) - - matched_block = matching_blocks(description_lines, line_index, next_layer_identifier) - line_index += sum([len(block.lines) for block in matched_block]) - blocks.extend(matched_block) - - return blocks - - -def matching_blocks( - all_lines: list[TextLine], line_index: int, next_layer_identifier: TextLine | None -) -> list[TextBlock]: - """Adds lines to a block until the next layer identifier is reached. - - Args: - all_lines (list[TextLine]): All TextLine objects constituting the material description. - line_index (int): The index of the last line that is already assigned to a block. - next_layer_identifier (TextLine | None): The next layer identifier. - - Returns: - list[TextBlock]: The next block or an empty list if no lines are added. 
- """ - y1_threshold = None - if next_layer_identifier: - next_interval_start_rect = next_layer_identifier.rect - y1_threshold = next_interval_start_rect.y0 + next_interval_start_rect.height / 2 - - matched_lines = [] - - for current_line in all_lines[line_index:]: - if y1_threshold is None or current_line.rect.y1 < y1_threshold: - matched_lines.append(current_line) - else: - break - - if matched_lines: - return [TextBlock(matched_lines)] - else: - return [] - - def get_description_blocks( description_lines: list[TextLine], geometric_lines: list[Line], diff --git a/src/stratigraphy/util/interval.py b/src/stratigraphy/util/interval.py index 793193c5..8a4cd5e3 100644 --- a/src/stratigraphy/util/interval.py +++ b/src/stratigraphy/util/interval.py @@ -7,7 +7,6 @@ import fitz from stratigraphy.depthcolumn.depthcolumnentry import ( - AnnotatedDepthColumnEntry, DepthColumnEntry, LayerDepthColumnEntry, ) @@ -59,15 +58,6 @@ def to_json(self): } -class AnnotatedInterval: - """Class for annotated intervals.""" - - def __init__(self, start: float, end: float, background_rect: fitz.Rect): - self.start = AnnotatedDepthColumnEntry(start) - self.end = AnnotatedDepthColumnEntry(end) - self.background_rect = background_rect - - class BoundaryInterval(Interval): """Class for boundary intervals. 
diff --git a/src/stratigraphy/util/predictions.py b/src/stratigraphy/util/predictions.py index f1d187c3..77660d41 100644 --- a/src/stratigraphy/util/predictions.py +++ b/src/stratigraphy/util/predictions.py @@ -5,7 +5,7 @@ from stratigraphy.benchmark.ground_truth import GroundTruth from stratigraphy.benchmark.metrics import OverallMetricsCatalog from stratigraphy.data_extractor.data_extractor import FeatureOnPage -from stratigraphy.depths_materials_column_pairs.depths_materials_column_pairs import DepthsMaterialsColumnPairs +from stratigraphy.depths_materials_column_pairs.depths_materials_column_pairs import DepthsMaterialsColumnPair from stratigraphy.evaluation.evaluation_dataclasses import OverallBoreholeMetadataMetrics from stratigraphy.evaluation.groundwater_evaluator import GroundwaterEvaluator from stratigraphy.evaluation.layer_evaluator import LayerEvaluator @@ -26,10 +26,10 @@ def __init__( file_name: str, metadata: BoreholeMetadata, groundwater: GroundwaterInDocument, - depths_materials_columns_pairs: list[DepthsMaterialsColumnPairs], + depths_materials_columns_pairs: list[DepthsMaterialsColumnPair], ): self.layers_in_document: LayersInDocument = layers_in_document - self.depths_materials_columns_pairs: list[DepthsMaterialsColumnPairs] = depths_materials_columns_pairs + self.depths_materials_columns_pairs: list[DepthsMaterialsColumnPair] = depths_materials_columns_pairs self.file_name: str = file_name self.metadata: BoreholeMetadata = metadata self.groundwater: GroundwaterInDocument = groundwater @@ -104,7 +104,7 @@ def from_json(cls, prediction_from_file: dict) -> "OverallFilePredictions": layers_in_doc = LayersInDocument(layers=layers, filename=file_name) depths_materials_columns_pairs = [ - DepthsMaterialsColumnPairs.from_json(dmc_pair) + DepthsMaterialsColumnPair.from_json(dmc_pair) for dmc_pair in file_data["depths_materials_column_pairs"] ] diff --git a/tests/test_depthcolumn.py b/tests/test_depthcolumn.py index 484a8e7a..58501db3 100644 --- 
a/tests/test_depthcolumn.py +++ b/tests/test_depthcolumn.py @@ -7,26 +7,25 @@ def test_boundarydepthcolumn_isarithmeticprogression(): # noqa: D103 """Test the is_arithmetic_progression method of the BoundaryDepthColumn class.""" - page_number = 1 column = BoundaryDepthColumn( [ - DepthColumnEntry(fitz.Rect(), value=1, page_number=page_number), - DepthColumnEntry(fitz.Rect(), value=2, page_number=page_number), - DepthColumnEntry(fitz.Rect(), value=3, page_number=page_number), - DepthColumnEntry(fitz.Rect(), value=4, page_number=page_number), - DepthColumnEntry(fitz.Rect(), value=5, page_number=page_number), + DepthColumnEntry(fitz.Rect(), value=1), + DepthColumnEntry(fitz.Rect(), value=2), + DepthColumnEntry(fitz.Rect(), value=3), + DepthColumnEntry(fitz.Rect(), value=4), + DepthColumnEntry(fitz.Rect(), value=5), ] ) assert column.is_arithmetic_progression(), "The column should be recognized as arithmetic progression" column = BoundaryDepthColumn( [ - DepthColumnEntry(fitz.Rect(), value=17.6, page_number=page_number), - DepthColumnEntry(fitz.Rect(), value=18.15, page_number=page_number), - DepthColumnEntry(fitz.Rect(), value=18.65, page_number=page_number), - DepthColumnEntry(fitz.Rect(), value=19.3, page_number=page_number), - DepthColumnEntry(fitz.Rect(), value=19.9, page_number=page_number), - DepthColumnEntry(fitz.Rect(), value=20.5, page_number=page_number), + DepthColumnEntry(fitz.Rect(), value=17.6), + DepthColumnEntry(fitz.Rect(), value=18.15), + DepthColumnEntry(fitz.Rect(), value=18.65), + DepthColumnEntry(fitz.Rect(), value=19.3), + DepthColumnEntry(fitz.Rect(), value=19.9), + DepthColumnEntry(fitz.Rect(), value=20.5), ] ) assert not column.is_arithmetic_progression(), "The column should not be recognized as arithmetic progression" diff --git a/tests/test_find_depth_columns.py b/tests/test_find_depth_columns.py index d8fd9294..9574958d 100644 --- a/tests/test_find_depth_columns.py +++ b/tests/test_find_depth_columns.py @@ -80,11 +80,11 @@ def 
test_depth_column_entries_with_leading_character(): # noqa: D103 def test_find_depth_columns_arithmetic_progression(): # noqa: D103 """Test the find_depth_columns function with an arithmetic progression.""" entries = [ - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 10.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 30.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0, PAGE_NUMBER), + DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 10.0), + DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0), + DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 30.0), + DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0), + DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0), ] columns = find_depth_columns( entries, @@ -98,11 +98,11 @@ def test_find_depth_columns_arithmetic_progression(): # noqa: D103 def test_find_depth_columns(): # noqa: D103 """Test the find_depth_columns function.""" entries = [ - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0, PAGE_NUMBER), + DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0), + DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0), + DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0), + DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0), + DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0), ] columns = find_depth_columns( @@ -123,17 +123,17 @@ def test_find_depth_columns(): # noqa: D103 def test_two_columns_find_depth_columns(): # noqa: D103 """Test the find_depth_columns function with two columns.""" entries = [ # first depth column - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0, PAGE_NUMBER), - 
DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 12.0, PAGE_NUMBER), # second depth column - DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 20.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 34.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 40.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 50.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(20, 10, 25, 11), 61.0, PAGE_NUMBER), + DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0), + DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0), + DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0), + DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0), + DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0), + DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 12.0), # second depth column + DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 20.0), + DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 34.0), + DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 40.0), + DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 50.0), + DepthColumnEntry(fitz.Rect(20, 10, 25, 11), 61.0), ] columns = find_depth_columns( entries, @@ -149,19 +149,19 @@ def test_two_columns_find_depth_columns(): # noqa: D103 def test_find_layer_depth_columns(): # noqa: D103 """Test the find_layer_depth_columns function.""" entries = [ - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0, PAGE_NUMBER), # layer 12.0-20.0m - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 20.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0, PAGE_NUMBER), # layer 20.0-34.0m - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 34.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0, PAGE_NUMBER), # layer 34.0-40.0m - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 40.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0, PAGE_NUMBER), # layer 40.0-50.0m - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 50.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0, PAGE_NUMBER), # layer 50.0-60.0m - 
DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 60.0, PAGE_NUMBER), + DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0), # layer 12.0-20.0m + DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 20.0), + DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0), # layer 20.0-34.0m + DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 34.0), + DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0), # layer 34.0-40.0m + DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 40.0), + DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0), # layer 40.0-50.0m + DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 50.0), + DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0), # layer 50.0-60.0m + DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 60.0), ] - columns = find_layer_depth_columns(entries, ALL_WORDS_FIND_DEPTH_COLUMN) + columns = find_layer_depth_columns(entries, ALL_WORDS_FIND_DEPTH_COLUMN, PAGE_NUMBER) assert len(columns) == 1, "There should be 1 column" assert len(columns[0].entries) == 5, "The column should have 5 entries" assert columns[0].entries[0].start.value == 12.0, "The first entry should have a value of 12.0" @@ -179,29 +179,29 @@ def test_find_layer_depth_columns(): # noqa: D103 def test_two_columns_find_layer_depth_columns(): # noqa: D103 """Test the find_layer_depth_columns function with two columns.""" entries = [ # first depth column - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0, PAGE_NUMBER), # layer 12.0-20.0m - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 20.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0, PAGE_NUMBER), # layer 20.0-34.0m - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 34.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0, PAGE_NUMBER), # layer 34.0-40.0m - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 40.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0, PAGE_NUMBER), # layer 40.0-50.0m - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 50.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0, PAGE_NUMBER), # layer 50.0-60.0m - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 60.0, PAGE_NUMBER), + 
DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0), # layer 12.0-20.0m + DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 20.0), + DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0), # layer 20.0-34.0m + DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 34.0), + DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0), # layer 34.0-40.0m + DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 40.0), + DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0), # layer 40.0-50.0m + DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 50.0), + DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0), # layer 50.0-60.0m + DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 60.0), # second depth column - DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 12.0, PAGE_NUMBER), # layer 12.0-20.0m - DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 20.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 20.0, PAGE_NUMBER), # layer 20.0-34.0m - DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 34.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 34.0, PAGE_NUMBER), # layer 34.0-40.0m - DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 40.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 40.0, PAGE_NUMBER), # layer 40.0-50.0m - DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 50.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 50.0, PAGE_NUMBER), # layer 50.0-60.0m - DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 60.0, PAGE_NUMBER), + DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 12.0), # layer 12.0-20.0m + DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 20.0), + DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 20.0), # layer 20.0-34.0m + DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 34.0), + DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 34.0), # layer 34.0-40.0m + DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 40.0), + DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 40.0), # layer 40.0-50.0m + DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 50.0), + DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 50.0), # layer 50.0-60.0m + DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 60.0), ] - columns = find_layer_depth_columns(entries, 
ALL_WORDS_FIND_LAYER_DEPTH_COLUMN) + columns = find_layer_depth_columns(entries, ALL_WORDS_FIND_LAYER_DEPTH_COLUMN, PAGE_NUMBER) assert len(columns) == 2, "There should be 2 columns" assert len(columns[0].entries) == 5, "The first column should have 5 entries" assert len(columns[1].entries) == 5, "The second column should have 5 entries" From b218485d9103e5de4e62e044e7c36e34046b3086 Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Mon, 11 Nov 2024 14:46:19 +0100 Subject: [PATCH 02/20] LGVISIUM-102: minor fixes --- src/stratigraphy/layer/layer.py | 12 ++---------- tests/test_depthcolumn.py | 8 ++++++-- tests/test_interval.py | 14 ++++++-------- 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/src/stratigraphy/layer/layer.py b/src/stratigraphy/layer/layer.py index 0d9001f9..298df64e 100644 --- a/src/stratigraphy/layer/layer.py +++ b/src/stratigraphy/layer/layer.py @@ -58,20 +58,12 @@ def from_json(cls, data: dict) -> "Layer": start_data = depth_interval.get("start") end_data = depth_interval.get("end") start = ( - DepthColumnEntry( - value=start_data["value"], - rect=fitz.Rect(start_data["rect"]), - page_number=start_data["page"], - ) + DepthColumnEntry(value=start_data["value"], rect=fitz.Rect(start_data["rect"])) if start_data is not None else None ) end = ( - DepthColumnEntry( - value=end_data["value"], - rect=fitz.Rect(end_data["rect"]), - page_number=end_data["page"], - ) + DepthColumnEntry(value=end_data["value"], rect=fitz.Rect(end_data["rect"])) if end_data is not None else None ) diff --git a/tests/test_depthcolumn.py b/tests/test_depthcolumn.py index 58501db3..915758b4 100644 --- a/tests/test_depthcolumn.py +++ b/tests/test_depthcolumn.py @@ -7,6 +7,8 @@ def test_boundarydepthcolumn_isarithmeticprogression(): # noqa: D103 """Test the is_arithmetic_progression method of the BoundaryDepthColumn class.""" + page_number = 1 + column = BoundaryDepthColumn( [ DepthColumnEntry(fitz.Rect(), value=1), @@ -14,7 +16,8 @@ def 
test_boundarydepthcolumn_isarithmeticprogression(): # noqa: D103 DepthColumnEntry(fitz.Rect(), value=3), DepthColumnEntry(fitz.Rect(), value=4), DepthColumnEntry(fitz.Rect(), value=5), - ] + ], + page=page_number, ) assert column.is_arithmetic_progression(), "The column should be recognized as arithmetic progression" @@ -26,6 +29,7 @@ def test_boundarydepthcolumn_isarithmeticprogression(): # noqa: D103 DepthColumnEntry(fitz.Rect(), value=19.3), DepthColumnEntry(fitz.Rect(), value=19.9), DepthColumnEntry(fitz.Rect(), value=20.5), - ] + ], + page=page_number, ) assert not column.is_arithmetic_progression(), "The column should not be recognized as arithmetic progression" diff --git a/tests/test_interval.py b/tests/test_interval.py index 0d90cded..b53c0718 100644 --- a/tests/test_interval.py +++ b/tests/test_interval.py @@ -7,9 +7,8 @@ def test_line_anchor(): # noqa: D103 """Test the line anchor property of the BoundaryInterval and LayerInterval classes.""" - page_number = 1 - start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5, page_number) - end = DepthColumnEntry(fitz.Rect(0, 2, 1, 3), 10, page_number) + start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5) + end = DepthColumnEntry(fitz.Rect(0, 2, 1, 3), 10) boundary_interval = BoundaryInterval(start, end) assert boundary_interval.line_anchor == fitz.Point(1, 1.5), ( "The 'line anchor' for a BoundaryInterval should be halfway between the bottom-right of the start depth and " @@ -26,8 +25,8 @@ def test_line_anchor(): # noqa: D103 1, 2 ), "The 'line anchor' for a BoundaryInterval without start should be the top-right of the end depth." 
- start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5, page_number) - end = DepthColumnEntry(fitz.Rect(2, 0, 3, 1), 10, page_number) + start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5) + end = DepthColumnEntry(fitz.Rect(2, 0, 3, 1), 10) entry = LayerDepthColumnEntry(start, end) layer_interval = LayerInterval(entry) assert layer_interval.line_anchor == fitz.Point( @@ -37,9 +36,8 @@ def test_line_anchor(): # noqa: D103 def test_background_rect(): # noqa: D103 """Test the background_rect property of the BoundaryInterval class.""" - page_number = 1 - start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5, page_number) - end = DepthColumnEntry(fitz.Rect(0, 2, 1, 3), 10, page_number) + start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5) + end = DepthColumnEntry(fitz.Rect(0, 2, 1, 3), 10) boundary_interval = BoundaryInterval(start, end) assert boundary_interval.background_rect == fitz.Rect( start.rect.x0, start.rect.y1, start.rect.x1, end.rect.y0 From b527b8a5d02616192a570a1ad5a6a10bd09e1cc0 Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Mon, 11 Nov 2024 14:59:15 +0100 Subject: [PATCH 03/20] LGVISIUM-102: minor fixes --- src/stratigraphy/annotations/draw.py | 2 +- .../boundarydepthcolumnvalidator.py | 4 ++-- src/stratigraphy/depthcolumn/depthcolumn.py | 17 ++++++----------- .../depthcolumn/find_depth_columns.py | 12 ++++-------- .../depths_materials_column_pairs.py | 2 ++ src/stratigraphy/extract.py | 19 ++++++++++++------- .../layer/layer_identifier_column.py | 7 +++---- tests/test_depthcolumn.py | 8 ++------ tests/test_find_depth_columns.py | 13 +++++-------- 9 files changed, 37 insertions(+), 47 deletions(-) diff --git a/src/stratigraphy/annotations/draw.py b/src/stratigraphy/annotations/draw.py index 6045209f..46f167fc 100644 --- a/src/stratigraphy/annotations/draw.py +++ b/src/stratigraphy/annotations/draw.py @@ -98,7 +98,7 @@ def draw_predictions( draw_depth_columns_and_material_rect( shape, page.derotation_matrix, - [pair for pair in depths_materials_column_pairs if 
pair.depth_column.page == page_number], + [pair for pair in depths_materials_column_pairs if pair.page == page_number], ) draw_material_descriptions( shape, diff --git a/src/stratigraphy/depthcolumn/boundarydepthcolumnvalidator.py b/src/stratigraphy/depthcolumn/boundarydepthcolumnvalidator.py index f2222486..1636a09d 100644 --- a/src/stratigraphy/depthcolumn/boundarydepthcolumnvalidator.py +++ b/src/stratigraphy/depthcolumn/boundarydepthcolumnvalidator.py @@ -105,10 +105,10 @@ def correct_OCR_mistakes(self, column: BoundaryDepthColumn) -> BoundaryDepthColu Returns: BoundaryDepthColumn | None: The corrected depth column, or None if no correction was possible. """ - new_columns = [BoundaryDepthColumn(entries=[], page=column.page)] + new_columns = [BoundaryDepthColumn(entries=[])] for entry in column.entries: new_columns = [ - BoundaryDepthColumn([*column.entries, DepthColumnEntry(entry.rect, new_value)], page=column.page) + BoundaryDepthColumn([*column.entries, DepthColumnEntry(entry.rect, new_value)]) for column in new_columns for new_value in _value_alternatives(entry.value) ] diff --git a/src/stratigraphy/depthcolumn/depthcolumn.py b/src/stratigraphy/depthcolumn/depthcolumn.py index aaaf6ad4..17d10665 100644 --- a/src/stratigraphy/depthcolumn/depthcolumn.py +++ b/src/stratigraphy/depthcolumn/depthcolumn.py @@ -23,7 +23,6 @@ class DepthColumn(abc.ABC, Generic[EntryT]): """Abstract DepthColumn class.""" entries: list[EntryT] - page: int def rects(self) -> list[fitz.Rect]: """Get the rectangles of the depth column entries.""" @@ -171,7 +170,7 @@ def break_on_mismatch(self) -> list[LayerDepthColumn]: if final_segment: segments.append(final_segment) - return [LayerDepthColumn(segment, page=self.page) for segment in segments] + return [LayerDepthColumn(segment) for segment in segments] def is_valid(self) -> bool: """Checks if the depth column is valid. 
@@ -265,10 +264,10 @@ def to_json(self) -> dict: def valid_initial_segment(self, rect: fitz.Rect) -> BoundaryDepthColumn: for i in range(len(self.entries) - 1): - initial_segment = BoundaryDepthColumn(self.entries[: -i - 1], page=self.page) + initial_segment = BoundaryDepthColumn(self.entries[: -i - 1]) if initial_segment.can_be_appended(rect): return initial_segment - return BoundaryDepthColumn(entries=[], page=self.page) + return BoundaryDepthColumn(entries=[]) def strictly_contains(self, other: BoundaryDepthColumn) -> bool: return len(other.entries) < len(self.entries) and all( @@ -304,9 +303,7 @@ def significant_arithmetic_progression(self) -> bool: return self.is_arithmetic_progression() else: for i in range(len(self.entries) - segment_length + 1): - if BoundaryDepthColumn( - self.entries[i : i + segment_length], page=self.page - ).is_arithmetic_progression(): + if BoundaryDepthColumn(self.entries[i : i + segment_length]).is_arithmetic_progression(): return True return False @@ -341,9 +338,7 @@ def remove_entry_by_correlation_gradient(self) -> BoundaryDepthColumn | None: return None new_columns = [ - BoundaryDepthColumn( - [entry for index, entry in enumerate(self.entries) if index != remove_index], page=self.page - ) + BoundaryDepthColumn([entry for index, entry in enumerate(self.entries) if index != remove_index]) for remove_index in range(len(self.entries)) ] return max(new_columns, key=lambda column: column.pearson_correlation_coef()) @@ -368,7 +363,7 @@ def break_on_double_descending(self) -> list[BoundaryDepthColumn]: if final_segment: segments.append(final_segment) - return [BoundaryDepthColumn(segment, page=self.page) for segment in segments] + return [BoundaryDepthColumn(segment) for segment in segments] def identify_groups( self, diff --git a/src/stratigraphy/depthcolumn/find_depth_columns.py b/src/stratigraphy/depthcolumn/find_depth_columns.py index ccadae35..68ce6bbc 100644 --- a/src/stratigraphy/depthcolumn/find_depth_columns.py +++ 
b/src/stratigraphy/depthcolumn/find_depth_columns.py @@ -86,9 +86,7 @@ def extract_layer_depth_interval( return None -def find_layer_depth_columns( - entries: list[DepthColumnEntry], all_words: list[TextWord], page_number: int -) -> list[LayerDepthColumn]: +def find_layer_depth_columns(entries: list[DepthColumnEntry], all_words: list[TextWord]) -> list[LayerDepthColumn]: """Finds all layer depth columns. Generates a list of LayerDepthColumnEntry objects by finding consecutive pairs of DepthColumnEntry objects. @@ -101,7 +99,6 @@ def find_layer_depth_columns( Args: entries (list[DepthColumnEntry]): List of depth column entries. all_words (list[TextWord]): List of all TextWord objects. - page_number (int): The number of the page. Returns: list[LayerDepthColumn]: List of all layer depth columns identified. @@ -145,7 +142,7 @@ def find_pair(entry: DepthColumnEntry) -> DepthColumnEntry | None: # noqa: D103 column.entries.append(entry) if not is_matched: - columns.append(LayerDepthColumn([entry], page=page_number)) + columns.append(LayerDepthColumn([entry])) return [ column_segment @@ -156,14 +153,13 @@ def find_pair(entry: DepthColumnEntry) -> DepthColumnEntry | None: # noqa: D103 def find_depth_columns( - entries: list[DepthColumnEntry], all_words: list[TextWord], page_number: int, depth_column_params: dict + entries: list[DepthColumnEntry], all_words: list[TextWord], depth_column_params: dict ) -> list[BoundaryDepthColumn]: """Construct all possible BoundaryDepthColumn objects from the given DepthColumnEntry objects. Args: entries (list[DepthColumnEntry]): All found depth column entries in the page. all_words (list[TextLine]): All words in the page. - page_number (int): The page number of the entries. depth_column_params (dict): Parameters for the BoundaryDepthColumn objects. 
Returns: @@ -186,7 +182,7 @@ def find_depth_columns( numeric_columns.extend(additional_columns) if not has_match: - numeric_columns.append(BoundaryDepthColumn(entries=[entry], page=page_number)) + numeric_columns.append(BoundaryDepthColumn(entries=[entry])) # only keep columns that are not contained in a different column numeric_columns = [ diff --git a/src/stratigraphy/depths_materials_column_pairs/depths_materials_column_pairs.py b/src/stratigraphy/depths_materials_column_pairs/depths_materials_column_pairs.py index 35f8f575..d7efd203 100644 --- a/src/stratigraphy/depths_materials_column_pairs/depths_materials_column_pairs.py +++ b/src/stratigraphy/depths_materials_column_pairs/depths_materials_column_pairs.py @@ -14,6 +14,7 @@ class DepthsMaterialsColumnPair: depth_column: DepthColumn | None material_description_rect: fitz.Rect + page: int def to_json(self) -> dict: """Converts the object to a dictionary. @@ -29,6 +30,7 @@ def to_json(self) -> dict: self.material_description_rect.x1, self.material_description_rect.y1, ], + "page": self.page, } def score_column_match(self, all_words: list[TextWord] | None = None) -> float: diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index a18b94e2..d7ddf88b 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -61,7 +61,7 @@ def process_page( # Detect Layer Index Columns layer_identifier_entries = find_layer_identifier_column_entries(lines) layer_identifier_columns = ( - find_layer_identifier_column(layer_identifier_entries, page_number) if layer_identifier_entries else [] + find_layer_identifier_column(layer_identifier_entries) if layer_identifier_entries else [] ) depths_materials_column_pairs = [] if layer_identifier_columns: @@ -71,7 +71,7 @@ def process_page( ) if material_description_rect: depths_materials_column_pairs.append( - DepthsMaterialsColumnPair(layer_identifier_column, material_description_rect) + DepthsMaterialsColumnPair(layer_identifier_column, 
material_description_rect, page=page_number) ) # Obtain the best pair. In contrast to depth columns, there only ever is one layer index column per page. @@ -83,7 +83,7 @@ def process_page( if not depths_materials_column_pairs: words = [word for line in lines for word in line.words] depth_column_entries = find_depth_columns.depth_column_entries(words, include_splits=True) - layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words, page_number) + layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words) used_entry_rects = [] for column in layer_depth_columns: @@ -98,7 +98,7 @@ def process_page( depth_columns: list[DepthColumn] = layer_depth_columns depth_columns.extend( find_depth_columns.find_depth_columns( - depth_column_entries, words, page_number, depth_column_params=params["depth_column_params"] + depth_column_entries, words, depth_column_params=params["depth_column_params"] ) ) @@ -108,7 +108,7 @@ def process_page( ) if material_description_rect: depths_materials_column_pairs.append( - DepthsMaterialsColumnPair(depth_column, material_description_rect) + DepthsMaterialsColumnPair(depth_column, material_description_rect, page=page_number) ) # lowest score first depths_materials_column_pairs.sort(key=lambda pair: pair.score_column_match(words)) @@ -151,7 +151,11 @@ def process_page( ) pairs.extend([IntervalBlockPair(block=block, depth_interval=None) for block in description_blocks]) filtered_depth_material_column_pairs.extend( - [DepthsMaterialsColumnPair(depth_column=None, material_description_rect=material_description_rect)] + [ + DepthsMaterialsColumnPair( + depth_column=None, material_description_rect=material_description_rect, page=page_number + ) + ] ) layer_predictions = [ @@ -444,7 +448,8 @@ def is_below(best_x0, best_y1, line): return None if depth_column: return max( - candidate_rects, key=lambda rect: DepthsMaterialsColumnPair(depth_column, rect).score_column_match() + 
candidate_rects, + key=lambda rect: DepthsMaterialsColumnPair(depth_column, rect, page=0).score_column_match(), ) else: return candidate_rects[0] diff --git a/src/stratigraphy/layer/layer_identifier_column.py b/src/stratigraphy/layer/layer_identifier_column.py index f8a07bd1..670b4477 100644 --- a/src/stratigraphy/layer/layer_identifier_column.py +++ b/src/stratigraphy/layer/layer_identifier_column.py @@ -214,19 +214,18 @@ def find_layer_identifier_column_entries(lines: list[TextLine]) -> list[LayerIde return entries -def find_layer_identifier_column(entries: list[LayerIdentifierEntry], page_number: int) -> list[LayerIdentifierColumn]: +def find_layer_identifier_column(entries: list[LayerIdentifierEntry]) -> list[LayerIdentifierColumn]: """Find the layer identifier column given the index column entries. Note: Similar to find_depth_columns.find_depth_columns. Refactoring may be desired. Args: entries (list[LayerIdentifierEntry]): The layer identifier column entries. - page_number (int): The number of the page. Returns: list[LayerIdentifierColumn]: The found layer identifier columns. 
""" - layer_identifier_columns = [LayerIdentifierColumn([entries[0]], page=page_number)] + layer_identifier_columns = [LayerIdentifierColumn([entries[0]])] for entry in entries[1:]: has_match = False for column in layer_identifier_columns: @@ -234,7 +233,7 @@ def find_layer_identifier_column(entries: list[LayerIdentifierEntry], page_numbe column.entries.append(entry) has_match = True if not has_match: - layer_identifier_columns.append(LayerIdentifierColumn([entry], page=page_number)) + layer_identifier_columns.append(LayerIdentifierColumn([entry])) # only keep columns whose entries are not fully contained in a different column layer_identifier_columns = [ diff --git a/tests/test_depthcolumn.py b/tests/test_depthcolumn.py index 915758b4..58501db3 100644 --- a/tests/test_depthcolumn.py +++ b/tests/test_depthcolumn.py @@ -7,8 +7,6 @@ def test_boundarydepthcolumn_isarithmeticprogression(): # noqa: D103 """Test the is_arithmetic_progression method of the BoundaryDepthColumn class.""" - page_number = 1 - column = BoundaryDepthColumn( [ DepthColumnEntry(fitz.Rect(), value=1), @@ -16,8 +14,7 @@ def test_boundarydepthcolumn_isarithmeticprogression(): # noqa: D103 DepthColumnEntry(fitz.Rect(), value=3), DepthColumnEntry(fitz.Rect(), value=4), DepthColumnEntry(fitz.Rect(), value=5), - ], - page=page_number, + ] ) assert column.is_arithmetic_progression(), "The column should be recognized as arithmetic progression" @@ -29,7 +26,6 @@ def test_boundarydepthcolumn_isarithmeticprogression(): # noqa: D103 DepthColumnEntry(fitz.Rect(), value=19.3), DepthColumnEntry(fitz.Rect(), value=19.9), DepthColumnEntry(fitz.Rect(), value=20.5), - ], - page=page_number, + ] ) assert not column.is_arithmetic_progression(), "The column should not be recognized as arithmetic progression" diff --git a/tests/test_find_depth_columns.py b/tests/test_find_depth_columns.py index 9574958d..621eda07 100644 --- a/tests/test_find_depth_columns.py +++ b/tests/test_find_depth_columns.py @@ -8,7 +8,7 @@ 
find_depth_columns, find_layer_depth_columns, ) -from stratigraphy.lines.line import TextLine, TextWord +from stratigraphy.lines.line import TextWord PAGE_NUMBER = 1 ALL_WORDS_FIND_DEPTH_COLUMN = [ @@ -50,8 +50,8 @@ def test_depth_column_entries(): # noqa: D103 def test_depth_column_entries_with_splits(): # noqa: D103 """Test the depth_column_entries function with include_splits=True.""" all_words = [ - TextLine([TextWord(fitz.Rect(0, 0, 10, 1), "10.00-20.0m", PAGE_NUMBER)]), - TextLine([TextWord(fitz.Rect(0, 2, 10, 3), "30.0-40.0m", PAGE_NUMBER)]), + TextWord(fitz.Rect(0, 0, 10, 1), "10.00-20.0m", PAGE_NUMBER), + TextWord(fitz.Rect(0, 2, 10, 3), "30.0-40.0m", PAGE_NUMBER), ] entries = depth_column_entries(all_words, include_splits=True) assert len(entries) == 4, "There should be 4 entries" @@ -89,7 +89,6 @@ def test_find_depth_columns_arithmetic_progression(): # noqa: D103 columns = find_depth_columns( entries, ALL_WORDS_FIND_DEPTH_COLUMN, - PAGE_NUMBER, depth_column_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, ) assert len(columns) == 0, "There should be 0 columns as the above is a perfect arithmetic progression" @@ -108,7 +107,6 @@ def test_find_depth_columns(): # noqa: D103 columns = find_depth_columns( entries, ALL_WORDS_FIND_DEPTH_COLUMN, - PAGE_NUMBER, depth_column_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, ) assert len(columns) == 1, "There should be 1 column" @@ -138,7 +136,6 @@ def test_two_columns_find_depth_columns(): # noqa: D103 columns = find_depth_columns( entries, ALL_WORDS_FIND_DEPTH_COLUMN, - PAGE_NUMBER, depth_column_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, ) assert len(columns) == 2, "There should be 2 columns" @@ -161,7 +158,7 @@ def test_find_layer_depth_columns(): # noqa: D103 DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 60.0), ] - columns = find_layer_depth_columns(entries, ALL_WORDS_FIND_DEPTH_COLUMN, PAGE_NUMBER) + columns = find_layer_depth_columns(entries, 
ALL_WORDS_FIND_DEPTH_COLUMN) assert len(columns) == 1, "There should be 1 column" assert len(columns[0].entries) == 5, "The column should have 5 entries" assert columns[0].entries[0].start.value == 12.0, "The first entry should have a value of 12.0" @@ -201,7 +198,7 @@ def test_two_columns_find_layer_depth_columns(): # noqa: D103 DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 50.0), # layer 50.0-60.0m DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 60.0), ] - columns = find_layer_depth_columns(entries, ALL_WORDS_FIND_LAYER_DEPTH_COLUMN, PAGE_NUMBER) + columns = find_layer_depth_columns(entries, ALL_WORDS_FIND_LAYER_DEPTH_COLUMN) assert len(columns) == 2, "There should be 2 columns" assert len(columns[0].entries) == 5, "The first column should have 5 entries" assert len(columns[1].entries) == 5, "The second column should have 5 entries" From 6c514fa108609fdc6579ca4b21882d9c8d67d9d0 Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Mon, 11 Nov 2024 15:40:41 +0100 Subject: [PATCH 04/20] LGVISIUM-102: replace DepthMaterialColumnPairs with BoundingBoxes in predictions --- src/stratigraphy/annotations/draw.py | 4 +- src/stratigraphy/depthcolumn/depthcolumn.py | 31 ------------ .../bounding_boxes.py | 48 +++++++++++++++++++ .../depths_materials_column_pairs.py | 18 ------- src/stratigraphy/extract.py | 36 ++++++++------ .../layer/layer_identifier_column.py | 46 ------------------ src/stratigraphy/main.py | 6 +-- src/stratigraphy/util/predictions.py | 17 +++---- tests/test_predictions.py | 2 +- 9 files changed, 81 insertions(+), 127 deletions(-) create mode 100644 src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py diff --git a/src/stratigraphy/annotations/draw.py b/src/stratigraphy/annotations/draw.py index 46f167fc..1549fa7a 100644 --- a/src/stratigraphy/annotations/draw.py +++ b/src/stratigraphy/annotations/draw.py @@ -55,7 +55,7 @@ def draw_predictions( for file_prediction in predictions.file_predictions_list: logger.info("Drawing predictions for file %s", 
file_prediction.file_name) - depths_materials_column_pairs = file_prediction.depths_materials_columns_pairs + bounding_boxes = file_prediction.bounding_boxes coordinates = file_prediction.metadata.coordinates elevation = file_prediction.metadata.elevation @@ -98,7 +98,7 @@ def draw_predictions( draw_depth_columns_and_material_rect( shape, page.derotation_matrix, - [pair for pair in depths_materials_column_pairs if pair.page == page_number], + [bboxes for bboxes in bounding_boxes if bboxes.page == page_number], ) draw_material_descriptions( shape, diff --git a/src/stratigraphy/depthcolumn/depthcolumn.py b/src/stratigraphy/depthcolumn/depthcolumn.py index 17d10665..a9e4f7fe 100644 --- a/src/stratigraphy/depthcolumn/depthcolumn.py +++ b/src/stratigraphy/depthcolumn/depthcolumn.py @@ -85,11 +85,6 @@ def identify_groups( """ pass - @abc.abstractmethod - def to_json(self): - """Converts the object to a dictionary.""" - pass - def can_be_appended(self, rect: fitz.Rect) -> bool: """Checks if a new depth column entry can be appended to the current depth column. @@ -136,19 +131,6 @@ def __repr__(self): """ return "LayerDepthColumn({})".format(", ".join([str(entry) for entry in self.entries])) - def to_json(self) -> dict: - """Converts the object to a dictionary. - - Returns: - dict: The object as a dictionary. - """ - rect = self.rect() - return { - "rect": [rect.x0, rect.y0, rect.x1, rect.y1], - "entries": [entry.to_json() for entry in self.entries], - "type": "LayerDepthColumn", - } - def depth_intervals(self) -> list[LayerInterval]: return [LayerInterval(entry) for entry in self.entries] @@ -249,19 +231,6 @@ class BoundaryDepthColumn(DepthColumn[DepthColumnEntry]): def __repr__(self): return "DepthColumn({})".format(", ".join([str(entry) for entry in self.entries])) - def to_json(self) -> dict: - """Converts the object to a dictionary. - - Returns: - dict: The object as a dictionary. 
- """ - rect = self.rect() - return { - "rect": [rect.x0, rect.y0, rect.x1, rect.y1], - "entries": [entry.to_json() for entry in self.entries], - "type": "BoundaryDepthColumn", - } - def valid_initial_segment(self, rect: fitz.Rect) -> BoundaryDepthColumn: for i in range(len(self.entries) - 1): initial_segment = BoundaryDepthColumn(self.entries[: -i - 1]) diff --git a/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py b/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py new file mode 100644 index 00000000..aa737119 --- /dev/null +++ b/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py @@ -0,0 +1,48 @@ +"""Definition of the DepthsMaterialsColumnPairs class.""" + +from dataclasses import dataclass + +import fitz + + +@dataclass +class BoundingBox: + """A single bounding box, JSON serializable.""" + + rect: fitz.Rect + + def to_json(self) -> list[int]: + """Converts the object to a dictionary. + + Returns: + list[int]: The object as a list. + """ + return [ + self.rect.x0, + self.rect.y0, + self.rect.x1, + self.rect.y1, + ] + + +@dataclass +class BoundingBoxes: + """A class to represent the bounding boxes of depth columns and associated material descriptions.""" + + depth_column_bbox: BoundingBox | None + depth_column_entry_bboxes: list[BoundingBox] + material_description_bbox: BoundingBox + page: int + + def to_json(self) -> dict: + """Converts the object to a dictionary. + + Returns: + dict: The object as a dictionary. 
+ """ + return { + "depth_column_rect": self.depth_column_bbox.to_json() if self.depth_column_bbox else None, + "depth_column_entries": [entry.to_json for entry in self.depth_column_entry_bboxes], + "material_description_rect": self.material_description_bbox.to_json(), + "page": self.page, + } diff --git a/src/stratigraphy/depths_materials_column_pairs/depths_materials_column_pairs.py b/src/stratigraphy/depths_materials_column_pairs/depths_materials_column_pairs.py index d7efd203..2cb51d85 100644 --- a/src/stratigraphy/depths_materials_column_pairs/depths_materials_column_pairs.py +++ b/src/stratigraphy/depths_materials_column_pairs/depths_materials_column_pairs.py @@ -14,24 +14,6 @@ class DepthsMaterialsColumnPair: depth_column: DepthColumn | None material_description_rect: fitz.Rect - page: int - - def to_json(self) -> dict: - """Converts the object to a dictionary. - - Returns: - dict: The object as a dictionary. - """ - return { - "depth_column": self.depth_column.to_json() if self.depth_column else None, - "material_description_rect": [ - self.material_description_rect.x0, - self.material_description_rect.y0, - self.material_description_rect.x1, - self.material_description_rect.y1, - ], - "page": self.page, - } def score_column_match(self, all_words: list[TextWord] | None = None) -> float: """Scores the match between a depth column and a material description. 
diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index d7ddf88b..53094f57 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -8,6 +8,7 @@ from stratigraphy.data_extractor.data_extractor import FeatureOnPage from stratigraphy.depthcolumn import find_depth_columns from stratigraphy.depthcolumn.depthcolumn import DepthColumn +from stratigraphy.depths_materials_column_pairs.bounding_boxes import BoundingBox, BoundingBoxes from stratigraphy.depths_materials_column_pairs.depths_materials_column_pairs import DepthsMaterialsColumnPair from stratigraphy.layer.layer import IntervalBlockPair, Layer from stratigraphy.layer.layer_identifier_column import ( @@ -35,7 +36,7 @@ class ProcessPageResult: """The result of processing a single page of a pdf.""" predictions: list[Layer] - depth_material_pairs: list[DepthsMaterialsColumnPair] + bounding_boxes: list[BoundingBoxes] def process_page( @@ -71,10 +72,9 @@ def process_page( ) if material_description_rect: depths_materials_column_pairs.append( - DepthsMaterialsColumnPair(layer_identifier_column, material_description_rect, page=page_number) + DepthsMaterialsColumnPair(layer_identifier_column, material_description_rect) ) - # Obtain the best pair. In contrast to depth columns, there only ever is one layer index column per page. 
if depths_materials_column_pairs: depths_materials_column_pairs.sort(key=lambda pair: pair.score_column_match()) @@ -108,7 +108,7 @@ def process_page( ) if material_description_rect: depths_materials_column_pairs.append( - DepthsMaterialsColumnPair(depth_column, material_description_rect, page=page_number) + DepthsMaterialsColumnPair(depth_column, material_description_rect) ) # lowest score first depths_materials_column_pairs.sort(key=lambda pair: pair.score_column_match(words)) @@ -125,7 +125,6 @@ def process_page( ] pairs: list[IntervalBlockPair] = [] # list of matched depth intervals and text blocks - # groups is of the form: [{"depth_interval": BoundaryInterval, "block": TextBlock}] if filtered_depth_material_column_pairs: # match depth column items with material description for pair in filtered_depth_material_column_pairs: description_lines = get_description_lines(lines, pair.material_description_rect) @@ -135,7 +134,6 @@ def process_page( ) pairs.extend(new_pairs) else: - filtered_depth_material_column_pairs = [] # Fallback when no depth column was found material_description_rect = find_material_description_column( lines, depth_column=None, language=language, **params["material_description"] @@ -150,13 +148,21 @@ def process_page( params["left_line_length_threshold"], ) pairs.extend([IntervalBlockPair(block=block, depth_interval=None) for block in description_blocks]) - filtered_depth_material_column_pairs.extend( - [ - DepthsMaterialsColumnPair( - depth_column=None, material_description_rect=material_description_rect, page=page_number - ) - ] - ) + + bounding_boxes = [] + for pair in filtered_depth_material_column_pairs: + if pair.depth_column: + depth_column_bbox = BoundingBox(pair.depth_column.rect()) + depth_column_entry_bboxes = [BoundingBox(entry.rect) for entry in pair.depth_column.entries] + else: + depth_column_bbox = None + depth_column_entry_bboxes = [] + BoundingBoxes( + depth_column_bbox=depth_column_bbox, + 
depth_column_entry_bboxes=depth_column_entry_bboxes, + material_description_bbox=BoundingBox(pair.material_description_rect), + page=page_number, + ) layer_predictions = [ Layer( @@ -182,7 +188,7 @@ def process_page( for pair in pairs ] layer_predictions = [layer for layer in layer_predictions if layer.description_nonempty()] - return ProcessPageResult(layer_predictions, filtered_depth_material_column_pairs) + return ProcessPageResult(layer_predictions, bounding_boxes) def match_columns( @@ -449,7 +455,7 @@ def is_below(best_x0, best_y1, line): if depth_column: return max( candidate_rects, - key=lambda rect: DepthsMaterialsColumnPair(depth_column, rect, page=0).score_column_match(), + key=lambda rect: DepthsMaterialsColumnPair(depth_column, rect).score_column_match(), ) else: return candidate_rects[0] diff --git a/src/stratigraphy/layer/layer_identifier_column.py b/src/stratigraphy/layer/layer_identifier_column.py index 670b4477..801a41f8 100644 --- a/src/stratigraphy/layer/layer_identifier_column.py +++ b/src/stratigraphy/layer/layer_identifier_column.py @@ -25,17 +25,6 @@ def __init__(self, rect: fitz.Rect, text: str): def __repr__(self): return str(self.text) - def to_json(self): - """Convert the layer identifier entry to a JSON serializable format. - - Returns: - dict: The JSON serializable format of the layer identifier entry. - """ - return { - "text": self.text, - "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1], - } - @dataclass class LayerIdentifierColumn(DepthColumn[LayerIdentifierEntry]): @@ -149,41 +138,6 @@ def is_contained(self, rect: fitz.Rect) -> bool: and self.rect().y1 <= rect.y1 ) - def to_json(self): - """Convert the layer identifier column to a JSON serializable format. - - Returns: - dict: The JSON serializable format of the layer identifier column. 
- """ - rect = self.rect() - return { - "rect": [rect.x0, rect.y0, rect.x1, rect.y1], - "entries": [entry.to_json() for entry in self.entries], - "type": "LayerIdentifierColumn", - } - - @classmethod - def from_json(cls, data: dict) -> "LayerIdentifierColumn": - """Converts a dictionary to an object. - - Args: - data (dict): A dictionary containing 'entries' list with 'rect' and 'text' fields. - - Raises: - ValueError: If the input dictionary is missing required fields or has invalid data. - - Returns: - LayerIdentifierColumn: The layer identifier column object. - """ - if not isinstance(data, dict) or "entries" not in data: - raise ValueError("Invalid input: data must be a dictionary with 'entries' field") - - return LayerIdentifierColumn( - entries=[ - LayerIdentifierEntry(rect=fitz.Rect(entry["rect"]), text=entry["text"]) for entry in data["entries"] - ] - ) - def find_layer_identifier_column_entries(lines: list[TextLine]) -> list[LayerIdentifierEntry]: r"""Find the layer identifier column entries. 
diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index 35a56c10..9b3ba10b 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -230,7 +230,7 @@ def start_pipeline( # Initialize common variables groundwater_entries = GroundwaterInDocument(filename=filename, groundwater=[]) layers_in_document = LayersInDocument([], filename) - depths_materials_columns_pairs = [] + bounding_boxes = [] if part == "all": # Extract the groundwater levels @@ -262,7 +262,7 @@ def start_pipeline( layer_predictions = process_page_results.predictions layers_in_document.layers.extend(layer_predictions) - depths_materials_columns_pairs.extend(process_page_results.depth_material_pairs) + bounding_boxes.extend(process_page_results.bounding_boxes) if draw_lines: # could be changed to if draw_lines and mflow_tracking: if not mlflow_tracking: @@ -282,7 +282,7 @@ def start_pipeline( metadata=metadata, groundwater=groundwater_entries, layers_in_document=layers_in_document, - depths_materials_columns_pairs=depths_materials_columns_pairs, + bounding_boxes=bounding_boxes, ) ) diff --git a/src/stratigraphy/util/predictions.py b/src/stratigraphy/util/predictions.py index 77660d41..37414381 100644 --- a/src/stratigraphy/util/predictions.py +++ b/src/stratigraphy/util/predictions.py @@ -5,7 +5,7 @@ from stratigraphy.benchmark.ground_truth import GroundTruth from stratigraphy.benchmark.metrics import OverallMetricsCatalog from stratigraphy.data_extractor.data_extractor import FeatureOnPage -from stratigraphy.depths_materials_column_pairs.depths_materials_column_pairs import DepthsMaterialsColumnPair +from stratigraphy.depths_materials_column_pairs.bounding_boxes import BoundingBoxes from stratigraphy.evaluation.evaluation_dataclasses import OverallBoreholeMetadataMetrics from stratigraphy.evaluation.groundwater_evaluator import GroundwaterEvaluator from stratigraphy.evaluation.layer_evaluator import LayerEvaluator @@ -26,10 +26,10 @@ def __init__( file_name: str, metadata: 
BoreholeMetadata, groundwater: GroundwaterInDocument, - depths_materials_columns_pairs: list[DepthsMaterialsColumnPair], + bounding_boxes: list[BoundingBoxes], ): self.layers_in_document: LayersInDocument = layers_in_document - self.depths_materials_columns_pairs: list[DepthsMaterialsColumnPair] = depths_materials_columns_pairs + self.bounding_boxes: list[BoundingBoxes] = bounding_boxes self.file_name: str = file_name self.metadata: BoreholeMetadata = metadata self.groundwater: GroundwaterInDocument = groundwater @@ -43,9 +43,7 @@ def to_json(self) -> dict: return { "metadata": self.metadata.to_json(), "layers": [layer.to_json() for layer in self.layers_in_document.layers], - "depths_materials_column_pairs": [dmc_pair.to_json() for dmc_pair in self.depths_materials_columns_pairs] - if self.depths_materials_columns_pairs is not None - else [], + "bounding_boxes": [bboxes.to_json() for bboxes in self.bounding_boxes], "page_dimensions": self.metadata.page_dimensions, # TODO: Remove, already in metadata "groundwater": self.groundwater.to_json() if self.groundwater is not None else [], "file_name": self.file_name, @@ -103,10 +101,7 @@ def from_json(cls, prediction_from_file: dict) -> "OverallFilePredictions": layers = [Layer.from_json(data) for data in file_data["layers"]] layers_in_doc = LayersInDocument(layers=layers, filename=file_name) - depths_materials_columns_pairs = [ - DepthsMaterialsColumnPair.from_json(dmc_pair) - for dmc_pair in file_data["depths_materials_column_pairs"] - ] + bounding_boxes = [BoundingBoxes.from_json(bboxes) for bboxes in file_data["bounding_boxes"]] groundwater_entries = [FeatureOnPage.from_json(entry, Groundwater) for entry in file_data["groundwater"]] groundwater_in_document = GroundwaterInDocument(groundwater=groundwater_entries, filename=file_name) @@ -115,7 +110,7 @@ def from_json(cls, prediction_from_file: dict) -> "OverallFilePredictions": layers_in_document=layers_in_doc, file_name=file_name, metadata=metadata, - 
depths_materials_columns_pairs=depths_materials_columns_pairs, + bounding_boxes=bounding_boxes, groundwater=groundwater_in_document, ) ) diff --git a/tests/test_predictions.py b/tests/test_predictions.py index c692f951..0fcc99d2 100644 --- a/tests/test_predictions.py +++ b/tests/test_predictions.py @@ -49,7 +49,7 @@ def sample_file_prediction() -> FilePredictions: file_name="test_file", metadata=metadata, groundwater=groundwater_in_doc, - depths_materials_columns_pairs=[], + bounding_boxes=[], ) From cb042bf379f2d4d243aceee9a983cfc486a43a31 Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Mon, 11 Nov 2024 15:46:36 +0100 Subject: [PATCH 05/20] LGVISIUM-102: fixes for bboxes --- src/stratigraphy/annotations/draw.py | 20 ++++++++----------- .../bounding_boxes.py | 2 +- src/stratigraphy/extract.py | 12 ++++++----- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/src/stratigraphy/annotations/draw.py b/src/stratigraphy/annotations/draw.py index 1549fa7a..67b4cb38 100644 --- a/src/stratigraphy/annotations/draw.py +++ b/src/stratigraphy/annotations/draw.py @@ -8,8 +8,7 @@ import pandas as pd from dotenv import load_dotenv from stratigraphy.data_extractor.data_extractor import FeatureOnPage -from stratigraphy.depthcolumn.depthcolumn import DepthColumn -from stratigraphy.depths_materials_column_pairs.depths_materials_column_pairs import DepthsMaterialsColumnPair +from stratigraphy.depths_materials_column_pairs.bounding_boxes import BoundingBoxes from stratigraphy.groundwater.groundwater_extraction import Groundwater from stratigraphy.layer.layer import Layer from stratigraphy.metadata.coordinate_extraction import Coordinate @@ -245,7 +244,7 @@ def draw_material_descriptions(shape: fitz.Shape, derotation_matrix: fitz.Matrix def draw_depth_columns_and_material_rect( - shape: fitz.Shape, derotation_matrix: fitz.Matrix, depths_materials_column_pairs: list[DepthsMaterialsColumnPair] + shape: fitz.Shape, derotation_matrix: fitz.Matrix, bounding_boxes: 
list[BoundingBoxes] ): """Draw depth columns as well as the material rects on a pdf page. @@ -257,25 +256,22 @@ def draw_depth_columns_and_material_rect( Args: shape (fitz.Shape): The shape object for drawing. derotation_matrix (fitz.Matrix): The derotation matrix of the page. - depths_materials_column_pairs (list): List of depth column entries. + bounding_boxes (list[BoundingBoxes]): List of bounding boxes for depth column and material descriptions. """ - for pair in depths_materials_column_pairs: - depth_column: DepthColumn = pair.depth_column - material_description_rect = pair.material_description_rect - - if depth_column: # Draw rectangle for depth columns + for bboxes in bounding_boxes: + if bboxes.depth_column_bbox: # Draw rectangle for depth columns shape.draw_rect( - fitz.Rect(depth_column.rect()) * derotation_matrix, + fitz.Rect(bboxes.depth_column_bbox.rect) * derotation_matrix, ) shape.finish(color=fitz.utils.getColor("green")) - for depth_column_entry in depth_column.entries: # Draw rectangle for depth column entries + for depth_column_entry in bboxes.depth_column_entry_bboxes: # Draw rectangle for depth column entries shape.draw_rect( fitz.Rect(depth_column_entry.rect) * derotation_matrix, ) shape.finish(color=fitz.utils.getColor("purple")) shape.draw_rect( # Draw rectangle for material description column - fitz.Rect(material_description_rect) * derotation_matrix, + bboxes.material_description_bbox.rect * derotation_matrix, ) shape.finish(color=fitz.utils.getColor("red")) diff --git a/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py b/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py index aa737119..4aa53397 100644 --- a/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py +++ b/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py @@ -42,7 +42,7 @@ def to_json(self) -> dict: """ return { "depth_column_rect": self.depth_column_bbox.to_json() if self.depth_column_bbox else None, - 
"depth_column_entries": [entry.to_json for entry in self.depth_column_entry_bboxes], + "depth_column_entries": [entry.to_json() for entry in self.depth_column_entry_bboxes], "material_description_rect": self.material_description_bbox.to_json(), "page": self.page, } diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 53094f57..215aca0d 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -157,11 +157,13 @@ def process_page( else: depth_column_bbox = None depth_column_entry_bboxes = [] - BoundingBoxes( - depth_column_bbox=depth_column_bbox, - depth_column_entry_bboxes=depth_column_entry_bboxes, - material_description_bbox=BoundingBox(pair.material_description_rect), - page=page_number, + bounding_boxes.append( + BoundingBoxes( + depth_column_bbox=depth_column_bbox, + depth_column_entry_bboxes=depth_column_entry_bboxes, + material_description_bbox=BoundingBox(pair.material_description_rect), + page=page_number, + ) ) layer_predictions = [ From ecb6727e492c53a6602b39399ac623f577c6109e Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Mon, 11 Nov 2024 16:40:46 +0100 Subject: [PATCH 06/20] LGVISIUM-102: BoundingBoxes.from_json --- .../bounding_boxes.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py b/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py index 4aa53397..f048431d 100644 --- a/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py +++ b/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py @@ -24,6 +24,10 @@ def to_json(self) -> list[int]: self.rect.y1, ] + @classmethod + def from_json(cls, data) -> "BoundingBox": + return cls(rect=fitz.Rect(data)) + @dataclass class BoundingBoxes: @@ -46,3 +50,14 @@ def to_json(self) -> dict: "material_description_rect": self.material_description_bbox.to_json(), "page": self.page, } + + @classmethod + def from_json(cls, data) -> "BoundingBoxes": + return 
cls( + depth_column_bbox=BoundingBox.from_json(data["depth_column_bbox"]) + if "depth_column_bbox" in data + else None, + depth_column_entry_bboxes=[BoundingBox.from_json(entry) for entry in data["depth_column_entries"]], + material_description_bbox=BoundingBox.from_json(data["material_description_rect"]), + page=data["page"], + ) From 1c7bdce6f81d8e932818c4e4678ff572d254de0a Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Mon, 11 Nov 2024 16:46:37 +0100 Subject: [PATCH 07/20] LGVISIUM-102: keep material description without depth interval --- src/stratigraphy/extract.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 215aca0d..23e4a3db 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -242,13 +242,12 @@ def transform_groups( Returns: List[IntervalBlockPair]: Pairing of text blocks and depth intervals. """ - if len(depth_intervals) == 0: - return [] - elif len(depth_intervals) == 1: + if len(depth_intervals) <= 1: concatenated_block = TextBlock( [line for block in blocks for line in block.lines] ) # concatenate all text lines within a block; line separation flag does not matter here. 
- return [IntervalBlockPair(depth_interval=depth_intervals[0], block=concatenated_block)] + depth_interval = depth_intervals[0] if len(depth_intervals) else None + return [IntervalBlockPair(depth_interval=depth_interval, block=concatenated_block)] else: if len(blocks) < len(depth_intervals): blocks = split_blocks_by_textline_length(blocks, target_split_count=len(depth_intervals) - len(blocks)) From 03d9201c10619c26a853211b6a3cc4971f8d5b7a Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Mon, 11 Nov 2024 16:53:01 +0100 Subject: [PATCH 08/20] LGVISIUM-102: cleanup --- .../bounding_boxes.py | 18 +++++++++++++++++ src/stratigraphy/extract.py | 20 ++----------------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py b/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py index f048431d..dd4795cb 100644 --- a/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py +++ b/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py @@ -3,6 +3,7 @@ from dataclasses import dataclass import fitz +from stratigraphy.depths_materials_column_pairs.depths_materials_column_pairs import DepthsMaterialsColumnPair @dataclass @@ -53,6 +54,7 @@ def to_json(self) -> dict: @classmethod def from_json(cls, data) -> "BoundingBoxes": + """Convert a JSON data structure to a BoundingBoxes object.""" return cls( depth_column_bbox=BoundingBox.from_json(data["depth_column_bbox"]) if "depth_column_bbox" in data @@ -61,3 +63,19 @@ def from_json(cls, data) -> "BoundingBoxes": material_description_bbox=BoundingBox.from_json(data["material_description_rect"]), page=data["page"], ) + + @classmethod + def from_depths_material_column_pair(cls, pair: DepthsMaterialsColumnPair, page_number: int) -> "BoundingBoxes": + """Convert a DepthsMaterialsColumnPair instance to a BoundingBoxes object.""" + if pair.depth_column: + depth_column_bbox = BoundingBox(pair.depth_column.rect()) + 
depth_column_entry_bboxes = [BoundingBox(entry.rect) for entry in pair.depth_column.entries] + else: + depth_column_bbox = None + depth_column_entry_bboxes = [] + return BoundingBoxes( + depth_column_bbox=depth_column_bbox, + depth_column_entry_bboxes=depth_column_entry_bboxes, + material_description_bbox=BoundingBox(pair.material_description_rect), + page=page_number, + ) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 23e4a3db..7adf30b0 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -8,7 +8,7 @@ from stratigraphy.data_extractor.data_extractor import FeatureOnPage from stratigraphy.depthcolumn import find_depth_columns from stratigraphy.depthcolumn.depthcolumn import DepthColumn -from stratigraphy.depths_materials_column_pairs.bounding_boxes import BoundingBox, BoundingBoxes +from stratigraphy.depths_materials_column_pairs.bounding_boxes import BoundingBoxes from stratigraphy.depths_materials_column_pairs.depths_materials_column_pairs import DepthsMaterialsColumnPair from stratigraphy.layer.layer import IntervalBlockPair, Layer from stratigraphy.layer.layer_identifier_column import ( @@ -149,23 +149,6 @@ def process_page( ) pairs.extend([IntervalBlockPair(block=block, depth_interval=None) for block in description_blocks]) - bounding_boxes = [] - for pair in filtered_depth_material_column_pairs: - if pair.depth_column: - depth_column_bbox = BoundingBox(pair.depth_column.rect()) - depth_column_entry_bboxes = [BoundingBox(entry.rect) for entry in pair.depth_column.entries] - else: - depth_column_bbox = None - depth_column_entry_bboxes = [] - bounding_boxes.append( - BoundingBoxes( - depth_column_bbox=depth_column_bbox, - depth_column_entry_bboxes=depth_column_entry_bboxes, - material_description_bbox=BoundingBox(pair.material_description_rect), - page=page_number, - ) - ) - layer_predictions = [ Layer( material_description=FeatureOnPage( @@ -190,6 +173,7 @@ def process_page( for pair in pairs ] 
layer_predictions = [layer for layer in layer_predictions if layer.description_nonempty()] + bounding_boxes = [BoundingBoxes.from_depths_material_column_pair(pair, page_number) for pair in pairs] return ProcessPageResult(layer_predictions, bounding_boxes) From 77c554bd7e9ce9dc091941a533933e83e6cad077 Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Mon, 11 Nov 2024 17:38:26 +0100 Subject: [PATCH 09/20] LGVISIUM-102: cleanup --- src/stratigraphy/extract.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 7adf30b0..23aa28f1 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -8,7 +8,7 @@ from stratigraphy.data_extractor.data_extractor import FeatureOnPage from stratigraphy.depthcolumn import find_depth_columns from stratigraphy.depthcolumn.depthcolumn import DepthColumn -from stratigraphy.depths_materials_column_pairs.bounding_boxes import BoundingBoxes +from stratigraphy.depths_materials_column_pairs.bounding_boxes import BoundingBox, BoundingBoxes from stratigraphy.depths_materials_column_pairs.depths_materials_column_pairs import DepthsMaterialsColumnPair from stratigraphy.layer.layer import IntervalBlockPair, Layer from stratigraphy.layer.layer_identifier_column import ( @@ -126,6 +126,10 @@ def process_page( pairs: list[IntervalBlockPair] = [] # list of matched depth intervals and text blocks if filtered_depth_material_column_pairs: # match depth column items with material description + bounding_boxes = [ + BoundingBoxes.from_depths_material_column_pair(pair, page_number) + for pair in filtered_depth_material_column_pairs + ] for pair in filtered_depth_material_column_pairs: description_lines = get_description_lines(lines, pair.material_description_rect) if len(description_lines) > 1: @@ -138,7 +142,16 @@ def process_page( material_description_rect = find_material_description_column( lines, depth_column=None, language=language, 
**params["material_description"] ) + bounding_boxes = [] if material_description_rect: + bounding_boxes.append( + BoundingBoxes( + depth_column_bbox=None, + depth_column_entry_bboxes=[], + material_description_bbox=BoundingBox(material_description_rect), + page=page_number, + ) + ) description_lines = get_description_lines(lines, material_description_rect) description_blocks = get_description_blocks( description_lines, @@ -173,7 +186,6 @@ def process_page( for pair in pairs ] layer_predictions = [layer for layer in layer_predictions if layer.description_nonempty()] - bounding_boxes = [BoundingBoxes.from_depths_material_column_pair(pair, page_number) for pair in pairs] return ProcessPageResult(layer_predictions, bounding_boxes) From 609283e344342792b2f8e0e3354a22a2c457a6f6 Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Tue, 12 Nov 2024 13:27:53 +0100 Subject: [PATCH 10/20] LGVISIUM-102: rename DepthColumn -> Sidebar --- src/stratigraphy/annotations/draw.py | 4 +- src/stratigraphy/depthcolumn/depthcolumn.py | 412 ------------------ .../depthcolumn/depthcolumnentry.py | 6 +- .../bounding_boxes.py | 26 +- ...material_description_rect_with_sidebar.py} | 16 +- src/stratigraphy/extract.py | 131 +++--- src/stratigraphy/layer/layer.py | 6 +- src/stratigraphy/sidebar/__init__.py | 9 + src/stratigraphy/sidebar/a_above_b_sidebar.py | 219 ++++++++++ .../a_above_b_sidebar_validator.py} | 46 +- src/stratigraphy/sidebar/a_to_b_sidebar.py | 114 +++++ .../find_sidebars.py} | 68 +-- .../layer_identifier_sidebar.py} | 50 ++- src/stratigraphy/sidebar/sidebar.py | 108 +++++ src/stratigraphy/util/interval.py | 19 +- tests/test_depthcolumn.py | 10 +- ..._depth_columns.py => test_find_sidebar.py} | 24 +- tests/test_interval.py | 16 +- 18 files changed, 657 insertions(+), 627 deletions(-) delete mode 100644 src/stratigraphy/depthcolumn/depthcolumn.py rename src/stratigraphy/depths_materials_column_pairs/{depths_materials_column_pairs.py => material_description_rect_with_sidebar.py} 
(61%) create mode 100644 src/stratigraphy/sidebar/__init__.py create mode 100644 src/stratigraphy/sidebar/a_above_b_sidebar.py rename src/stratigraphy/{depthcolumn/boundarydepthcolumnvalidator.py => sidebar/a_above_b_sidebar_validator.py} (77%) create mode 100644 src/stratigraphy/sidebar/a_to_b_sidebar.py rename src/stratigraphy/{depthcolumn/find_depth_columns.py => sidebar/find_sidebars.py} (80%) rename src/stratigraphy/{layer/layer_identifier_column.py => sidebar/layer_identifier_sidebar.py} (82%) create mode 100644 src/stratigraphy/sidebar/sidebar.py rename tests/{test_find_depth_columns.py => test_find_sidebar.py} (94%) diff --git a/src/stratigraphy/annotations/draw.py b/src/stratigraphy/annotations/draw.py index 67b4cb38..de513c69 100644 --- a/src/stratigraphy/annotations/draw.py +++ b/src/stratigraphy/annotations/draw.py @@ -259,9 +259,9 @@ def draw_depth_columns_and_material_rect( bounding_boxes (list[BoundingBoxes]): List of bounding boxes for depth column and material descriptions. 
""" for bboxes in bounding_boxes: - if bboxes.depth_column_bbox: # Draw rectangle for depth columns + if bboxes.sidebar_bbox: # Draw rectangle for depth columns shape.draw_rect( - fitz.Rect(bboxes.depth_column_bbox.rect) * derotation_matrix, + fitz.Rect(bboxes.sidebar_bbox.rect) * derotation_matrix, ) shape.finish(color=fitz.utils.getColor("green")) for depth_column_entry in bboxes.depth_column_entry_bboxes: # Draw rectangle for depth column entries diff --git a/src/stratigraphy/depthcolumn/depthcolumn.py b/src/stratigraphy/depthcolumn/depthcolumn.py deleted file mode 100644 index a9e4f7fe..00000000 --- a/src/stratigraphy/depthcolumn/depthcolumn.py +++ /dev/null @@ -1,412 +0,0 @@ -"""This module contains the DepthColumn class, which is used to represent a depth column in a pdf page.""" - -from __future__ import annotations - -import abc -from dataclasses import dataclass -from typing import Generic, TypeVar - -import fitz -import numpy as np -from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry, LayerDepthColumnEntry -from stratigraphy.layer.layer import IntervalBlockGroup -from stratigraphy.lines.line import TextLine, TextWord -from stratigraphy.text.find_description import get_description_blocks -from stratigraphy.util.dataclasses import Line -from stratigraphy.util.interval import BoundaryInterval, LayerInterval - -EntryT = TypeVar("EntryT", bound=DepthColumnEntry) - - -@dataclass -class DepthColumn(abc.ABC, Generic[EntryT]): - """Abstract DepthColumn class.""" - - entries: list[EntryT] - - def rects(self) -> list[fitz.Rect]: - """Get the rectangles of the depth column entries.""" - return [entry.rect for entry in self.entries] - - def rect(self) -> fitz.Rect: - """Get the bounding box of the depth column entries.""" - x0 = min([rect.x0 for rect in self.rects()]) - x1 = max([rect.x1 for rect in self.rects()]) - y0 = min([rect.y0 for rect in self.rects()]) - y1 = max([rect.y1 for rect in self.rects()]) - return fitz.Rect(x0, y0, x1, y1) - - 
@property - def max_x0(self) -> float: - """Get the maximum x0 value of the depth column entries.""" - return max([rect.x0 for rect in self.rects()]) - - @property - def min_x1(self) -> float: - """Get the minimum x1 value of the depth column entries.""" - return min([rect.x1 for rect in self.rects()]) - - def noise_count(self, all_words: list[TextWord]) -> int: - """Counts the number of words that intersect with the depth column entries. - - Returns the number of words that intersect with the depth column entries, but are not part of the depth column. - - Args: - all_words (list[TextWord]): A list of all text lines on the page. - - Returns: - int: The number of words that intersect with the depth column entries but are not part of it. - """ - - def significant_intersection(other_rect): - intersection = fitz.Rect(other_rect).intersect(self.rect()) - return intersection.is_valid and intersection.width > 0.25 * self.rect().width - - return len([word for word in all_words if significant_intersection(word.rect)]) - len(self.entries) - - @abc.abstractmethod - def identify_groups( - self, - description_lines: list[TextLine], - geometric_lines: list[Line], - material_description_rect: fitz.Rect, - **params, - ) -> list[IntervalBlockGroup]: - """Identifies groups of description blocks that correspond to depth intervals. - - Args: - description_lines (list[TextLine]): A list of text lines that are part of the description. - geometric_lines (list[Line]): A list of geometric lines that are part of the description. - material_description_rect (fitz.Rect): The bounding box of the material description. - params (dict): A dictionary of relevant parameters. - - Returns: - list[IntervalBlockGroup]: A list of groups, where each group is a IntervalBlockGroup. - """ - pass - - def can_be_appended(self, rect: fitz.Rect) -> bool: - """Checks if a new depth column entry can be appended to the current depth column. 
- - Check if the middle of the new rect is between the outer horizontal boundaries of the column, and if there is - an intersection with the minimal horizontal boundaries of the column. - - The checks are: - - The width of the new rectangle is greater than the width of the current depth column. Or; - - The middle of the new rectangle is within the horizontal boundaries of the current depth column. - - The new rectangle intersects with the minimal horizontal boundaries of the current depth column. - - Args: - rect (fitz.Rect): Rect of the depth column entry to be appended. - - Returns: - bool: True if the new depth column entry can be appended, False otherwise. - """ - new_middle = (rect.x0 + rect.x1) / 2 - if (self.rect().width < rect.width or self.rect().x0 < new_middle < self.rect().x1) and ( - rect.x0 <= self.min_x1 and self.max_x0 <= rect.x1 - ): - return True - return False - - -@dataclass -class LayerDepthColumn(DepthColumn[LayerDepthColumnEntry]): - """Represents a depth column where the upper and lower depths of each layer are explicitly specified. - - Example:: - 0 - 0.1m: xxx - 0.1 - 0.3m: yyy - 0.3 - 0.8m: zzz - ... - """ - - entries: list[LayerDepthColumnEntry] - - def __repr__(self): - """Converts the object to a string. - - Returns: - str: The object as a string. - """ - return "LayerDepthColumn({})".format(", ".join([str(entry) for entry in self.entries])) - - def depth_intervals(self) -> list[LayerInterval]: - return [LayerInterval(entry) for entry in self.entries] - - def break_on_mismatch(self) -> list[LayerDepthColumn]: - """Breaks the depth column into segments where the depth intervals are not in an arithmetic progression. - - Returns: - list[LayerDepthColumn]: A list of depth column segments. 
- """ - segments = [] - segment_start = 0 - for index, current_entry in enumerate(self.entries): - if index >= 1 and current_entry.start.value < self.entries[index - 1].end.value: - # (_, big) || (small, _) - segments.append(self.entries[segment_start:index]) - segment_start = index - - final_segment = self.entries[segment_start:] - if final_segment: - segments.append(final_segment) - - return [LayerDepthColumn(segment) for segment in segments] - - def is_valid(self) -> bool: - """Checks if the depth column is valid. - - A depth column is valid if it is strictly increasing and the depth intervals are significant. - - Returns: - bool: True if the depth column is valid, False otherwise. - """ - if len(self.entries) <= 2: - return False - - # At least half of the "end" values must match the subsequent "start" value (e.g. 2-5m, 5-9m). - sequence_matches_count = 0 - for index, entry in enumerate(self.entries): - if index >= 1 and self.entries[index - 1].end.value == entry.start.value: - sequence_matches_count += 1 - - return sequence_matches_count / (len(self.entries) - 1) > 0.5 - - def identify_groups( - self, - description_lines: list[TextLine], - geometric_lines: list[Line], - material_description_rect: fitz.Rect, - **params, - ) -> list[IntervalBlockGroup]: - """Identifies groups of description blocks that correspond to depth intervals. - - Args: - description_lines (list[TextLine]): A list of text lines that are part of the description. - geometric_lines (list[Line]): A list of geometric lines that are part of the description. - material_description_rect (fitz.Rect): The bounding box of the material description. - params (dict): A dictionary of relevant parameters. - - Returns: - list[IntervalBlockGroup]: A list of groups, where each group is a IntervalBlockGroup. 
- """ - depth_intervals = self.depth_intervals() - - groups = [] - line_index = 0 - - for interval_index, interval in enumerate(depth_intervals): - # don't allow a layer above depth 0 - if interval.start is None and interval.end.value == 0: - continue - - next_interval = depth_intervals[interval_index + 1] if interval_index + 1 < len(depth_intervals) else None - - matched_blocks = interval.matching_blocks(description_lines, line_index, next_interval) - line_index += sum([len(block.lines) for block in matched_blocks]) - groups.append(IntervalBlockGroup(depth_intervals=[interval], blocks=matched_blocks)) - return groups - - -@dataclass -class BoundaryDepthColumn(DepthColumn[DepthColumnEntry]): - """Represents a depth column. - - The depths of the boundaries between layers are labels, at a vertical position on - the page that is proportional to the depth. - - Example: - 0m - - 0.2m - - - 0.5m - ... - """ - - entries: list[DepthColumnEntry] - - def __repr__(self): - return "DepthColumn({})".format(", ".join([str(entry) for entry in self.entries])) - - def valid_initial_segment(self, rect: fitz.Rect) -> BoundaryDepthColumn: - for i in range(len(self.entries) - 1): - initial_segment = BoundaryDepthColumn(self.entries[: -i - 1]) - if initial_segment.can_be_appended(rect): - return initial_segment - return BoundaryDepthColumn(entries=[]) - - def strictly_contains(self, other: BoundaryDepthColumn) -> bool: - return len(other.entries) < len(self.entries) and all( - other_entry in self.entries for other_entry in other.entries - ) - - def is_strictly_increasing(self) -> bool: - return all(i.value < j.value for i, j in zip(self.entries, self.entries[1:], strict=False)) - - def depth_intervals(self) -> list[BoundaryInterval]: - """Creates a list of depth intervals from the depth column entries. - - The first depth interval has an open start value (i.e. None). - - Returns: - list[BoundaryInterval]: A list of depth intervals. 
- """ - depth_intervals = [BoundaryInterval(None, self.entries[0])] - for i in range(len(self.entries) - 1): - depth_intervals.append(BoundaryInterval(self.entries[i], self.entries[i + 1])) - depth_intervals.append( - BoundaryInterval(self.entries[len(self.entries) - 1], None) - ) # even though no open ended intervals are allowed, they are still useful for matching, - # especially for documents where the material description rectangle is too tall - # (and includes additional lines below the actual material descriptions). - return depth_intervals - - def significant_arithmetic_progression(self) -> bool: - # to allow for OCR errors or gaps in the progression, we only require a segment of length 6 that is an - # arithmetic progression - segment_length = 6 - if len(self.entries) < segment_length: - return self.is_arithmetic_progression() - else: - for i in range(len(self.entries) - segment_length + 1): - if BoundaryDepthColumn(self.entries[i : i + segment_length]).is_arithmetic_progression(): - return True - return False - - def is_arithmetic_progression(self) -> bool: - if len(self.entries) <= 2: - return True - - progression = np.array(range(len(self.entries))) - entries = np.array([entry.value for entry in self.entries]) - - # Avoid warnings in the np.corrcoef call, as the correlation coef is undefined if the standard deviation is 0. - if np.std(entries) == 0: - return False - - scale_pearson_correlation_coef = np.corrcoef(entries, progression)[0, 1].item() - return abs(scale_pearson_correlation_coef) >= 0.9999 - - def pearson_correlation_coef(self) -> float: - # We look at the lower y coordinate, because most often the baseline of the depth value text is aligned with - # the line of the corresponding layer boundary. - positions = np.array([entry.rect.y1 for entry in self.entries]) - entries = np.array([entry.value for entry in self.entries]) - - # Avoid warnings in the np.corrcoef call, as the correlation coef is undefined if the standard deviation is 0. 
- if np.std(entries) == 0 or np.std(positions) == 0: - return 0 - - return np.corrcoef(positions, entries)[0, 1].item() - - def remove_entry_by_correlation_gradient(self) -> BoundaryDepthColumn | None: - if len(self.entries) < 3: - return None - - new_columns = [ - BoundaryDepthColumn([entry for index, entry in enumerate(self.entries) if index != remove_index]) - for remove_index in range(len(self.entries)) - ] - return max(new_columns, key=lambda column: column.pearson_correlation_coef()) - - def break_on_double_descending(self) -> list[BoundaryDepthColumn]: - segments = [] - segment_start = 0 - for index, current_entry in enumerate(self.entries): - if ( - index >= 2 - and index + 1 < len(self.entries) - and current_entry.value < self.entries[index - 2].value - and current_entry.value < self.entries[index - 1].value - and self.entries[index + 1].value < self.entries[index - 2].value - and self.entries[index + 1].value < self.entries[index - 1].value - ): - # big big || small small - segments.append(self.entries[segment_start:index]) - segment_start = index - - final_segment = self.entries[segment_start:] - if final_segment: - segments.append(final_segment) - - return [BoundaryDepthColumn(segment) for segment in segments] - - def identify_groups( - self, - description_lines: list[TextLine], - geometric_lines: list[Line], - material_description_rect: fitz.Rect, - **params, - ) -> list[IntervalBlockGroup]: - """Identifies groups of description blocks that correspond to depth intervals. - - Note: includes a heuristic of whether there should be a group corresponding to a final depth interval - starting from the last depth entry without any end value. - - Args: - description_lines (list[TextLine]): A list of text lines that are part of the description. - geometric_lines (list[Line]): A list of geometric lines that are part of the description. - material_description_rect (fitz.Rect): The bounding box of the material description. 
- params (dict): A dictionary of relevant parameters. - - Returns: - list[IntervalBlockGroup]: A list of groups, where each group is a IntervalBlockGroup. - - Example: - [ - { - "depth_intervals": [BoundaryInterval(None, 0.1), BoundaryInterval(0.1, 0.3), ...], - "blocks": [DescriptionBlock(...), DescriptionBlock(...), ...] - }, - { - "depth_intervals": [BoundaryInterval(0.3, 0.7)], - "blocks": [DescriptionBlock(...), DescriptionBlock(...), ...] - }, - ... - ] - """ - depth_intervals = self.depth_intervals() - - groups = [] - - current_intervals = [] - current_blocks = [] - all_blocks = get_description_blocks( - description_lines, - geometric_lines, - material_description_rect, - params["block_line_ratio"], - left_line_length_threshold=params["left_line_length_threshold"], - target_layer_count=len(depth_intervals), - ) - - block_index = 0 - - for interval in depth_intervals: - # don't allow a layer above depth 0 - if interval.start is None and interval.end.value == 0: - continue - - pre, exact, post = interval.matching_blocks(all_blocks, block_index) - block_index += len(pre) + len(exact) + len(post) - - current_blocks.extend(pre) - if len(exact): - if len(current_intervals) > 0 or len(current_blocks) > 0: - groups.append(IntervalBlockGroup(depth_intervals=current_intervals, blocks=current_blocks)) - groups.append(IntervalBlockGroup(depth_intervals=[interval], blocks=exact)) - current_blocks = post - current_intervals = [] - else: - # The final open-ended interval should not be added, since borehole profiles do typically not come - # with open-ended intervals. 
- if interval.end is not None: - current_intervals.append(interval) - - if len(current_intervals) > 0 or len(current_blocks) > 0: - groups.append(IntervalBlockGroup(depth_intervals=current_intervals, blocks=current_blocks)) - - return groups diff --git a/src/stratigraphy/depthcolumn/depthcolumnentry.py b/src/stratigraphy/depthcolumn/depthcolumnentry.py index 529fd1fe..bea3778c 100644 --- a/src/stratigraphy/depthcolumn/depthcolumnentry.py +++ b/src/stratigraphy/depthcolumn/depthcolumnentry.py @@ -32,7 +32,7 @@ def from_json(cls, json_depth_column_entry: dict) -> "DepthColumnEntry": return cls(rect=fitz.Rect(json_depth_column_entry["rect"]), value=json_depth_column_entry["value"]) -class LayerDepthColumnEntry: # noqa: D101 +class AToBDepthColumnEntry: # noqa: D101 """Class to represent a layer depth column entry.""" def __init__(self, start: DepthColumnEntry, end: DepthColumnEntry): @@ -56,14 +56,14 @@ def to_json(self) -> dict[str, Any]: } @classmethod - def from_json(cls, json_layer_depth_column_entry: dict) -> "LayerDepthColumnEntry": + def from_json(cls, json_layer_depth_column_entry: dict) -> "AToBDepthColumnEntry": """Converts a dictionary to an object. Args: json_layer_depth_column_entry (dict): A dictionary representing the layer depth column entry. Returns: - LayerDepthColumnEntry: The layer depth column entry object. + AToBDepthColumnEntry: The layer depth column entry object. 
""" start = DepthColumnEntry.from_json(json_layer_depth_column_entry["start"]) end = DepthColumnEntry.from_json(json_layer_depth_column_entry["end"]) diff --git a/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py b/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py index dd4795cb..2731713f 100644 --- a/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py +++ b/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py @@ -3,7 +3,9 @@ from dataclasses import dataclass import fitz -from stratigraphy.depths_materials_column_pairs.depths_materials_column_pairs import DepthsMaterialsColumnPair +from stratigraphy.depths_materials_column_pairs.material_description_rect_with_sidebar import ( + MaterialDescriptionRectWithSidebar, +) @dataclass @@ -34,7 +36,7 @@ def from_json(cls, data) -> "BoundingBox": class BoundingBoxes: """A class to represent the bounding boxes of depth columns and associated material descriptions.""" - depth_column_bbox: BoundingBox | None + sidebar_bbox: BoundingBox | None depth_column_entry_bboxes: list[BoundingBox] material_description_bbox: BoundingBox page: int @@ -46,7 +48,7 @@ def to_json(self) -> dict: dict: The object as a dictionary. 
""" return { - "depth_column_rect": self.depth_column_bbox.to_json() if self.depth_column_bbox else None, + "sidebar_rect": self.sidebar_bbox.to_json() if self.sidebar_bbox else None, "depth_column_entries": [entry.to_json() for entry in self.depth_column_entry_bboxes], "material_description_rect": self.material_description_bbox.to_json(), "page": self.page, @@ -56,25 +58,25 @@ def to_json(self) -> dict: def from_json(cls, data) -> "BoundingBoxes": """Convert a JSON data structure to a BoundingBoxes object.""" return cls( - depth_column_bbox=BoundingBox.from_json(data["depth_column_bbox"]) - if "depth_column_bbox" in data - else None, + sidebar_bbox=BoundingBox.from_json(data["sidebar_rect"]) if "sidebar_rect" in data else None, depth_column_entry_bboxes=[BoundingBox.from_json(entry) for entry in data["depth_column_entries"]], material_description_bbox=BoundingBox.from_json(data["material_description_rect"]), page=data["page"], ) @classmethod - def from_depths_material_column_pair(cls, pair: DepthsMaterialsColumnPair, page_number: int) -> "BoundingBoxes": - """Convert a DepthsMaterialsColumnPair instance to a BoundingBoxes object.""" - if pair.depth_column: - depth_column_bbox = BoundingBox(pair.depth_column.rect()) - depth_column_entry_bboxes = [BoundingBox(entry.rect) for entry in pair.depth_column.entries] + def from_material_description_rect_with_sidebar( + cls, pair: MaterialDescriptionRectWithSidebar, page_number: int + ) -> "BoundingBoxes": + """Convert a MaterialDescriptionRectWithSidebar instance to a BoundingBoxes object.""" + if pair.sidebar: + depth_column_bbox = BoundingBox(pair.sidebar.rect()) + depth_column_entry_bboxes = [BoundingBox(entry.rect) for entry in pair.sidebar.entries] else: depth_column_bbox = None depth_column_entry_bboxes = [] return BoundingBoxes( - depth_column_bbox=depth_column_bbox, + sidebar_bbox=depth_column_bbox, depth_column_entry_bboxes=depth_column_entry_bboxes, 
material_description_bbox=BoundingBox(pair.material_description_rect), page=page_number, diff --git a/src/stratigraphy/depths_materials_column_pairs/depths_materials_column_pairs.py b/src/stratigraphy/depths_materials_column_pairs/material_description_rect_with_sidebar.py similarity index 61% rename from src/stratigraphy/depths_materials_column_pairs/depths_materials_column_pairs.py rename to src/stratigraphy/depths_materials_column_pairs/material_description_rect_with_sidebar.py index 2cb51d85..ec1d9557 100644 --- a/src/stratigraphy/depths_materials_column_pairs/depths_materials_column_pairs.py +++ b/src/stratigraphy/depths_materials_column_pairs/material_description_rect_with_sidebar.py @@ -4,19 +4,19 @@ from dataclasses import dataclass import fitz -from stratigraphy.depthcolumn.depthcolumn import DepthColumn from stratigraphy.lines.line import TextWord +from stratigraphy.sidebar import Sidebar @dataclass -class DepthsMaterialsColumnPair: - """A class to represent pairs of depth columns and material descriptions.""" +class MaterialDescriptionRectWithSidebar: + """A class to represent pairs of sidebar and material description rectangle.""" - depth_column: DepthColumn | None + sidebar: Sidebar | None material_description_rect: fitz.Rect - def score_column_match(self, all_words: list[TextWord] | None = None) -> float: - """Scores the match between a depth column and a material description. + def score_match(self, all_words: list[TextWord] | None = None) -> float: + """Scores the match between a sidebar and a material description. Args: all_words (list[TextWord] | None, optional): List of the available text words. Defaults to None. @@ -24,7 +24,7 @@ def score_column_match(self, all_words: list[TextWord] | None = None) -> float: Returns: float: The score of the match. 
""" - rect = self.depth_column.rect() + rect = self.sidebar.rect() top = rect.y0 bottom = rect.y1 right = rect.x1 @@ -36,6 +36,6 @@ def score_column_match(self, all_words: list[TextWord] | None = None) -> float: height = bottom - top - noise_count = self.depth_column.noise_count(all_words) if all_words else 0 + noise_count = self.sidebar.noise_count(all_words) if all_words else 0 return (height - distance) * math.pow(0.8, noise_count) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 23aa28f1..bfb43e54 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -6,23 +6,24 @@ import fitz from stratigraphy.data_extractor.data_extractor import FeatureOnPage -from stratigraphy.depthcolumn import find_depth_columns -from stratigraphy.depthcolumn.depthcolumn import DepthColumn from stratigraphy.depths_materials_column_pairs.bounding_boxes import BoundingBox, BoundingBoxes -from stratigraphy.depths_materials_column_pairs.depths_materials_column_pairs import DepthsMaterialsColumnPair -from stratigraphy.layer.layer import IntervalBlockPair, Layer -from stratigraphy.layer.layer_identifier_column import ( - find_layer_identifier_column, - find_layer_identifier_column_entries, +from stratigraphy.depths_materials_column_pairs.material_description_rect_with_sidebar import ( + MaterialDescriptionRectWithSidebar, ) +from stratigraphy.layer.layer import IntervalBlockPair, Layer from stratigraphy.lines.line import TextLine +from stratigraphy.sidebar import Sidebar, find_sidebars +from stratigraphy.sidebar.layer_identifier_sidebar import ( + find_layer_identifier_sidebar_entries, + find_layer_identifier_sidebars, +) from stratigraphy.text.find_description import ( get_description_blocks, get_description_lines, ) from stratigraphy.text.textblock import MaterialDescription, MaterialDescriptionLine, TextBlock, block_distance from stratigraphy.util.dataclasses import Line -from stratigraphy.util.interval import BoundaryInterval, Interval 
+from stratigraphy.util.interval import AAboveBInterval, Interval from stratigraphy.util.util import ( x_overlap, x_overlap_significant_smallest, @@ -60,93 +61,91 @@ def process_page( list[dict]: All list of the text of all description blocks. """ # Detect Layer Index Columns - layer_identifier_entries = find_layer_identifier_column_entries(lines) - layer_identifier_columns = ( - find_layer_identifier_column(layer_identifier_entries) if layer_identifier_entries else [] + layer_identifier_entries = find_layer_identifier_sidebar_entries(lines) + layer_identifier_sidebars = ( + find_layer_identifier_sidebars(layer_identifier_entries) if layer_identifier_entries else [] ) - depths_materials_column_pairs = [] - if layer_identifier_columns: - for layer_identifier_column in layer_identifier_columns: + material_descriptions_sidebar_pairs = [] + if layer_identifier_sidebars: + for layer_identifier_sidebar in layer_identifier_sidebars: material_description_rect = find_material_description_column( - lines, layer_identifier_column, language, **params["material_description"] + lines, layer_identifier_sidebar, language, **params["material_description"] ) if material_description_rect: - depths_materials_column_pairs.append( - DepthsMaterialsColumnPair(layer_identifier_column, material_description_rect) + material_descriptions_sidebar_pairs.append( + MaterialDescriptionRectWithSidebar(layer_identifier_sidebar, material_description_rect) ) - if depths_materials_column_pairs: - depths_materials_column_pairs.sort(key=lambda pair: pair.score_column_match()) + if material_descriptions_sidebar_pairs: + material_descriptions_sidebar_pairs.sort(key=lambda pair: pair.score_match()) - # If there is a layer identifier column, then we use this directly. - # Else, we search for depth columns. We could also think of some scoring mechanism to decide which one to use. - if not depths_materials_column_pairs: + # If there is a layer identifier sidebar, then we use this directly. 
+ # Else, we search for sidebars with depths. + # We could also think of some scoring mechanism to decide which one to use. + if not material_descriptions_sidebar_pairs: words = [word for line in lines for word in line.words] - depth_column_entries = find_depth_columns.depth_column_entries(words, include_splits=True) - layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words) + depth_column_entries = find_sidebars.depth_column_entries(words, include_splits=True) + a_to_b_sidebars = find_sidebars.find_a_to_b_sidebars(depth_column_entries, words) used_entry_rects = [] - for column in layer_depth_columns: + for column in a_to_b_sidebars: for entry in column.entries: used_entry_rects.extend([entry.start.rect, entry.end.rect]) depth_column_entries = [ entry - for entry in find_depth_columns.depth_column_entries(words, include_splits=False) + for entry in find_sidebars.depth_column_entries(words, include_splits=False) if entry.rect not in used_entry_rects ] - depth_columns: list[DepthColumn] = layer_depth_columns - depth_columns.extend( - find_depth_columns.find_depth_columns( - depth_column_entries, words, depth_column_params=params["depth_column_params"] + sidebars: list[Sidebar] = a_to_b_sidebars + sidebars.extend( + find_sidebars.find_a_above_b_sidebars( + depth_column_entries, words, sidebar_params=params["depth_column_params"] ) ) - for depth_column in depth_columns: + for sidebar in sidebars: material_description_rect = find_material_description_column( - lines, depth_column, language, **params["material_description"] + lines, sidebar, language, **params["material_description"] ) if material_description_rect: - depths_materials_column_pairs.append( - DepthsMaterialsColumnPair(depth_column, material_description_rect) + material_descriptions_sidebar_pairs.append( + MaterialDescriptionRectWithSidebar(sidebar, material_description_rect) ) # lowest score first - depths_materials_column_pairs.sort(key=lambda pair: 
pair.score_column_match(words)) + material_descriptions_sidebar_pairs.sort(key=lambda pair: pair.score_match(words)) to_delete = [] - for i, pair in enumerate(depths_materials_column_pairs): + for i, pair in enumerate(material_descriptions_sidebar_pairs): if any( pair.material_description_rect.intersects(other_pair.material_description_rect) - for other_pair in depths_materials_column_pairs[i + 1 :] + for other_pair in material_descriptions_sidebar_pairs[i + 1 :] ): to_delete.append(i) - filtered_depth_material_column_pairs = [ - item for index, item in enumerate(depths_materials_column_pairs) if index not in to_delete - ] + filtered_pairs = [item for index, item in enumerate(material_descriptions_sidebar_pairs) if index not in to_delete] pairs: list[IntervalBlockPair] = [] # list of matched depth intervals and text blocks - if filtered_depth_material_column_pairs: # match depth column items with material description + if filtered_pairs: # match sidebars with material description bounding_boxes = [ - BoundingBoxes.from_depths_material_column_pair(pair, page_number) - for pair in filtered_depth_material_column_pairs + BoundingBoxes.from_material_description_rect_with_sidebar(pair, page_number) for pair in filtered_pairs ] - for pair in filtered_depth_material_column_pairs: + for pair in filtered_pairs: description_lines = get_description_lines(lines, pair.material_description_rect) if len(description_lines) > 1: new_pairs = match_columns( - pair.depth_column, description_lines, geometric_lines, pair.material_description_rect, **params + pair.sidebar, description_lines, geometric_lines, pair.material_description_rect, **params ) pairs.extend(new_pairs) else: # Fallback when no depth column was found material_description_rect = find_material_description_column( - lines, depth_column=None, language=language, **params["material_description"] + lines, sidebar=None, language=language, **params["material_description"] ) bounding_boxes = [] if material_description_rect: 
bounding_boxes.append( BoundingBoxes( - depth_column_bbox=None, + sidebar_bbox=None, depth_column_entry_bboxes=[], material_description_bbox=BoundingBox(material_description_rect), page=page_number, @@ -179,7 +178,7 @@ def process_page( rect=pair.block.rect, page=page_number, ), - depth_interval=BoundaryInterval(start=pair.depth_interval.start, end=pair.depth_interval.end) + depth_interval=AAboveBInterval(start=pair.depth_interval.start, end=pair.depth_interval.end) if pair.depth_interval else None, ) @@ -190,20 +189,20 @@ def process_page( def match_columns( - depth_column: DepthColumn, + sidebar: Sidebar, description_lines: list[TextLine], geometric_lines: list[Line], material_description_rect: fitz.Rect, **params: dict, ) -> list[IntervalBlockPair]: - """Match the depth column entries with the description lines. + """Match the layers that can be derived from the sidebar with the description lines. This function identifies groups of depth intervals and text blocks that are likely to match. - Makes a distinction between DepthColumn and LayerIdentifierColumn and obtains the corresponding text blocks - as well as their depth intervals where present. + The actual matching between text blocks and depth intervals is handled by the implementation of the actual Sidebar + instance (e.g. AAboveBSidebar, AToBSidebar). Args: - depth_column (DepthColumn): The depth column. + sidebar (Sidebar): The sidebar. description_lines (list[TextLine]): The description lines. geometric_lines (list[Line]): The geometric lines. material_description_rect (fitz.Rect): The material description rectangle.
@@ -214,9 +213,7 @@ def match_columns( """ return [ element - for group in depth_column.identify_groups( - description_lines, geometric_lines, material_description_rect, **params - ) + for group in sidebar.identify_groups(description_lines, geometric_lines, material_description_rect, **params) for element in transform_groups(group.depth_intervals, group.blocks, **params) ] @@ -250,7 +247,7 @@ def transform_groups( if len(blocks) > len(depth_intervals): # create additional depth intervals with end & start value None to match the number of blocks - depth_intervals.extend([BoundaryInterval(None, None) for _ in range(len(blocks) - len(depth_intervals))]) + depth_intervals.extend([AAboveBInterval(None, None) for _ in range(len(blocks) - len(depth_intervals))]) return [ IntervalBlockPair(depth_interval=depth_interval, block=block) @@ -339,30 +336,28 @@ def split_blocks_by_textline_length(blocks: list[TextBlock], target_split_count: def find_material_description_column( - lines: list[TextLine], depth_column: DepthColumn | None, language: str, **params: dict + lines: list[TextLine], sidebar: Sidebar | None, language: str, **params: dict ) -> fitz.Rect | None: """Find the material description column given a depth column. Args: lines (list[TextLine]): The text lines of the page. - depth_column (DepthColumn | None): The depth column. + sidebar (Sidebar | None): The sidebar to be associated with the material descriptions. language (str): The language of the page. **params (dict): Additional parameters for the matching pipeline. Returns: fitz.Rect | None: The material description column. 
""" - if depth_column: - above_depth_column = [ - line - for line in lines - if x_overlap(line.rect, depth_column.rect()) and line.rect.y0 < depth_column.rect().y0 + if sidebar: + above_sidebar = [ + line for line in lines if x_overlap(line.rect, sidebar.rect()) and line.rect.y0 < sidebar.rect().y0 ] - min_y0 = max(line.rect.y0 for line in above_depth_column) if above_depth_column else -1 + min_y0 = max(line.rect.y0 for line in above_sidebar) if above_sidebar else -1 def check_y0_condition(y0): - return y0 > min_y0 and y0 < depth_column.rect().y1 + return y0 > min_y0 and y0 < sidebar.rect().y1 else: def check_y0_condition(y0): @@ -449,10 +444,10 @@ def is_below(best_x0, best_y1, line): if len(candidate_rects) == 0: return None - if depth_column: + if sidebar: return max( candidate_rects, - key=lambda rect: DepthsMaterialsColumnPair(depth_column, rect).score_column_match(), + key=lambda rect: MaterialDescriptionRectWithSidebar(sidebar, rect).score_match(), ) else: return candidate_rects[0] diff --git a/src/stratigraphy/layer/layer.py b/src/stratigraphy/layer/layer.py index 298df64e..45037225 100644 --- a/src/stratigraphy/layer/layer.py +++ b/src/stratigraphy/layer/layer.py @@ -7,7 +7,7 @@ from stratigraphy.data_extractor.data_extractor import ExtractedFeature, FeatureOnPage from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry from stratigraphy.text.textblock import MaterialDescription, TextBlock -from stratigraphy.util.interval import BoundaryInterval, Interval +from stratigraphy.util.interval import AAboveBInterval, Interval from stratigraphy.util.util import parse_text @@ -16,7 +16,7 @@ class Layer(ExtractedFeature): """A class to represent predictions for a single layer.""" material_description: FeatureOnPage[MaterialDescription] - depth_interval: BoundaryInterval | None + depth_interval: AAboveBInterval | None id: uuid.UUID = field(default_factory=uuid.uuid4) def __str__(self) -> str: @@ -68,7 +68,7 @@ def from_json(cls, data: dict) -> "Layer": 
else None ) - depth_interval_prediction = BoundaryInterval(start=start, end=end) + depth_interval_prediction = AAboveBInterval(start=start, end=end) else: depth_interval_prediction = None diff --git a/src/stratigraphy/sidebar/__init__.py b/src/stratigraphy/sidebar/__init__.py new file mode 100644 index 00000000..fbb94fba --- /dev/null +++ b/src/stratigraphy/sidebar/__init__.py @@ -0,0 +1,9 @@ +"""Modules for Sidebars, representing depths or other data displayed to the side of material descriptions.""" + +from .a_above_b_sidebar import AAboveBSidebar +from .a_above_b_sidebar_validator import AAboveBSidebarValidator +from .a_to_b_sidebar import AToBSidebar +from .layer_identifier_sidebar import LayerIdentifierSidebar +from .sidebar import Sidebar + +__all__ = ["Sidebar", "AAboveBSidebar", "AToBSidebar", "LayerIdentifierSidebar", "AAboveBSidebarValidator"] diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar.py b/src/stratigraphy/sidebar/a_above_b_sidebar.py new file mode 100644 index 00000000..90bdc2fd --- /dev/null +++ b/src/stratigraphy/sidebar/a_above_b_sidebar.py @@ -0,0 +1,219 @@ +"""Module for the AAboveBSidebar, where the depths of layer interfaces are defined above/below each other.""" + +from __future__ import annotations + +from dataclasses import dataclass + +import fitz +import numpy as np + +from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry +from stratigraphy.layer.layer import IntervalBlockGroup +from stratigraphy.lines.line import TextLine +from stratigraphy.text.find_description import get_description_blocks +from stratigraphy.util.dataclasses import Line +from stratigraphy.util.interval import AAboveBInterval + +from .sidebar import Sidebar + + +@dataclass +class AAboveBSidebar(Sidebar[DepthColumnEntry]): + """Represents a sidebar where the depths of the layer boundaries are displayed in a column, above each other. + + Usually, the vertical position of a depth label on the page is proportional to the depth value. 
+ + Example: + 0m + + 0.2m + + + 0.5m + ... + """ + + entries: list[DepthColumnEntry] + + def __repr__(self): + return "AAboveBSidebar({})".format(", ".join([str(entry) for entry in self.entries])) + + def valid_initial_segment(self, rect: fitz.Rect) -> AAboveBSidebar: + for i in range(len(self.entries) - 1): + initial_segment = AAboveBSidebar(self.entries[: -i - 1]) + if initial_segment.can_be_appended(rect): + return initial_segment + return AAboveBSidebar(entries=[]) + + def strictly_contains(self, other: AAboveBSidebar) -> bool: + return len(other.entries) < len(self.entries) and all( + other_entry in self.entries for other_entry in other.entries + ) + + def is_strictly_increasing(self) -> bool: + return all(i.value < j.value for i, j in zip(self.entries, self.entries[1:], strict=False)) + + def depth_intervals(self) -> list[AAboveBInterval]: + """Creates a list of depth intervals from the depth column entries. + + The first depth interval has an open start value (i.e. None). + + Returns: + list[AAboveBInterval]: A list of depth intervals. + """ + depth_intervals = [AAboveBInterval(None, self.entries[0])] + for i in range(len(self.entries) - 1): + depth_intervals.append(AAboveBInterval(self.entries[i], self.entries[i + 1])) + depth_intervals.append( + AAboveBInterval(self.entries[len(self.entries) - 1], None) + ) # even though no open ended intervals are allowed, they are still useful for matching, + # especially for documents where the material description rectangle is too tall + # (and includes additional lines below the actual material descriptions). 
+ return depth_intervals + + def significant_arithmetic_progression(self) -> bool: + # to allow for OCR errors or gaps in the progression, we only require a segment of length 6 that is an + # arithmetic progression + segment_length = 6 + if len(self.entries) < segment_length: + return self.is_arithmetic_progression() + else: + for i in range(len(self.entries) - segment_length + 1): + if AAboveBSidebar(self.entries[i : i + segment_length]).is_arithmetic_progression(): + return True + return False + + def is_arithmetic_progression(self) -> bool: + if len(self.entries) <= 2: + return True + + progression = np.array(range(len(self.entries))) + entries = np.array([entry.value for entry in self.entries]) + + # Avoid warnings in the np.corrcoef call, as the correlation coef is undefined if the standard deviation is 0. + if np.std(entries) == 0: + return False + + scale_pearson_correlation_coef = np.corrcoef(entries, progression)[0, 1].item() + return abs(scale_pearson_correlation_coef) >= 0.9999 + + def pearson_correlation_coef(self) -> float: + # We look at the lower y coordinate, because most often the baseline of the depth value text is aligned with + # the line of the corresponding layer boundary. + positions = np.array([entry.rect.y1 for entry in self.entries]) + entries = np.array([entry.value for entry in self.entries]) + + # Avoid warnings in the np.corrcoef call, as the correlation coef is undefined if the standard deviation is 0. 
+ if np.std(entries) == 0 or np.std(positions) == 0: + return 0 + + return np.corrcoef(positions, entries)[0, 1].item() + + def remove_entry_by_correlation_gradient(self) -> AAboveBSidebar | None: + if len(self.entries) < 3: + return None + + new_columns = [ + AAboveBSidebar([entry for index, entry in enumerate(self.entries) if index != remove_index]) + for remove_index in range(len(self.entries)) + ] + return max(new_columns, key=lambda column: column.pearson_correlation_coef()) + + def break_on_double_descending(self) -> list[AAboveBSidebar]: + segments = [] + segment_start = 0 + for index, current_entry in enumerate(self.entries): + if ( + index >= 2 + and index + 1 < len(self.entries) + and current_entry.value < self.entries[index - 2].value + and current_entry.value < self.entries[index - 1].value + and self.entries[index + 1].value < self.entries[index - 2].value + and self.entries[index + 1].value < self.entries[index - 1].value + ): + # big big || small small + segments.append(self.entries[segment_start:index]) + segment_start = index + + final_segment = self.entries[segment_start:] + if final_segment: + segments.append(final_segment) + + return [AAboveBSidebar(segment) for segment in segments] + + def identify_groups( + self, + description_lines: list[TextLine], + geometric_lines: list[Line], + material_description_rect: fitz.Rect, + **params, + ) -> list[IntervalBlockGroup]: + """Identifies groups of description blocks that correspond to depth intervals. + + Note: includes a heuristic of whether there should be a group corresponding to a final depth interval + starting from the last depth entry without any end value. + + Args: + description_lines (list[TextLine]): A list of text lines that are part of the description. + geometric_lines (list[Line]): A list of geometric lines that are part of the description. + material_description_rect (fitz.Rect): The bounding box of the material description. + params (dict): A dictionary of relevant parameters. 
+ + Returns: + list[IntervalBlockGroup]: A list of groups, where each group is an IntervalBlockGroup. + + Example: + [ + { + "depth_intervals": [AAboveBInterval(None, 0.1), AAboveBInterval(0.1, 0.3), ...], + "blocks": [DescriptionBlock(...), DescriptionBlock(...), ...] + }, + { + "depth_intervals": [AAboveBInterval(0.3, 0.7)], + "blocks": [DescriptionBlock(...), DescriptionBlock(...), ...] + }, + ... + ] + """ + depth_intervals = self.depth_intervals() + + groups = [] + + current_intervals = [] + current_blocks = [] + all_blocks = get_description_blocks( + description_lines, + geometric_lines, + material_description_rect, + params["block_line_ratio"], + left_line_length_threshold=params["left_line_length_threshold"], + target_layer_count=len(depth_intervals), + ) + + block_index = 0 + + for interval in depth_intervals: + # don't allow a layer above depth 0 + if interval.start is None and interval.end.value == 0: + continue + + pre, exact, post = interval.matching_blocks(all_blocks, block_index) + block_index += len(pre) + len(exact) + len(post) + + current_blocks.extend(pre) + if len(exact): + if len(current_intervals) > 0 or len(current_blocks) > 0: + groups.append(IntervalBlockGroup(depth_intervals=current_intervals, blocks=current_blocks)) + groups.append(IntervalBlockGroup(depth_intervals=[interval], blocks=exact)) + current_blocks = post + current_intervals = [] + else: + # The final open-ended interval should not be added, since borehole profiles typically do not come + # with open-ended intervals.
+ if interval.end is not None: + current_intervals.append(interval) + + if len(current_intervals) > 0 or len(current_blocks) > 0: + groups.append(IntervalBlockGroup(depth_intervals=current_intervals, blocks=current_blocks)) + + return groups diff --git a/src/stratigraphy/depthcolumn/boundarydepthcolumnvalidator.py b/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py similarity index 77% rename from src/stratigraphy/depthcolumn/boundarydepthcolumnvalidator.py rename to src/stratigraphy/sidebar/a_above_b_sidebar_validator.py index 1636a09d..c64d49bc 100644 --- a/src/stratigraphy/depthcolumn/boundarydepthcolumnvalidator.py +++ b/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py @@ -1,32 +1,32 @@ -"""This module contains logic to validate BoundaryDepthColumn instances.""" +"""This module contains logic to validate AAboveBSidebar instances.""" import dataclasses -from stratigraphy.depthcolumn.depthcolumn import BoundaryDepthColumn from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry from stratigraphy.lines.line import TextWord +from stratigraphy.sidebar import AAboveBSidebar @dataclasses.dataclass -class BoundaryDepthColumnValidator: - """Validation logic for instances of the BoundaryDepthColumn class. +class AAboveBSidebarValidator: + """Validation logic for instances of the AAboveBSidebar class. Args: all_words (list[TextLine]): A list of all text lines on the page. - noise_count_threshold (float): Noise count threshold deciding how much noise is allowed in a column + noise_count_threshold (float): Noise count threshold deciding how much noise is allowed in a sidebar to be valid. noise_count_offset (int): Offset for the noise count threshold. Affects the noise count criterion. - Effective specifically for depth columns with very few entries. + Effective specifically for sidebars with very few entries. 
""" all_words: list[TextWord] noise_count_threshold: float noise_count_offset: int - def is_valid(self, column: BoundaryDepthColumn, corr_coef_threshold: float = 0.99) -> bool: - """Checks whether the depth column is valid. + def is_valid(self, sidebar: AAboveBSidebar, corr_coef_threshold: float = 0.99) -> bool: + """Checks whether the sidebar is valid. - The depth column is considered valid if: + The sidebar is considered valid if: - The number of entries is at least 3. - The number of words that intersect with the depth column entries is less than the noise count threshold time the number of entries minus the noise count offset. @@ -37,13 +37,13 @@ def is_valid(self, column: BoundaryDepthColumn, corr_coef_threshold: float = 0.9 even though they are. Args: - column (BoundaryDepthColumn): The depth column to validate. + sidebar (AAboveBSidebar): The AAboveBSidebar to validate. corr_coef_threshold (float): The minimal correlation coefficient for the column to be deemed valid. Returns: bool: True if the depth column is valid, False otherwise. """ - if len(column.entries) < 3: + if len(sidebar.entries) < 3: return False # When too much other text is in the column, then it is probably not valid. @@ -51,28 +51,28 @@ def is_valid(self, column: BoundaryDepthColumn, corr_coef_threshold: float = 0.9 # than columns with more entries. The more entries we have, the less likely it is that we found them by chance. # TODO: Once evaluation data is of good enough qualities, we should optimize for the parameter below. if ( - column.noise_count(self.all_words) - > self.noise_count_threshold * (len(column.entries) - self.noise_count_offset) ** 2 + sidebar.noise_count(self.all_words) + > self.noise_count_threshold * (len(sidebar.entries) - self.noise_count_offset) ** 2 ): return False # Check if the entries are strictly increasing. 
- if not column.is_strictly_increasing(): + if not sidebar.is_strictly_increasing(): return False - corr_coef = column.pearson_correlation_coef() + corr_coef = sidebar.pearson_correlation_coef() return corr_coef and corr_coef > corr_coef_threshold - def reduce_until_valid(self, column: BoundaryDepthColumn) -> BoundaryDepthColumn: + def reduce_until_valid(self, column: AAboveBSidebar) -> AAboveBSidebar: """Removes entries from the depth column until it fulfills the is_valid condition. is_valid checks whether there is too much noise (i.e. other text) in the column and whether the entries are linearly correlated with their vertical position. Args: - column (BoundaryDepthColumn): The depth column to validate + column (AAboveBSidebar): The depth column to validate Returns: - BoundaryDepthColumn: The current depth column with entries removed until it is valid. + AAboveBSidebar: The current depth column with entries removed until it is valid. """ while column: if self.is_valid(column): @@ -82,7 +82,7 @@ def reduce_until_valid(self, column: BoundaryDepthColumn) -> BoundaryDepthColumn else: column = column.remove_entry_by_correlation_gradient() - def correct_OCR_mistakes(self, column: BoundaryDepthColumn) -> BoundaryDepthColumn | None: + def correct_OCR_mistakes(self, sidebar: AAboveBSidebar) -> AAboveBSidebar | None: """Corrects OCR mistakes in the depth column entries. Loops through all values and corrects common OCR mistakes for the given entry. Then, the column with the @@ -100,15 +100,15 @@ def correct_OCR_mistakes(self, column: BoundaryDepthColumn) -> BoundaryDepthColu Note: Common mistakes should be extended as needed. Args: - column (BoundaryDepthColumn): The depth column to validate + sidebar (AAboveBSidebar): The AAboveBSidebar to validate Returns: BoundaryDepthColumn | None: The corrected depth column, or None if no correction was possible. 
""" - new_columns = [BoundaryDepthColumn(entries=[])] - for entry in column.entries: + new_columns = [AAboveBSidebar(entries=[])] + for entry in sidebar.entries: new_columns = [ - BoundaryDepthColumn([*column.entries, DepthColumnEntry(entry.rect, new_value)]) + AAboveBSidebar([*column.entries, DepthColumnEntry(entry.rect, new_value)]) for column in new_columns for new_value in _value_alternatives(entry.value) ] diff --git a/src/stratigraphy/sidebar/a_to_b_sidebar.py b/src/stratigraphy/sidebar/a_to_b_sidebar.py new file mode 100644 index 00000000..1e1812da --- /dev/null +++ b/src/stratigraphy/sidebar/a_to_b_sidebar.py @@ -0,0 +1,114 @@ +"""Module for the AToBSidebar, which contains depth intervals defined like "0.2m - 1.3m".""" + +from __future__ import annotations + +from dataclasses import dataclass + +import fitz + +from stratigraphy.depthcolumn.depthcolumnentry import AToBDepthColumnEntry +from stratigraphy.layer.layer import IntervalBlockGroup +from stratigraphy.lines.line import TextLine +from stratigraphy.util.dataclasses import Line +from stratigraphy.util.interval import AToBInterval + +from .sidebar import Sidebar + + +@dataclass +class AToBSidebar(Sidebar[AToBDepthColumnEntry]): + """Represents a sidebar where the upper and lower depths of each layer are explicitly specified. + + Example:: + 0 - 0.1m: xxx + 0.1 - 0.3m: yyy + 0.3 - 0.8m: zzz + ... + """ + + entries: list[AToBDepthColumnEntry] + + def __repr__(self): + """Converts the object to a string. + + Returns: + str: The object as a string. + """ + return "AToBSidebar({})".format(", ".join([str(entry) for entry in self.entries])) + + def depth_intervals(self) -> list[AToBInterval]: + return [AToBInterval(entry) for entry in self.entries] + + def break_on_mismatch(self) -> list[AToBSidebar]: + """Breaks the sidebar into segments where the depths are not in an arithmetic progression. + + Returns: + list[AToBSidebar]: A list of depth column segments. 
+ """ + segments = [] + segment_start = 0 + for index, current_entry in enumerate(self.entries): + if index >= 1 and current_entry.start.value < self.entries[index - 1].end.value: + # (_, big) || (small, _) + segments.append(self.entries[segment_start:index]) + segment_start = index + + final_segment = self.entries[segment_start:] + if final_segment: + segments.append(final_segment) + + return [AToBSidebar(segment) for segment in segments] + + def is_valid(self) -> bool: + """Checks if the sidebar is valid. + + An AToBSidebar is valid if it is strictly increasing and the depth intervals are significant. + + Returns: + bool: True if the depth column is valid, False otherwise. + """ + if len(self.entries) <= 2: + return False + + # At least half of the "end" values must match the subsequent "start" value (e.g. 2-5m, 5-9m). + sequence_matches_count = 0 + for index, entry in enumerate(self.entries): + if index >= 1 and self.entries[index - 1].end.value == entry.start.value: + sequence_matches_count += 1 + + return sequence_matches_count / (len(self.entries) - 1) > 0.5 + + def identify_groups( + self, + description_lines: list[TextLine], + geometric_lines: list[Line], + material_description_rect: fitz.Rect, + **params, + ) -> list[IntervalBlockGroup]: + """Identifies groups of description blocks that correspond to depth intervals. + + Args: + description_lines (list[TextLine]): A list of text lines that are part of the description. + geometric_lines (list[Line]): A list of geometric lines that are part of the description. + material_description_rect (fitz.Rect): The bounding box of the material description. + params (dict): A dictionary of relevant parameters. + + Returns: + list[IntervalBlockGroup]: A list of groups, where each group is a IntervalBlockGroup. 
+ """ + depth_intervals = self.depth_intervals() + + groups = [] + line_index = 0 + + for interval_index, interval in enumerate(depth_intervals): + # don't allow a layer above depth 0 + if interval.start is None and interval.end.value == 0: + continue + + next_interval = depth_intervals[interval_index + 1] if interval_index + 1 < len(depth_intervals) else None + + matched_blocks = interval.matching_blocks(description_lines, line_index, next_interval) + line_index += sum([len(block.lines) for block in matched_blocks]) + groups.append(IntervalBlockGroup(depth_intervals=[interval], blocks=matched_blocks)) + return groups diff --git a/src/stratigraphy/depthcolumn/find_depth_columns.py b/src/stratigraphy/sidebar/find_sidebars.py similarity index 80% rename from src/stratigraphy/depthcolumn/find_depth_columns.py rename to src/stratigraphy/sidebar/find_sidebars.py index 68ce6bbc..3f09460b 100644 --- a/src/stratigraphy/depthcolumn/find_depth_columns.py +++ b/src/stratigraphy/sidebar/find_sidebars.py @@ -1,14 +1,14 @@ -"""This module contains functionalities to find depth columns in a pdf page.""" +"""This module contains functionalities to find sidebars in a pdf page.""" import re import fitz -from stratigraphy.depthcolumn.boundarydepthcolumnvalidator import BoundaryDepthColumnValidator -from stratigraphy.depthcolumn.depthcolumn import BoundaryDepthColumn, LayerDepthColumn -from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry, LayerDepthColumnEntry + +from stratigraphy.depthcolumn.depthcolumnentry import AToBDepthColumnEntry, DepthColumnEntry from stratigraphy.lines.line import TextWord +from stratigraphy.sidebar import AAboveBSidebar, AAboveBSidebarValidator, AToBSidebar from stratigraphy.text.textblock import TextBlock -from stratigraphy.util.interval import LayerInterval +from stratigraphy.util.interval import AToBInterval def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> list[DepthColumnEntry]: @@ -54,7 +54,7 @@ def 
value_as_float(string_value: str) -> float: # noqa: D103 def extract_layer_depth_interval( text: str, rect: fitz.Rect, require_start_of_string: bool = True -) -> LayerDepthColumnEntry | None: +) -> AToBDepthColumnEntry | None: """Extracts a LayerDepthColumnEntry from a string. Args: @@ -64,7 +64,7 @@ def extract_layer_depth_interval( at the start of a string. Defaults to True. Returns: - LayerDepthColumnEntry | None: The extracted LayerDepthColumnEntry or None if none is found. + AToBDepthColumnEntry | None: The extracted LayerDepthColumnEntry or None if none is found. """ input_string = text.strip().replace(",", ".") @@ -79,18 +79,18 @@ def extract_layer_depth_interval( value2 = value_as_float(match.group(3)) second_half_rect = fitz.Rect(rect.x0 + rect.width / 2, rect.y0, rect.x1, rect.y1) - return LayerDepthColumnEntry( + return AToBDepthColumnEntry( DepthColumnEntry(first_half_rect, value1), DepthColumnEntry(second_half_rect, value2), ) return None -def find_layer_depth_columns(entries: list[DepthColumnEntry], all_words: list[TextWord]) -> list[LayerDepthColumn]: - """Finds all layer depth columns. +def find_a_to_b_sidebars(entries: list[DepthColumnEntry], all_words: list[TextWord]) -> list[AToBSidebar]: + """Finds all AToBSidebars. Generates a list of LayerDepthColumnEntry objects by finding consecutive pairs of DepthColumnEntry objects. - Different columns are grouped together in LayerDepthColumn objects. Finally, a list of LayerDepthColumn objects, + Different columns are grouped together in LayerDepthColumn objects. Finally, a list of AToBSidebars objects, one for each column, is returned. A layer corresponds to a material layer. The layer is defined using a start and end point (e.g. 1.10-1.60m). @@ -101,7 +101,7 @@ def find_layer_depth_columns(entries: list[DepthColumnEntry], all_words: list[Te all_words (list[TextWord]): List of all TextWord objects. Returns: - list[LayerDepthColumn]: List of all layer depth columns identified. 
+ list[AToBSidebar]: List of all AToBSidebars identified. """ def find_pair(entry: DepthColumnEntry) -> DepthColumnEntry | None: # noqa: D103 @@ -129,43 +129,43 @@ def find_pair(entry: DepthColumnEntry) -> DepthColumnEntry | None: # noqa: D103 pairs = [(entry, find_pair(entry)) for entry in entries] - columns = [] + sidebars = [] for first, second in pairs: if second is not None: - entry = LayerDepthColumnEntry(first, second) + entry = AToBDepthColumnEntry(first, second) is_matched = False - for column in columns: - column_rect = column.rect() + for sidebar in sidebars: + column_rect = sidebar.rect() new_start_middle = (entry.start.rect.x0 + entry.start.rect.x1) / 2 if column_rect.x0 < new_start_middle < column_rect.x1: is_matched = True - column.entries.append(entry) + sidebar.entries.append(entry) if not is_matched: - columns.append(LayerDepthColumn([entry])) + sidebars.append(AToBSidebar([entry])) return [ - column_segment - for column in columns - for column_segment in column.break_on_mismatch() - if column_segment.is_valid() + sidebar_segment + for sidebar in sidebars + for sidebar_segment in sidebar.break_on_mismatch() + if sidebar_segment.is_valid() ] -def find_depth_columns( - entries: list[DepthColumnEntry], all_words: list[TextWord], depth_column_params: dict -) -> list[BoundaryDepthColumn]: - """Construct all possible BoundaryDepthColumn objects from the given DepthColumnEntry objects. +def find_a_above_b_sidebars( + entries: list[DepthColumnEntry], all_words: list[TextWord], sidebar_params: dict +) -> list[AAboveBSidebar]: + """Construct all possible AAboveBSidebar objects from the given DepthColumnEntry objects. Args: entries (list[DepthColumnEntry]): All found depth column entries in the page. all_words (list[TextLine]): All words in the page. - depth_column_params (dict): Parameters for the BoundaryDepthColumn objects. + sidebar_params (dict): Parameters for the BoundaryDepthColumn objects. 
Returns: - list[BoundaryDepthColumn]: Found BoundaryDepthColumn objects. + list[AAboveBSidebar]: Found BoundaryDepthColumn objects. """ - numeric_columns: list[BoundaryDepthColumn] = [] + numeric_columns: list[AAboveBSidebar] = [] for entry in entries: has_match = False additional_columns = [] @@ -182,7 +182,7 @@ def find_depth_columns( numeric_columns.extend(additional_columns) if not has_match: - numeric_columns.append(BoundaryDepthColumn(entries=[entry])) + numeric_columns.append(AAboveBSidebar(entries=[entry])) # only keep columns that are not contained in a different column numeric_columns = [ @@ -191,7 +191,7 @@ def find_depth_columns( if all(not other.strictly_contains(column) for other in numeric_columns) ] - boundary_depth_column_validator = BoundaryDepthColumnValidator(all_words, **depth_column_params) + boundary_depth_column_validator = AAboveBSidebarValidator(all_words, **sidebar_params) numeric_columns = [ boundary_depth_column_validator.reduce_until_valid(column) @@ -208,7 +208,7 @@ def find_depth_columns( ) -def get_depth_interval_from_textblock(block: TextBlock) -> LayerInterval | None: +def get_depth_interval_from_textblock(block: TextBlock) -> AToBInterval | None: """Extract depth interval from a material description block. For borehole profiles in the Deriaz layout, the depth interval is usually found in the text description @@ -220,7 +220,7 @@ def get_depth_interval_from_textblock(block: TextBlock) -> LayerInterval | None: block (TextBlock): The block to calculate the depth interval for. Returns: - LayerInterval | None: The depth interval. + AToBInterval | None: The depth interval. 
""" depth_entries = [] for line in block.lines: @@ -238,6 +238,6 @@ def get_depth_interval_from_textblock(block: TextBlock) -> LayerInterval | None: start = min([entry.start for entry in depth_entries], key=lambda start_entry: start_entry.value) end = max([entry.end for entry in depth_entries], key=lambda end_entry: end_entry.value) - return LayerInterval(LayerDepthColumnEntry(start, end)) + return AToBInterval(AToBDepthColumnEntry(start, end)) else: return None diff --git a/src/stratigraphy/layer/layer_identifier_column.py b/src/stratigraphy/sidebar/layer_identifier_sidebar.py similarity index 82% rename from src/stratigraphy/layer/layer_identifier_column.py rename to src/stratigraphy/sidebar/layer_identifier_sidebar.py index 801a41f8..a7dcc976 100644 --- a/src/stratigraphy/layer/layer_identifier_column.py +++ b/src/stratigraphy/sidebar/layer_identifier_sidebar.py @@ -1,16 +1,18 @@ -"""Module for the LayerIdentifierColumn class.""" +"""Module for the layer identifier sidebars.""" import re from dataclasses import dataclass import fitz -from stratigraphy.depthcolumn.depthcolumn import DepthColumn -from stratigraphy.depthcolumn.find_depth_columns import get_depth_interval_from_textblock + from stratigraphy.layer.layer import IntervalBlockGroup from stratigraphy.lines.line import TextLine +from stratigraphy.sidebar.find_sidebars import get_depth_interval_from_textblock from stratigraphy.text.textblock import TextBlock from stratigraphy.util.dataclasses import Line +from .sidebar import Sidebar + class LayerIdentifierEntry: """Class for a layer identifier entry. @@ -27,8 +29,8 @@ def __repr__(self): @dataclass -class LayerIdentifierColumn(DepthColumn[LayerIdentifierEntry]): - """Class for a layer identifier column. +class LayerIdentifierSidebar(Sidebar[LayerIdentifierEntry]): + """Class for a layer identifier sidebar. Layer identifiers are labels that are particularly common in Deriaz layout borehole profiles. 
They can be sequential such as in 1007.pdf - a), b), c), etc. - or contain some semantic meaning such as in 10781.pdf - @@ -108,11 +110,11 @@ def matching_blocks( else: return [] - def strictly_contains(self, other: "LayerIdentifierColumn") -> bool: + def strictly_contains(self, other: "LayerIdentifierSidebar") -> bool: """Check if the layer identifier column strictly contains another layer identifier column. Args: - other (LayerIdentifierColumn): The other layer identifier column to check if it is strictly contained. + other (LayerIdentifierSidebar): The other layer identifier column to check if it is strictly contained. Returns: bool: True if the layer identifier column strictly contains the other layer identifier column, False @@ -139,8 +141,8 @@ def is_contained(self, rect: fitz.Rect) -> bool: ) -def find_layer_identifier_column_entries(lines: list[TextLine]) -> list[LayerIdentifierEntry]: - r"""Find the layer identifier column entries. +def find_layer_identifier_sidebar_entries(lines: list[TextLine]) -> list[LayerIdentifierEntry]: + r"""Find the layer identifier sidebar entries. Regex explanation: - \b is a word boundary. This ensures that the match must start at the beginning of a word. @@ -150,10 +152,10 @@ def find_layer_identifier_column_entries(lines: list[TextLine]) -> list[LayerIde This regular expression will match strings like "1)", "2)", "a)", "b)", "1a4)", "6de)", etc. Args: - lines (list[TextLine]): The lines to search for layer identifier columns. + lines (list[TextLine]): The lines to search for layer identifier entries. Returns: - list[LayerIdentifierEntry]: The layer identifier column entries. + list[LayerIdentifierEntry]: The layer identifier sidebar entries. 
""" entries = [] for line in sorted(lines, key=lambda line: line.rect.y0): @@ -168,7 +170,7 @@ def find_layer_identifier_column_entries(lines: list[TextLine]) -> list[LayerIde return entries -def find_layer_identifier_column(entries: list[LayerIdentifierEntry]) -> list[LayerIdentifierColumn]: +def find_layer_identifier_sidebars(entries: list[LayerIdentifierEntry]) -> list[LayerIdentifierSidebar]: """Find the layer identifier column given the index column entries. Note: Similar to find_depth_columns.find_depth_columns. Refactoring may be desired. @@ -177,33 +179,33 @@ def find_layer_identifier_column(entries: list[LayerIdentifierEntry]) -> list[La entries (list[LayerIdentifierEntry]): The layer identifier column entries. Returns: - list[LayerIdentifierColumn]: The found layer identifier columns. + list[LayerIdentifierSidebar]: The found layer identifier sidebar. """ - layer_identifier_columns = [LayerIdentifierColumn([entries[0]])] + layer_identifier_sidebars = [LayerIdentifierSidebar([entries[0]])] for entry in entries[1:]: has_match = False - for column in layer_identifier_columns: + for column in layer_identifier_sidebars: if column.can_be_appended(entry.rect): column.entries.append(entry) has_match = True if not has_match: - layer_identifier_columns.append(LayerIdentifierColumn([entry])) + layer_identifier_sidebars.append(LayerIdentifierSidebar([entry])) # only keep columns whose entries are not fully contained in a different column - layer_identifier_columns = [ + layer_identifier_sidebars = [ column - for column in layer_identifier_columns - if all(not other.strictly_contains(column) for other in layer_identifier_columns) + for column in layer_identifier_sidebars + if all(not other.strictly_contains(column) for other in layer_identifier_sidebars) ] # check if the column rect is a subset of another column rect. If so, merge the entries and sort them by y0. 
- for column in layer_identifier_columns: - for other in layer_identifier_columns: + for column in layer_identifier_sidebars: + for other in layer_identifier_sidebars: if column != other and column.is_contained(other.rect()): for entry in other.entries: if entry not in column.entries: column.entries.append(entry) column.entries.sort(key=lambda entry: entry.rect.y0) - layer_identifier_columns.remove(other) + layer_identifier_sidebars.remove(other) break - layer_identifier_columns = [column for column in layer_identifier_columns if len(column.entries) > 2] - return layer_identifier_columns + layer_identifier_sidebars = [column for column in layer_identifier_sidebars if len(column.entries) > 2] + return layer_identifier_sidebars diff --git a/src/stratigraphy/sidebar/sidebar.py b/src/stratigraphy/sidebar/sidebar.py new file mode 100644 index 00000000..046d9094 --- /dev/null +++ b/src/stratigraphy/sidebar/sidebar.py @@ -0,0 +1,108 @@ +"""This module contains the Sidebar class, used to represent a depth column (or similar) of a borehole profile.""" + +from __future__ import annotations + +import abc +from dataclasses import dataclass +from typing import Generic, TypeVar + +import fitz + +from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry +from stratigraphy.layer.layer import IntervalBlockGroup +from stratigraphy.lines.line import TextLine, TextWord +from stratigraphy.util.dataclasses import Line + +EntryT = TypeVar("EntryT", bound=DepthColumnEntry) + + +@dataclass +class Sidebar(abc.ABC, Generic[EntryT]): + """Abstract Sidebar class, representing depths or other data displayed to the side of material descriptions.""" + + entries: list[EntryT] + + def rects(self) -> list[fitz.Rect]: + """Get the rectangles of the depth column entries.""" + return [entry.rect for entry in self.entries] + + def rect(self) -> fitz.Rect: + """Get the bounding box of the depth column entries.""" + x0 = min([rect.x0 for rect in self.rects()]) + x1 = max([rect.x1 for rect in 
self.rects()]) + y0 = min([rect.y0 for rect in self.rects()]) + y1 = max([rect.y1 for rect in self.rects()]) + return fitz.Rect(x0, y0, x1, y1) + + @property + def max_x0(self) -> float: + """Get the maximum x0 value of the depth column entries.""" + return max([rect.x0 for rect in self.rects()]) + + @property + def min_x1(self) -> float: + """Get the minimum x1 value of the depth column entries.""" + return min([rect.x1 for rect in self.rects()]) + + def noise_count(self, all_words: list[TextWord]) -> int: + """Counts the number of words that intersect with the depth column entries. + + Returns the number of words that intersect with the depth column entries, but are not part of the depth column. + + Args: + all_words (list[TextWord]): A list of all text lines on the page. + + Returns: + int: The number of words that intersect with the depth column entries but are not part of it. + """ + + def significant_intersection(other_rect): + intersection = fitz.Rect(other_rect).intersect(self.rect()) + return intersection.is_valid and intersection.width > 0.25 * self.rect().width + + return len([word for word in all_words if significant_intersection(word.rect)]) - len(self.entries) + + @abc.abstractmethod + def identify_groups( + self, + description_lines: list[TextLine], + geometric_lines: list[Line], + material_description_rect: fitz.Rect, + **params, + ) -> list[IntervalBlockGroup]: + """Identifies groups of description blocks that correspond to depth intervals. + + Args: + description_lines (list[TextLine]): A list of text lines that are part of the description. + geometric_lines (list[Line]): A list of geometric lines that are part of the description. + material_description_rect (fitz.Rect): The bounding box of the material description. + params (dict): A dictionary of relevant parameters. + + Returns: + list[IntervalBlockGroup]: A list of groups, where each group is a IntervalBlockGroup. 
+ """ + pass + + def can_be_appended(self, rect: fitz.Rect) -> bool: + """Checks if a new depth column entry can be appended to the current depth column. + + Check if the middle of the new rect is between the outer horizontal boundaries of the column, and if there is + an intersection with the minimal horizontal boundaries of the column. + + The checks are: + - The width of the new rectangle is greater than the width of the current depth column. Or; + - The middle of the new rectangle is within the horizontal boundaries of the current depth column. + - The new rectangle intersects with the minimal horizontal boundaries of the current depth column. + + Args: + rect (fitz.Rect): Rect of the depth column entry to be appended. + + Returns: + bool: True if the new depth column entry can be appended, False otherwise. + """ + new_middle = (rect.x0 + rect.x1) / 2 + if (self.rect().width < rect.width or self.rect().x0 < new_middle < self.rect().x1) and ( + rect.x0 <= self.min_x1 and self.max_x0 <= rect.x1 + ): + return True + return False diff --git a/src/stratigraphy/util/interval.py b/src/stratigraphy/util/interval.py index 8a4cd5e3..bd321d42 100644 --- a/src/stratigraphy/util/interval.py +++ b/src/stratigraphy/util/interval.py @@ -7,8 +7,8 @@ import fitz from stratigraphy.depthcolumn.depthcolumnentry import ( + AToBDepthColumnEntry, DepthColumnEntry, - LayerDepthColumnEntry, ) from stratigraphy.lines.line import TextLine from stratigraphy.text.textblock import TextBlock @@ -58,11 +58,8 @@ def to_json(self): } -class BoundaryInterval(Interval): - """Class for boundary intervals. - - Boundary intervals are intervals that are defined by a start and an end point. 
- """ +class AAboveBInterval(Interval): + """Class for depth intervals where the upper depth is located above the lower depth on the page.""" @property def line_anchor(self) -> fitz.Point | None: @@ -143,14 +140,10 @@ def matching_blocks( return pre, exact, post -class LayerInterval(Interval): - """Class for layer intervals. - - A layer interval is an interval whose start and end-points are defined in a single entry. - E.g. 1.00 - 2.30m. - """ +class AToBInterval(Interval): + """Class for intervals that are defined in a single line like "1.00 - 2.30m".""" - def __init__(self, layer_depth_column_entry: LayerDepthColumnEntry): + def __init__(self, layer_depth_column_entry: AToBDepthColumnEntry): self.entry = layer_depth_column_entry super().__init__(layer_depth_column_entry.start, layer_depth_column_entry.end) diff --git a/tests/test_depthcolumn.py b/tests/test_depthcolumn.py index 58501db3..2ca98178 100644 --- a/tests/test_depthcolumn.py +++ b/tests/test_depthcolumn.py @@ -1,13 +1,13 @@ """Test suite for the find_depth_columns module.""" import fitz -from stratigraphy.depthcolumn.depthcolumn import BoundaryDepthColumn from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry +from stratigraphy.sidebar import AAboveBSidebar -def test_boundarydepthcolumn_isarithmeticprogression(): # noqa: D103 - """Test the is_arithmetic_progression method of the BoundaryDepthColumn class.""" - column = BoundaryDepthColumn( +def test_aabovebsidebar_isarithmeticprogression(): # noqa: D103 + """Test the is_arithmetic_progression method of the AAboveBSidebar class.""" + column = AAboveBSidebar( [ DepthColumnEntry(fitz.Rect(), value=1), DepthColumnEntry(fitz.Rect(), value=2), @@ -18,7 +18,7 @@ def test_boundarydepthcolumn_isarithmeticprogression(): # noqa: D103 ) assert column.is_arithmetic_progression(), "The column should be recognized as arithmetic progression" - column = BoundaryDepthColumn( + column = AAboveBSidebar( [ DepthColumnEntry(fitz.Rect(), value=17.6), 
DepthColumnEntry(fitz.Rect(), value=18.15), diff --git a/tests/test_find_depth_columns.py b/tests/test_find_sidebar.py similarity index 94% rename from tests/test_find_depth_columns.py rename to tests/test_find_sidebar.py index 621eda07..ffcaa539 100644 --- a/tests/test_find_depth_columns.py +++ b/tests/test_find_sidebar.py @@ -3,12 +3,12 @@ import fitz import pytest from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry -from stratigraphy.depthcolumn.find_depth_columns import ( +from stratigraphy.lines.line import TextWord +from stratigraphy.sidebar.find_sidebars import ( depth_column_entries, - find_depth_columns, - find_layer_depth_columns, + find_a_above_b_sidebars, + find_a_to_b_sidebars, ) -from stratigraphy.lines.line import TextWord PAGE_NUMBER = 1 ALL_WORDS_FIND_DEPTH_COLUMN = [ @@ -86,10 +86,10 @@ def test_find_depth_columns_arithmetic_progression(): # noqa: D103 DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0), DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0), ] - columns = find_depth_columns( + columns = find_a_above_b_sidebars( entries, ALL_WORDS_FIND_DEPTH_COLUMN, - depth_column_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, + sidebar_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, ) assert len(columns) == 0, "There should be 0 columns as the above is a perfect arithmetic progression" @@ -104,10 +104,10 @@ def test_find_depth_columns(): # noqa: D103 DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0), ] - columns = find_depth_columns( + columns = find_a_above_b_sidebars( entries, ALL_WORDS_FIND_DEPTH_COLUMN, - depth_column_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, + sidebar_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, ) assert len(columns) == 1, "There should be 1 column" assert len(columns[0].entries) == 5, "The column should have 5 entries" @@ -133,10 +133,10 @@ def test_two_columns_find_depth_columns(): # noqa: D103 DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 50.0), 
DepthColumnEntry(fitz.Rect(20, 10, 25, 11), 61.0), ] - columns = find_depth_columns( + columns = find_a_above_b_sidebars( entries, ALL_WORDS_FIND_DEPTH_COLUMN, - depth_column_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, + sidebar_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, ) assert len(columns) == 2, "There should be 2 columns" assert len(columns[0].entries) == 5, "The first column should have 5 entries" @@ -158,7 +158,7 @@ def test_find_layer_depth_columns(): # noqa: D103 DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 60.0), ] - columns = find_layer_depth_columns(entries, ALL_WORDS_FIND_DEPTH_COLUMN) + columns = find_a_to_b_sidebars(entries, ALL_WORDS_FIND_DEPTH_COLUMN) assert len(columns) == 1, "There should be 1 column" assert len(columns[0].entries) == 5, "The column should have 5 entries" assert columns[0].entries[0].start.value == 12.0, "The first entry should have a value of 12.0" @@ -198,7 +198,7 @@ def test_two_columns_find_layer_depth_columns(): # noqa: D103 DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 50.0), # layer 50.0-60.0m DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 60.0), ] - columns = find_layer_depth_columns(entries, ALL_WORDS_FIND_LAYER_DEPTH_COLUMN) + columns = find_a_to_b_sidebars(entries, ALL_WORDS_FIND_LAYER_DEPTH_COLUMN) assert len(columns) == 2, "There should be 2 columns" assert len(columns[0].entries) == 5, "The first column should have 5 entries" assert len(columns[1].entries) == 5, "The second column should have 5 entries" diff --git a/tests/test_interval.py b/tests/test_interval.py index b53c0718..7d6d4118 100644 --- a/tests/test_interval.py +++ b/tests/test_interval.py @@ -1,34 +1,34 @@ """Test suite for the interval module.""" import fitz -from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry, LayerDepthColumnEntry -from stratigraphy.util.interval import BoundaryInterval, LayerInterval +from stratigraphy.depthcolumn.depthcolumnentry import AToBDepthColumnEntry, DepthColumnEntry +from 
stratigraphy.util.interval import AAboveBInterval, AToBInterval def test_line_anchor(): # noqa: D103 """Test the line anchor property of the BoundaryInterval and LayerInterval classes.""" start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5) end = DepthColumnEntry(fitz.Rect(0, 2, 1, 3), 10) - boundary_interval = BoundaryInterval(start, end) + boundary_interval = AAboveBInterval(start, end) assert boundary_interval.line_anchor == fitz.Point(1, 1.5), ( "The 'line anchor' for a BoundaryInterval should be halfway between the bottom-right of the start depth and " "the top-right of the end depth." ) - boundary_interval = BoundaryInterval(start, end=None) + boundary_interval = AAboveBInterval(start, end=None) assert boundary_interval.line_anchor == fitz.Point( 1, 1 ), "The 'line anchor' for a BoundaryInterval without end should be the bottom-right of the start depth." - boundary_interval = BoundaryInterval(start=None, end=end) + boundary_interval = AAboveBInterval(start=None, end=end) assert boundary_interval.line_anchor == fitz.Point( 1, 2 ), "The 'line anchor' for a BoundaryInterval without start should be the top-right of the end depth." start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5) end = DepthColumnEntry(fitz.Rect(2, 0, 3, 1), 10) - entry = LayerDepthColumnEntry(start, end) - layer_interval = LayerInterval(entry) + entry = AToBDepthColumnEntry(start, end) + layer_interval = AToBInterval(entry) assert layer_interval.line_anchor == fitz.Point( 3, 0.5 ), "The 'line anchor' for a LayerInterval should be the midpoint of the right-hand-side of the end rect." 
@@ -38,7 +38,7 @@ def test_background_rect(): # noqa: D103 """Test the background_rect property of the BoundaryInterval class.""" start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5) end = DepthColumnEntry(fitz.Rect(0, 2, 1, 3), 10) - boundary_interval = BoundaryInterval(start, end) + boundary_interval = AAboveBInterval(start, end) assert boundary_interval.background_rect == fitz.Rect( start.rect.x0, start.rect.y1, start.rect.x1, end.rect.y0 ), "The background rect should be (0, 1, 1, 2)" From a1b403b73c85f18db5ee559f3bea7aa22fbefada Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Tue, 12 Nov 2024 13:37:08 +0100 Subject: [PATCH 11/20] LGVISIUM-102: cleanup --- src/stratigraphy/depthcolumn/depthcolumnentry.py | 16 ++++++++-------- .../bounding_boxes.py | 4 ++-- .../material_description_rect_with_sidebar.py | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/stratigraphy/depthcolumn/depthcolumnentry.py b/src/stratigraphy/depthcolumn/depthcolumnentry.py index bea3778c..c8f56905 100644 --- a/src/stratigraphy/depthcolumn/depthcolumnentry.py +++ b/src/stratigraphy/depthcolumn/depthcolumnentry.py @@ -20,16 +20,16 @@ def to_json(self) -> dict[str, Any]: return {"value": self.value, "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1]} @classmethod - def from_json(cls, json_depth_column_entry: dict) -> "DepthColumnEntry": + def from_json(cls, data: dict) -> "DepthColumnEntry": """Converts a dictionary to an object. Args: - json_depth_column_entry (dict): A dictionary representing the depth column entry. + data (dict): A dictionary representing the depth column entry. Returns: DepthColumnEntry: The depth column entry object. 
""" - return cls(rect=fitz.Rect(json_depth_column_entry["rect"]), value=json_depth_column_entry["value"]) + return cls(rect=fitz.Rect(data["rect"]), value=data["value"]) class AToBDepthColumnEntry: # noqa: D101 @@ -56,15 +56,15 @@ def to_json(self) -> dict[str, Any]: } @classmethod - def from_json(cls, json_layer_depth_column_entry: dict) -> "AToBDepthColumnEntry": + def from_json(cls, data: dict) -> "AToBDepthColumnEntry": """Converts a dictionary to an object. Args: - json_layer_depth_column_entry (dict): A dictionary representing the layer depth column entry. + data (dict): A dictionary representing the layer depth column entry. Returns: - AToBDepthColumnEntry: The layer depth column entry object. + AToBDepthColumnEntry: The A-to-B depth column entry object. """ - start = DepthColumnEntry.from_json(json_layer_depth_column_entry["start"]) - end = DepthColumnEntry.from_json(json_layer_depth_column_entry["end"]) + start = DepthColumnEntry.from_json(data["start"]) + end = DepthColumnEntry.from_json(data["end"]) return cls(start, end) diff --git a/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py b/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py index 2731713f..ba1ebcd6 100644 --- a/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py +++ b/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py @@ -1,4 +1,4 @@ -"""Definition of the DepthsMaterialsColumnPairs class.""" +"""Classes for JSON-serializable bounding boxes of different parts of a borehole profile.""" from dataclasses import dataclass @@ -34,7 +34,7 @@ def from_json(cls, data) -> "BoundingBox": @dataclass class BoundingBoxes: - """A class to represent the bounding boxes of depth columns and associated material descriptions.""" + """A class to represent the bounding boxes of sidebars and associated material descriptions.""" sidebar_bbox: BoundingBox | None depth_column_entry_bboxes: list[BoundingBox] diff --git 
a/src/stratigraphy/depths_materials_column_pairs/material_description_rect_with_sidebar.py b/src/stratigraphy/depths_materials_column_pairs/material_description_rect_with_sidebar.py index ec1d9557..8d4aa39b 100644 --- a/src/stratigraphy/depths_materials_column_pairs/material_description_rect_with_sidebar.py +++ b/src/stratigraphy/depths_materials_column_pairs/material_description_rect_with_sidebar.py @@ -1,4 +1,4 @@ -"""Definition of the DepthsMaterialsColumnPairs class.""" +"""Definition of the MaterialDescriptionRectWithSidebar class.""" import math from dataclasses import dataclass From 7c0518256c73cde373b386c09debaf2fcdac7002 Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Tue, 12 Nov 2024 15:12:06 +0100 Subject: [PATCH 12/20] LGVISIUM-102: create Sidebar Extractor classes --- .../depthcolumn/depthcolumnentry.py | 85 +++++- src/stratigraphy/extract.py | 14 +- src/stratigraphy/sidebar/__init__.py | 12 +- .../sidebar/a_above_b_sidebar_extractor.py | 74 ++++++ .../sidebar/a_above_b_sidebar_validator.py | 3 +- .../sidebar/a_to_b_sidebar_extractor.py | 77 ++++++ src/stratigraphy/sidebar/find_sidebars.py | 242 ------------------ .../sidebar/layer_identifier_sidebar.py | 38 ++- tests/test_find_sidebar.py | 191 ++++++-------- 9 files changed, 372 insertions(+), 364 deletions(-) create mode 100644 src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py create mode 100644 src/stratigraphy/sidebar/a_to_b_sidebar_extractor.py diff --git a/src/stratigraphy/depthcolumn/depthcolumnentry.py b/src/stratigraphy/depthcolumn/depthcolumnentry.py index c8f56905..ac469bb0 100644 --- a/src/stratigraphy/depthcolumn/depthcolumnentry.py +++ b/src/stratigraphy/depthcolumn/depthcolumnentry.py @@ -1,8 +1,12 @@ """Contains dataclasses for entries in a depth column.""" +from __future__ import annotations + +import re from typing import Any import fitz +from stratigraphy.lines.line import TextWord class DepthColumnEntry: # noqa: D101 @@ -20,7 +24,7 @@ def to_json(self) -> dict[str, Any]: 
return {"value": self.value, "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1]} @classmethod - def from_json(cls, data: dict) -> "DepthColumnEntry": + def from_json(cls, data: dict) -> DepthColumnEntry: """Converts a dictionary to an object. Args: @@ -31,6 +35,42 @@ def from_json(cls, data: dict) -> "DepthColumnEntry": """ return cls(rect=fitz.Rect(data["rect"]), value=data["value"]) + @classmethod + def find_in_words(cls, all_words: list[TextWord], include_splits: bool) -> list[DepthColumnEntry]: + """Find all depth column entries given a list of TextWord objects. + + Note: Only depths up to two digits before the decimal point are supported. + + Args: + all_words (list[TextWord]): List of text words to extract depth column entries from. + include_splits (bool): Whether to include split entries. + + Returns: + list[DepthColumnEntry]: The extracted depth column entries. + """ + entries = [] + for word in sorted(all_words, key=lambda word: word.rect.y0): + try: + input_string = word.text.strip().replace(",", ".") + regex = re.compile(r"^-?\.?([0-9]+(\.[0-9]+)?)[müMN\\.]*$") + # numbers such as '.40' are not supported. The reason is that sometimes the OCR + # recognizes a '-' as a '.' and we just ommit the leading '.' to avoid this issue. + match = regex.match(input_string) + if match: + value = value_as_float(match.group(1)) + entries.append(DepthColumnEntry(word.rect, value)) + elif include_splits: + # support for e.g. 
"1.10-1.60m" extracted as a single word + layer_depth_column_entry = AToBDepthColumnEntry.from_text(input_string, word.rect) + entries.extend( + [layer_depth_column_entry.start, layer_depth_column_entry.end] + if layer_depth_column_entry + else [] + ) + except ValueError: + pass + return entries + class AToBDepthColumnEntry: # noqa: D101 """Class to represent a layer depth column entry.""" @@ -56,7 +96,7 @@ def to_json(self) -> dict[str, Any]: } @classmethod - def from_json(cls, data: dict) -> "AToBDepthColumnEntry": + def from_json(cls, data: dict) -> AToBDepthColumnEntry: """Converts a dictionary to an object. Args: @@ -68,3 +108,44 @@ def from_json(cls, data: dict) -> "AToBDepthColumnEntry": start = DepthColumnEntry.from_json(data["start"]) end = DepthColumnEntry.from_json(data["end"]) return cls(start, end) + + @classmethod + def from_text( + cls, text: str, rect: fitz.Rect, require_start_of_string: bool = True + ) -> AToBDepthColumnEntry | None: + """Attempts to extract a AToBDepthColumnEntry from a string. + + Args: + text (str): The string to extract the depth interval from. + rect (fitz.Rect): The rectangle of the text. + require_start_of_string (bool, optional): Whether the number to extract needs to be + at the start of a string. Defaults to True. + + Returns: + AToBDepthColumnEntry | None: The extracted LayerDepthColumnEntry or None if none is found. + """ + input_string = text.strip().replace(",", ".") + + query = r"-?([0-9]+(\.[0-9]+)?)[müMN\]*[\s-]+([0-9]+(\.[0-9]+)?)[müMN\\.]*" + if not require_start_of_string: + query = r".*?" 
+ query + regex = re.compile(query) + match = regex.match(input_string) + if match: + value1 = value_as_float(match.group(1)) + first_half_rect = fitz.Rect(rect.x0, rect.y0, rect.x1 - rect.width / 2, rect.y1) + + value2 = value_as_float(match.group(3)) + second_half_rect = fitz.Rect(rect.x0 + rect.width / 2, rect.y0, rect.x1, rect.y1) + return AToBDepthColumnEntry( + DepthColumnEntry(first_half_rect, value1), + DepthColumnEntry(second_half_rect, value2), + ) + return None + + +def value_as_float(string_value: str) -> float: # noqa: D103 + """Converts a string to a float.""" + # OCR sometimes tends to miss the decimal comma + parsed_text = re.sub(r"^-?([0-9]+)([0-9]{2})", r"\1.\2", string_value) + return abs(float(parsed_text)) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index bfb43e54..981bbe51 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -12,7 +12,7 @@ ) from stratigraphy.layer.layer import IntervalBlockPair, Layer from stratigraphy.lines.line import TextLine -from stratigraphy.sidebar import Sidebar, find_sidebars +from stratigraphy.sidebar import AAboveBSidebarExtractor, AToBSidebarExtractor, Sidebar from stratigraphy.sidebar.layer_identifier_sidebar import ( find_layer_identifier_sidebar_entries, find_layer_identifier_sidebars, @@ -84,23 +84,17 @@ def process_page( # We could also think of some scoring mechanism to decide which one to use. 
if not material_descriptions_sidebar_pairs: words = [word for line in lines for word in line.words] - depth_column_entries = find_sidebars.depth_column_entries(words, include_splits=True) - a_to_b_sidebars = find_sidebars.find_a_to_b_sidebars(depth_column_entries, words) + a_to_b_sidebars = AToBSidebarExtractor.find_in_words(words) used_entry_rects = [] for column in a_to_b_sidebars: for entry in column.entries: used_entry_rects.extend([entry.start.rect, entry.end.rect]) - depth_column_entries = [ - entry - for entry in find_sidebars.depth_column_entries(words, include_splits=False) - if entry.rect not in used_entry_rects - ] sidebars: list[Sidebar] = a_to_b_sidebars sidebars.extend( - find_sidebars.find_a_above_b_sidebars( - depth_column_entries, words, sidebar_params=params["depth_column_params"] + AAboveBSidebarExtractor.find_in_words( + words, used_entry_rects, sidebar_params=params["depth_column_params"] ) ) diff --git a/src/stratigraphy/sidebar/__init__.py b/src/stratigraphy/sidebar/__init__.py index fbb94fba..0ff3e45c 100644 --- a/src/stratigraphy/sidebar/__init__.py +++ b/src/stratigraphy/sidebar/__init__.py @@ -1,9 +1,19 @@ """Modules for Sidebars, representing depths or other data displayed to the side of material descriptions.""" from .a_above_b_sidebar import AAboveBSidebar +from .a_above_b_sidebar_extractor import AAboveBSidebarExtractor from .a_above_b_sidebar_validator import AAboveBSidebarValidator from .a_to_b_sidebar import AToBSidebar +from .a_to_b_sidebar_extractor import AToBSidebarExtractor from .layer_identifier_sidebar import LayerIdentifierSidebar from .sidebar import Sidebar -__all__ = ["Sidebar", "AAboveBSidebar", "AToBSidebar", "LayerIdentifierSidebar", "AAboveBSidebarValidator"] +__all__ = [ + "Sidebar", + "AAboveBSidebar", + "AAboveBSidebarExtractor", + "AAboveBSidebarValidator", + "AToBSidebar", + "AToBSidebarExtractor", + "LayerIdentifierSidebar", +] diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py 
b/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py new file mode 100644 index 00000000..68245963 --- /dev/null +++ b/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py @@ -0,0 +1,74 @@ +"""Module for finding AAboveBSidebar instances in a borehole profile.""" + +import fitz + +from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry +from stratigraphy.lines.line import TextWord +from stratigraphy.sidebar.a_above_b_sidebar import AAboveBSidebar +from stratigraphy.sidebar.a_above_b_sidebar_validator import AAboveBSidebarValidator + + +class AAboveBSidebarExtractor: + """Class that finds AAboveBSidebar instances in a borehole profile.""" + + @staticmethod + def find_in_words( + all_words: list[TextWord], used_entry_rects: list[fitz.Rect], sidebar_params: dict + ) -> list[AAboveBSidebar]: + """Construct all possible AAboveBSidebar objects from the given DepthColumnEntry objects. + + Args: + all_words (list[TextLine]): All words in the page. + used_entry_rects (list[fitz.Rect]): Part of the document to ignore. + sidebar_params (dict): Parameters for the BoundaryDepthColumn objects. + + Returns: + list[AAboveBSidebar]: Found BoundaryDepthColumn objects. 
+ """ + entries = [ + entry + for entry in DepthColumnEntry.find_in_words(all_words, include_splits=False) + if entry.rect not in used_entry_rects + ] + + numeric_columns: list[AAboveBSidebar] = [] + for entry in entries: + has_match = False + additional_columns = [] + for column in numeric_columns: + if column.can_be_appended(entry.rect): + has_match = True + column.entries.append(entry) + else: + valid_initial_segment = column.valid_initial_segment(entry.rect) + if len(valid_initial_segment.entries) > 0: + has_match = True + valid_initial_segment.entries.append(entry) + additional_columns.append(valid_initial_segment) + + numeric_columns.extend(additional_columns) + if not has_match: + numeric_columns.append(AAboveBSidebar(entries=[entry])) + + # only keep columns that are not contained in a different column + numeric_columns = [ + column + for column in numeric_columns + if all(not other.strictly_contains(column) for other in numeric_columns) + ] + + boundary_depth_column_validator = AAboveBSidebarValidator(all_words, **sidebar_params) + + numeric_columns = [ + boundary_depth_column_validator.reduce_until_valid(column) + for numeric_column in numeric_columns + for column in numeric_column.break_on_double_descending() + # when we have a perfect arithmetic progression, this is usually just a scale + # that does not match the descriptions + if not column.significant_arithmetic_progression() + ] + + return sorted( + [column for column in numeric_columns if column and boundary_depth_column_validator.is_valid(column)], + key=lambda column: len(column.entries), + ) diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py b/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py index c64d49bc..0dfbfe9a 100644 --- a/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py +++ b/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py @@ -4,7 +4,8 @@ from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry from stratigraphy.lines.line import 
TextWord -from stratigraphy.sidebar import AAboveBSidebar + +from .a_above_b_sidebar import AAboveBSidebar @dataclasses.dataclass diff --git a/src/stratigraphy/sidebar/a_to_b_sidebar_extractor.py b/src/stratigraphy/sidebar/a_to_b_sidebar_extractor.py new file mode 100644 index 00000000..2751e4d7 --- /dev/null +++ b/src/stratigraphy/sidebar/a_to_b_sidebar_extractor.py @@ -0,0 +1,77 @@ +"""Module for finding AToBSidebar instances in a borehole profile.""" + +import re + +from stratigraphy.depthcolumn.depthcolumnentry import AToBDepthColumnEntry, DepthColumnEntry +from stratigraphy.lines.line import TextWord +from stratigraphy.sidebar import AToBSidebar + + +class AToBSidebarExtractor: + """Class that finds AToBSidebar instances in a borehole profile.""" + + @staticmethod + def find_in_words(all_words: list[TextWord]) -> list[AToBSidebar]: + """Finds all AToBSidebars. + + Generates a list of AToBDepthColumnEntry objects by finding consecutive pairs of DepthColumnEntry objects. + Different columns are grouped together in LayerDepthColumn objects. Finally, a list of AToBSidebars objects, + one for each column, is returned. + + A layer corresponds to a material layer. The layer is defined using a start and end point (e.g. 1.10-1.60m). + The start and end points are represented as DepthColumnEntry objects. + + Args: + all_words (list[TextWord]): List of all TextWord objects. + + Returns: + list[AToBSidebar]: List of all AToBSidebars identified. 
+ """ + entries = DepthColumnEntry.find_in_words(all_words, include_splits=True) + + def find_pair(entry: DepthColumnEntry) -> DepthColumnEntry | None: # noqa: D103 + min_y0 = entry.rect.y0 - entry.rect.height / 2 + max_y0 = entry.rect.y0 + entry.rect.height / 2 + for other in entries: + if entry == other: + continue + if other.value <= entry.value: + continue + combined_width = entry.rect.width + other.rect.width + if not entry.rect.x0 <= other.rect.x0 <= entry.rect.x0 + combined_width: + continue + if not min_y0 <= other.rect.y0 <= max_y0: + continue + in_between_text = " ".join( + [ + word.text + for word in all_words + if entry.rect.x0 < word.rect.x0 < other.rect.x0 and min_y0 <= word.rect.y0 <= max_y0 + ] + ) + if re.fullmatch(r"\W*m?\W*", in_between_text): + return other + + pairs = [(entry, find_pair(entry)) for entry in entries] + + sidebars = [] + for first, second in pairs: + if second is not None: + entry = AToBDepthColumnEntry(first, second) + is_matched = False + for sidebar in sidebars: + column_rect = sidebar.rect() + new_start_middle = (entry.start.rect.x0 + entry.start.rect.x1) / 2 + if column_rect.x0 < new_start_middle < column_rect.x1: + is_matched = True + sidebar.entries.append(entry) + + if not is_matched: + sidebars.append(AToBSidebar([entry])) + + return [ + sidebar_segment + for sidebar in sidebars + for sidebar_segment in sidebar.break_on_mismatch() + if sidebar_segment.is_valid() + ] diff --git a/src/stratigraphy/sidebar/find_sidebars.py b/src/stratigraphy/sidebar/find_sidebars.py index 3f09460b..ce2d4599 100644 --- a/src/stratigraphy/sidebar/find_sidebars.py +++ b/src/stratigraphy/sidebar/find_sidebars.py @@ -1,243 +1 @@ """This module contains functionalities to find sidebars in a pdf page.""" - -import re - -import fitz - -from stratigraphy.depthcolumn.depthcolumnentry import AToBDepthColumnEntry, DepthColumnEntry -from stratigraphy.lines.line import TextWord -from stratigraphy.sidebar import AAboveBSidebar, AAboveBSidebarValidator, 
AToBSidebar -from stratigraphy.text.textblock import TextBlock -from stratigraphy.util.interval import AToBInterval - - -def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> list[DepthColumnEntry]: - """Find all depth column entries given a list of TextLine objects. - - Note: Only depths up to two digits before the decimal point are supported. - - Args: - all_words (list[TextWord]): List of text words to extract depth column entries from. - include_splits (bool): Whether to include split entries. - - Returns: - list[DepthColumnEntry]: The extracted depth column entries. - """ - entries = [] - for word in sorted(all_words, key=lambda word: word.rect.y0): - try: - input_string = word.text.strip().replace(",", ".") - regex = re.compile(r"^-?\.?([0-9]+(\.[0-9]+)?)[müMN\\.]*$") - # numbers such as '.40' are not supported. The reason is that sometimes the OCR - # recognizes a '-' as a '.' and we just ommit the leading '.' to avoid this issue. - match = regex.match(input_string) - if match: - value = value_as_float(match.group(1)) - entries.append(DepthColumnEntry(word.rect, value)) - elif include_splits: - # support for e.g. "1.10-1.60m" extracted as a single word - layer_depth_column_entry = extract_layer_depth_interval(input_string, word.rect) - entries.extend( - [layer_depth_column_entry.start, layer_depth_column_entry.end] if layer_depth_column_entry else [] - ) - except ValueError: - pass - return entries - - -def value_as_float(string_value: str) -> float: # noqa: D103 - """Converts a string to a float.""" - # OCR sometimes tends to miss the decimal comma - parsed_text = re.sub(r"^-?([0-9]+)([0-9]{2})", r"\1.\2", string_value) - return abs(float(parsed_text)) - - -def extract_layer_depth_interval( - text: str, rect: fitz.Rect, require_start_of_string: bool = True -) -> AToBDepthColumnEntry | None: - """Extracts a LayerDepthColumnEntry from a string. - - Args: - text (str): The string to extract the depth interval from. 
- rect (fitz.Rect): The rectangle of the text. - require_start_of_string (bool, optional): Whether the number to extract needs to be - at the start of a string. Defaults to True. - - Returns: - AToBDepthColumnEntry | None: The extracted LayerDepthColumnEntry or None if none is found. - """ - input_string = text.strip().replace(",", ".") - - query = r"-?([0-9]+(\.[0-9]+)?)[müMN\]*[\s-]+([0-9]+(\.[0-9]+)?)[müMN\\.]*" - if not require_start_of_string: - query = r".*?" + query - regex = re.compile(query) - match = regex.match(input_string) - if match: - value1 = value_as_float(match.group(1)) - first_half_rect = fitz.Rect(rect.x0, rect.y0, rect.x1 - rect.width / 2, rect.y1) - - value2 = value_as_float(match.group(3)) - second_half_rect = fitz.Rect(rect.x0 + rect.width / 2, rect.y0, rect.x1, rect.y1) - return AToBDepthColumnEntry( - DepthColumnEntry(first_half_rect, value1), - DepthColumnEntry(second_half_rect, value2), - ) - return None - - -def find_a_to_b_sidebars(entries: list[DepthColumnEntry], all_words: list[TextWord]) -> list[AToBSidebar]: - """Finds all AToBSidebars. - - Generates a list of LayerDepthColumnEntry objects by finding consecutive pairs of DepthColumnEntry objects. - Different columns are grouped together in LayerDepthColumn objects. Finally, a list of AToBSidebars objects, - one for each column, is returned. - - A layer corresponds to a material layer. The layer is defined using a start and end point (e.g. 1.10-1.60m). - The start and end points are represented as DepthColumnEntry objects. - - Args: - entries (list[DepthColumnEntry]): List of depth column entries. - all_words (list[TextWord]): List of all TextWord objects. - - Returns: - list[AToBSidebar]: List of all AToBSidebars identified. 
- """ - - def find_pair(entry: DepthColumnEntry) -> DepthColumnEntry | None: # noqa: D103 - min_y0 = entry.rect.y0 - entry.rect.height / 2 - max_y0 = entry.rect.y0 + entry.rect.height / 2 - for other in entries: - if entry == other: - continue - if other.value <= entry.value: - continue - combined_width = entry.rect.width + other.rect.width - if not entry.rect.x0 <= other.rect.x0 <= entry.rect.x0 + combined_width: - continue - if not min_y0 <= other.rect.y0 <= max_y0: - continue - in_between_text = " ".join( - [ - word.text - for word in all_words - if entry.rect.x0 < word.rect.x0 < other.rect.x0 and min_y0 <= word.rect.y0 <= max_y0 - ] - ) - if re.fullmatch(r"\W*m?\W*", in_between_text): - return other - - pairs = [(entry, find_pair(entry)) for entry in entries] - - sidebars = [] - for first, second in pairs: - if second is not None: - entry = AToBDepthColumnEntry(first, second) - is_matched = False - for sidebar in sidebars: - column_rect = sidebar.rect() - new_start_middle = (entry.start.rect.x0 + entry.start.rect.x1) / 2 - if column_rect.x0 < new_start_middle < column_rect.x1: - is_matched = True - sidebar.entries.append(entry) - - if not is_matched: - sidebars.append(AToBSidebar([entry])) - - return [ - sidebar_segment - for sidebar in sidebars - for sidebar_segment in sidebar.break_on_mismatch() - if sidebar_segment.is_valid() - ] - - -def find_a_above_b_sidebars( - entries: list[DepthColumnEntry], all_words: list[TextWord], sidebar_params: dict -) -> list[AAboveBSidebar]: - """Construct all possible AAboveBSidebar objects from the given DepthColumnEntry objects. - - Args: - entries (list[DepthColumnEntry]): All found depth column entries in the page. - all_words (list[TextLine]): All words in the page. - sidebar_params (dict): Parameters for the BoundaryDepthColumn objects. - - Returns: - list[AAboveBSidebar]: Found BoundaryDepthColumn objects. 
- """ - numeric_columns: list[AAboveBSidebar] = [] - for entry in entries: - has_match = False - additional_columns = [] - for column in numeric_columns: - if column.can_be_appended(entry.rect): - has_match = True - column.entries.append(entry) - else: - valid_initial_segment = column.valid_initial_segment(entry.rect) - if len(valid_initial_segment.entries) > 0: - has_match = True - valid_initial_segment.entries.append(entry) - additional_columns.append(valid_initial_segment) - - numeric_columns.extend(additional_columns) - if not has_match: - numeric_columns.append(AAboveBSidebar(entries=[entry])) - - # only keep columns that are not contained in a different column - numeric_columns = [ - column - for column in numeric_columns - if all(not other.strictly_contains(column) for other in numeric_columns) - ] - - boundary_depth_column_validator = AAboveBSidebarValidator(all_words, **sidebar_params) - - numeric_columns = [ - boundary_depth_column_validator.reduce_until_valid(column) - for numeric_column in numeric_columns - for column in numeric_column.break_on_double_descending() - # when we have a perfect arithmetic progression, this is usually just a scale - # that does not match the descriptions - if not column.significant_arithmetic_progression() - ] - - return sorted( - [column for column in numeric_columns if column and boundary_depth_column_validator.is_valid(column)], - key=lambda column: len(column.entries), - ) - - -def get_depth_interval_from_textblock(block: TextBlock) -> AToBInterval | None: - """Extract depth interval from a material description block. - - For borehole profiles in the Deriaz layout, the depth interval is usually found in the text description - of the material. Often, these text descriptions contain a further separation into multiple sub layers. - These sub layers have their own depth intervals. This function extracts the overall depth interval, - spanning across all mentioned sub layers. 
- - Args: - block (TextBlock): The block to calculate the depth interval for. - - Returns: - AToBInterval | None: The depth interval. - """ - depth_entries = [] - for line in block.lines: - try: - layer_depth_entry = extract_layer_depth_interval(line.text, line.rect, require_start_of_string=False) - # require_start_of_string = False because the depth interval may not always start at the beginning - # of the line e.g. "Remblais Heterogene: 0.00 - 0.5m" - if layer_depth_entry: - depth_entries.append(layer_depth_entry) - except ValueError: - pass - - if depth_entries: - # Merge the sub layers into one depth interval. - start = min([entry.start for entry in depth_entries], key=lambda start_entry: start_entry.value) - end = max([entry.end for entry in depth_entries], key=lambda end_entry: end_entry.value) - - return AToBInterval(AToBDepthColumnEntry(start, end)) - else: - return None diff --git a/src/stratigraphy/sidebar/layer_identifier_sidebar.py b/src/stratigraphy/sidebar/layer_identifier_sidebar.py index a7dcc976..6553b382 100644 --- a/src/stratigraphy/sidebar/layer_identifier_sidebar.py +++ b/src/stratigraphy/sidebar/layer_identifier_sidebar.py @@ -7,10 +7,11 @@ from stratigraphy.layer.layer import IntervalBlockGroup from stratigraphy.lines.line import TextLine -from stratigraphy.sidebar.find_sidebars import get_depth_interval_from_textblock from stratigraphy.text.textblock import TextBlock from stratigraphy.util.dataclasses import Line +from ..depthcolumn.depthcolumnentry import AToBDepthColumnEntry +from ..util.interval import AToBInterval from .sidebar import Sidebar @@ -209,3 +210,38 @@ def find_layer_identifier_sidebars(entries: list[LayerIdentifierEntry]) -> list[ break layer_identifier_sidebars = [column for column in layer_identifier_sidebars if len(column.entries) > 2] return layer_identifier_sidebars + + +def get_depth_interval_from_textblock(block: TextBlock) -> AToBInterval | None: + """Extract depth interval from a material description block. 
+ + For borehole profiles in the Deriaz layout, the depth interval is usually found in the text description + of the material. Often, these text descriptions contain a further separation into multiple sub layers. + These sub layers have their own depth intervals. This function extracts the overall depth interval, + spanning across all mentioned sub layers. + + Args: + block (TextBlock): The block to calculate the depth interval for. + + Returns: + AToBInterval | None: The depth interval. + """ + depth_entries = [] + for line in block.lines: + try: + layer_depth_entry = AToBDepthColumnEntry.from_text(line.text, line.rect, require_start_of_string=False) + # require_start_of_string = False because the depth interval may not always start at the beginning + # of the line e.g. "Remblais Heterogene: 0.00 - 0.5m" + if layer_depth_entry: + depth_entries.append(layer_depth_entry) + except ValueError: + pass + + if depth_entries: + # Merge the sub layers into one depth interval. + start = min([entry.start for entry in depth_entries], key=lambda start_entry: start_entry.value) + end = max([entry.end for entry in depth_entries], key=lambda end_entry: end_entry.value) + + return AToBInterval(AToBDepthColumnEntry(start, end)) + else: + return None diff --git a/tests/test_find_sidebar.py b/tests/test_find_sidebar.py index ffcaa539..51178ec6 100644 --- a/tests/test_find_sidebar.py +++ b/tests/test_find_sidebar.py @@ -4,42 +4,20 @@ import pytest from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry from stratigraphy.lines.line import TextWord -from stratigraphy.sidebar.find_sidebars import ( - depth_column_entries, - find_a_above_b_sidebars, - find_a_to_b_sidebars, -) +from stratigraphy.sidebar import AAboveBSidebarExtractor, AToBSidebarExtractor PAGE_NUMBER = 1 -ALL_WORDS_FIND_DEPTH_COLUMN = [ - TextWord(fitz.Rect(0, 0, 5, 1), "10.00m", PAGE_NUMBER), - TextWord(fitz.Rect(20, 0, 30, 1), "Kies, Torf und Sand", PAGE_NUMBER), - TextWord(fitz.Rect(20, 2, 30, 3), "Kies, 
verwittert.", PAGE_NUMBER), - TextWord(fitz.Rect(0, 2, 5, 3), "20.0m", PAGE_NUMBER), - TextWord(fitz.Rect(0, 4, 5, 5), "30.0m", PAGE_NUMBER), - TextWord(fitz.Rect(0, 6, 5, 7), "40.0m", PAGE_NUMBER), - TextWord(fitz.Rect(0, 8, 5, 9), "50.0m", PAGE_NUMBER), -] -ALL_WORDS_FIND_LAYER_DEPTH_COLUMN = [ - TextWord(fitz.Rect(0, 0, 5, 1), "12.00-20.0m", PAGE_NUMBER), - TextWord(fitz.Rect(20, 0, 30, 1), "Kies, Torf und Sand", PAGE_NUMBER), - TextWord(fitz.Rect(20, 2, 30, 3), "Kies, verwittert.", PAGE_NUMBER), - TextWord(fitz.Rect(0, 2, 5, 3), "20.0-34.0m", PAGE_NUMBER), - TextWord(fitz.Rect(0, 4, 5, 5), "34.0 - 40.0m", PAGE_NUMBER), - TextWord(fitz.Rect(0, 6, 5, 7), "40.0-50m", PAGE_NUMBER), - TextWord(fitz.Rect(0, 8, 5, 9), "50.0-60m", PAGE_NUMBER), -] def test_depth_column_entries(): # noqa: D103 - """Test the depth_column_entries function.""" + """Test the DepthColumnEntry.find_in_words function.""" all_words = [ TextWord(fitz.Rect(0, 0, 5, 1), "10.00m", PAGE_NUMBER), TextWord(fitz.Rect(0, 2, 5, 3), "20.0m", PAGE_NUMBER), TextWord(fitz.Rect(0, 4, 5, 5), "30.0m", PAGE_NUMBER), TextWord(fitz.Rect(0, 6, 5, 7), "40.0m", PAGE_NUMBER), ] - entries = depth_column_entries(all_words, include_splits=False) + entries = DepthColumnEntry.find_in_words(all_words, include_splits=False) assert len(entries) == 4, "There should be 4 entries" assert pytest.approx(entries[0].value) == 10.0, "The first entry should have a value of 10.0" assert pytest.approx(entries[1].value) == 20.0, "The second entry should have a value of 20.0" @@ -48,12 +26,12 @@ def test_depth_column_entries(): # noqa: D103 def test_depth_column_entries_with_splits(): # noqa: D103 - """Test the depth_column_entries function with include_splits=True.""" + """Test the DepthColumnEntry.find_in_words function with include_splits=True.""" all_words = [ TextWord(fitz.Rect(0, 0, 10, 1), "10.00-20.0m", PAGE_NUMBER), TextWord(fitz.Rect(0, 2, 10, 3), "30.0-40.0m", PAGE_NUMBER), ] - entries = depth_column_entries(all_words, 
include_splits=True) + entries = DepthColumnEntry.find_in_words(all_words, include_splits=True) assert len(entries) == 4, "There should be 4 entries" assert entries[0].value == 10.0, "The first entry should have a value of 10.0" assert entries[1].value == 20.0, "The second entry should have a value of 20.0" @@ -69,7 +47,7 @@ def test_depth_column_entries_with_leading_character(): # noqa: D103 TextWord(fitz.Rect(0, 4, 5, 5), "-3.0m", PAGE_NUMBER), TextWord(fitz.Rect(0, 6, 5, 7), ".4.2m", PAGE_NUMBER), ] - entries = depth_column_entries(all_words, include_splits=True) + entries = DepthColumnEntry.find_in_words(all_words, include_splits=True) assert len(entries) == 4, "There should be 4 entries" assert entries[0].value == 0.0, "The first entry should have a value of 0" assert entries[1].value == 2.0, "The second entry should have a value of 2.0" @@ -77,36 +55,35 @@ def test_depth_column_entries_with_leading_character(): # noqa: D103 assert entries[3].value == 4.2, "The fourth entry should have a value of 4.2" -def test_find_depth_columns_arithmetic_progression(): # noqa: D103 - """Test the find_depth_columns function with an arithmetic progression.""" - entries = [ - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 10.0), - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0), - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 30.0), - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0), - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0), +def test_aabovebsidebarextractor_arithmetic_progression(): # noqa: D103 + all_words = [ + TextWord(fitz.Rect(0, 0, 5, 1), "10.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 2, 5, 3), "20.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 4, 5, 5), "30.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 6, 5, 7), "40.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 8, 5, 9), "50.0", PAGE_NUMBER), ] - columns = find_a_above_b_sidebars( - entries, - ALL_WORDS_FIND_DEPTH_COLUMN, + """Test the AAboveBSidebarExtractor with an arithmetic progression.""" + columns = AAboveBSidebarExtractor.find_in_words( 
+ all_words, + used_entry_rects=[], sidebar_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, ) assert len(columns) == 0, "There should be 0 columns as the above is a perfect arithmetic progression" -def test_find_depth_columns(): # noqa: D103 - """Test the find_depth_columns function.""" - entries = [ - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0), - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0), - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0), - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0), - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0), +def test_aabovebsidebarextractor(): # noqa: D103 + """Test the AAboveBSidebarExtractor.""" + all_words = [ + TextWord(fitz.Rect(0, 0, 5, 1), "12.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 2, 5, 3), "20.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 4, 5, 5), "34.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 6, 5, 7), "40.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 8, 5, 9), "50.0", PAGE_NUMBER), ] - - columns = find_a_above_b_sidebars( - entries, - ALL_WORDS_FIND_DEPTH_COLUMN, + columns = AAboveBSidebarExtractor.find_in_words( + all_words, + used_entry_rects=[], sidebar_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, ) assert len(columns) == 1, "There should be 1 column" @@ -118,24 +95,25 @@ def test_find_depth_columns(): # noqa: D103 assert pytest.approx(columns[0].entries[4].value) == 50.0, "The fourth entry should have a value of 50.0" -def test_two_columns_find_depth_columns(): # noqa: D103 - """Test the find_depth_columns function with two columns.""" - entries = [ # first depth column - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0), - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0), - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0), - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0), - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0), - DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 12.0), # second depth column - DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 20.0), - DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 34.0), - 
DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 40.0), - DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 50.0), - DepthColumnEntry(fitz.Rect(20, 10, 25, 11), 61.0), +def test_aabovebsidebarextractor_two_column(): # noqa: D103 + """Test the AAboveBSidebarExtractor function with two columns.""" + all_words = [ # first depth column + TextWord(fitz.Rect(0, 0, 5, 1), "12.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 2, 5, 3), "20.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 4, 5, 5), "34.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 6, 5, 7), "40.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 8, 5, 9), "50.0", PAGE_NUMBER), + TextWord(fitz.Rect(20, 0, 25, 1), "12.0", PAGE_NUMBER), # second depth column + TextWord(fitz.Rect(20, 2, 25, 3), "20.0", PAGE_NUMBER), + TextWord(fitz.Rect(20, 4, 25, 5), "34.0", PAGE_NUMBER), + TextWord(fitz.Rect(20, 6, 25, 7), "40.0", PAGE_NUMBER), + TextWord(fitz.Rect(20, 8, 25, 9), "50.0", PAGE_NUMBER), + TextWord(fitz.Rect(20, 10, 25, 11), "61.0", PAGE_NUMBER), ] - columns = find_a_above_b_sidebars( - entries, - ALL_WORDS_FIND_DEPTH_COLUMN, + + columns = AAboveBSidebarExtractor.find_in_words( + all_words, + used_entry_rects=[], sidebar_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, ) assert len(columns) == 2, "There should be 2 columns" @@ -143,22 +121,21 @@ def test_two_columns_find_depth_columns(): # noqa: D103 assert len(columns[1].entries) == 6, "The second column should have 6 entries" -def test_find_layer_depth_columns(): # noqa: D103 - """Test the find_layer_depth_columns function.""" - entries = [ - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0), # layer 12.0-20.0m - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 20.0), - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0), # layer 20.0-34.0m - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 34.0), - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0), # layer 34.0-40.0m - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 40.0), - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0), # layer 40.0-50.0m - DepthColumnEntry(fitz.Rect(0, 6, 
5, 7), 50.0), - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0), # layer 50.0-60.0m - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 60.0), +def test_atobsidebarextractor(): # noqa: D103 + """Test the AToBSidebarExtractor.""" + all_words = [ + TextWord(fitz.Rect(0, 0, 5, 1), "12.0", PAGE_NUMBER), # layer 12.0-20.0m + TextWord(fitz.Rect(0, 0, 5, 1), "20.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 2, 5, 3), "20.0", PAGE_NUMBER), # layer 20.0-34.0m + TextWord(fitz.Rect(0, 2, 5, 3), "34.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 4, 5, 5), "34.0", PAGE_NUMBER), # layer 34.0-40.0m + TextWord(fitz.Rect(0, 4, 5, 5), "40.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 6, 5, 7), "40.0", PAGE_NUMBER), # layer 40.0-50.0m + TextWord(fitz.Rect(0, 6, 5, 7), "50.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 8, 5, 9), "50.0", PAGE_NUMBER), # layer 50.0-60.0m + TextWord(fitz.Rect(0, 8, 5, 9), "60.0", PAGE_NUMBER), ] - - columns = find_a_to_b_sidebars(entries, ALL_WORDS_FIND_DEPTH_COLUMN) + columns = AToBSidebarExtractor.find_in_words(all_words) assert len(columns) == 1, "There should be 1 column" assert len(columns[0].entries) == 5, "The column should have 5 entries" assert columns[0].entries[0].start.value == 12.0, "The first entry should have a value of 12.0" @@ -173,32 +150,32 @@ def test_find_layer_depth_columns(): # noqa: D103 assert columns[0].entries[4].end.value == 60.0, "The fourth entry should have a value of 60.0" -def test_two_columns_find_layer_depth_columns(): # noqa: D103 - """Test the find_layer_depth_columns function with two columns.""" - entries = [ # first depth column - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0), # layer 12.0-20.0m - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 20.0), - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0), # layer 20.0-34.0m - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 34.0), - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0), # layer 34.0-40.0m - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 40.0), - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0), # layer 40.0-50.0m - 
DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 50.0), - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0), # layer 50.0-60.0m - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 60.0), +def test_atobsidebarextractor_two_columns(): # noqa: D103 + """Test the AToBSidebarExtractor with two columns.""" + all_words = [ # first depth column + TextWord(fitz.Rect(0, 0, 5, 1), "12.0", PAGE_NUMBER), # layer 12.0-20.0m + TextWord(fitz.Rect(0, 0, 5, 1), "20.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 2, 5, 3), "20.0", PAGE_NUMBER), # layer 20.0-34.0m + TextWord(fitz.Rect(0, 2, 5, 3), "34.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 4, 5, 5), "34.0", PAGE_NUMBER), # layer 34.0-40.0m + TextWord(fitz.Rect(0, 4, 5, 5), "40.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 6, 5, 7), "40.0", PAGE_NUMBER), # layer 40.0-50.0m + TextWord(fitz.Rect(0, 6, 5, 7), "50.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 8, 5, 9), "50.0", PAGE_NUMBER), # layer 50.0-60.0m + TextWord(fitz.Rect(0, 8, 5, 9), "60.0", PAGE_NUMBER), # second depth column - DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 12.0), # layer 12.0-20.0m - DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 20.0), - DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 20.0), # layer 20.0-34.0m - DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 34.0), - DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 34.0), # layer 34.0-40.0m - DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 40.0), - DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 40.0), # layer 40.0-50.0m - DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 50.0), - DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 50.0), # layer 50.0-60.0m - DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 60.0), + TextWord(fitz.Rect(20, 0, 25, 1), "12.0", PAGE_NUMBER), # layer 12.0-20.0m + TextWord(fitz.Rect(20, 0, 25, 1), "20.0", PAGE_NUMBER), + TextWord(fitz.Rect(20, 2, 25, 3), "20.0", PAGE_NUMBER), # layer 20.0-34.0m + TextWord(fitz.Rect(20, 2, 25, 3), "34.0", PAGE_NUMBER), + TextWord(fitz.Rect(20, 4, 25, 5), "34.0", PAGE_NUMBER), # layer 34.0-40.0m + TextWord(fitz.Rect(20, 4, 25, 5), "40.0", 
PAGE_NUMBER), + TextWord(fitz.Rect(20, 6, 25, 7), "40.0", PAGE_NUMBER), # layer 40.0-50.0m + TextWord(fitz.Rect(20, 6, 25, 7), "50.0", PAGE_NUMBER), + TextWord(fitz.Rect(20, 8, 25, 9), "50.0", PAGE_NUMBER), # layer 50.0-60.0m + TextWord(fitz.Rect(20, 8, 25, 9), "60.0", PAGE_NUMBER), ] - columns = find_a_to_b_sidebars(entries, ALL_WORDS_FIND_LAYER_DEPTH_COLUMN) + columns = AToBSidebarExtractor.find_in_words(all_words) assert len(columns) == 2, "There should be 2 columns" assert len(columns[0].entries) == 5, "The first column should have 5 entries" assert len(columns[1].entries) == 5, "The second column should have 5 entries" From f331f12d6d5afd4b4d7d9ade22361c0a8ec90045 Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Tue, 12 Nov 2024 16:35:50 +0100 Subject: [PATCH 13/20] LGVISIUM-102: create Extractor for LayerIdentifierSidebar --- src/stratigraphy/extract.py | 36 +++--- src/stratigraphy/sidebar/__init__.py | 2 + src/stratigraphy/sidebar/a_above_b_sidebar.py | 2 +- src/stratigraphy/sidebar/a_to_b_sidebar.py | 2 +- src/stratigraphy/sidebar/find_sidebars.py | 1 - .../sidebar/interval_block_group.py | 18 +++ .../sidebar/layer_identifier_sidebar.py | 111 +----------------- .../layer_identifier_sidebar_extractor.py | 86 ++++++++++++++ src/stratigraphy/sidebar/sidebar.py | 2 +- src/stratigraphy/util/interval.py | 35 ++++++ 10 files changed, 163 insertions(+), 132 deletions(-) delete mode 100644 src/stratigraphy/sidebar/find_sidebars.py create mode 100644 src/stratigraphy/sidebar/interval_block_group.py create mode 100644 src/stratigraphy/sidebar/layer_identifier_sidebar_extractor.py diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 981bbe51..469eecf2 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -12,10 +12,11 @@ ) from stratigraphy.layer.layer import IntervalBlockPair, Layer from stratigraphy.lines.line import TextLine -from stratigraphy.sidebar import AAboveBSidebarExtractor, AToBSidebarExtractor, Sidebar 
-from stratigraphy.sidebar.layer_identifier_sidebar import ( - find_layer_identifier_sidebar_entries, - find_layer_identifier_sidebars, +from stratigraphy.sidebar import ( + AAboveBSidebarExtractor, + AToBSidebarExtractor, + LayerIdentifierSidebarExtractor, + Sidebar, ) from stratigraphy.text.find_description import ( get_description_blocks, @@ -60,24 +61,21 @@ def process_page( Returns: list[dict]: All list of the text of all description blocks. """ - # Detect Layer Index Columns - layer_identifier_entries = find_layer_identifier_sidebar_entries(lines) - layer_identifier_sidebars = ( - find_layer_identifier_sidebars(layer_identifier_entries) if layer_identifier_entries else [] - ) + # Detect Layer Identifier Sidebars + + layer_identifier_sidebars = LayerIdentifierSidebarExtractor.from_lines(lines) material_descriptions_sidebar_pairs = [] - if layer_identifier_sidebars: - for layer_identifier_sidebar in layer_identifier_sidebars: - material_description_rect = find_material_description_column( - lines, layer_identifier_sidebar, language, **params["material_description"] + for layer_identifier_sidebar in layer_identifier_sidebars: + material_description_rect = find_material_description_column( + lines, layer_identifier_sidebar, language, **params["material_description"] + ) + if material_description_rect: + material_descriptions_sidebar_pairs.append( + MaterialDescriptionRectWithSidebar(layer_identifier_sidebar, material_description_rect) ) - if material_description_rect: - material_descriptions_sidebar_pairs.append( - MaterialDescriptionRectWithSidebar(layer_identifier_sidebar, material_description_rect) - ) - if material_descriptions_sidebar_pairs: - material_descriptions_sidebar_pairs.sort(key=lambda pair: pair.score_match()) + if material_descriptions_sidebar_pairs: + material_descriptions_sidebar_pairs.sort(key=lambda pair: pair.score_match()) # If there is a layer identifier sidebar, then we use this directly. # Else, we search for sidebars with depths. 
diff --git a/src/stratigraphy/sidebar/__init__.py b/src/stratigraphy/sidebar/__init__.py index 0ff3e45c..f4a9cf2b 100644 --- a/src/stratigraphy/sidebar/__init__.py +++ b/src/stratigraphy/sidebar/__init__.py @@ -6,6 +6,7 @@ from .a_to_b_sidebar import AToBSidebar from .a_to_b_sidebar_extractor import AToBSidebarExtractor from .layer_identifier_sidebar import LayerIdentifierSidebar +from .layer_identifier_sidebar_extractor import LayerIdentifierSidebarExtractor from .sidebar import Sidebar __all__ = [ @@ -16,4 +17,5 @@ "AToBSidebar", "AToBSidebarExtractor", "LayerIdentifierSidebar", + "LayerIdentifierSidebarExtractor", ] diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar.py b/src/stratigraphy/sidebar/a_above_b_sidebar.py index 90bdc2fd..1aa81582 100644 --- a/src/stratigraphy/sidebar/a_above_b_sidebar.py +++ b/src/stratigraphy/sidebar/a_above_b_sidebar.py @@ -8,12 +8,12 @@ import numpy as np from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry -from stratigraphy.layer.layer import IntervalBlockGroup from stratigraphy.lines.line import TextLine from stratigraphy.text.find_description import get_description_blocks from stratigraphy.util.dataclasses import Line from stratigraphy.util.interval import AAboveBInterval +from .interval_block_group import IntervalBlockGroup from .sidebar import Sidebar diff --git a/src/stratigraphy/sidebar/a_to_b_sidebar.py b/src/stratigraphy/sidebar/a_to_b_sidebar.py index 1e1812da..d1e1fe19 100644 --- a/src/stratigraphy/sidebar/a_to_b_sidebar.py +++ b/src/stratigraphy/sidebar/a_to_b_sidebar.py @@ -7,11 +7,11 @@ import fitz from stratigraphy.depthcolumn.depthcolumnentry import AToBDepthColumnEntry -from stratigraphy.layer.layer import IntervalBlockGroup from stratigraphy.lines.line import TextLine from stratigraphy.util.dataclasses import Line from stratigraphy.util.interval import AToBInterval +from .interval_block_group import IntervalBlockGroup from .sidebar import Sidebar diff --git 
a/src/stratigraphy/sidebar/find_sidebars.py b/src/stratigraphy/sidebar/find_sidebars.py deleted file mode 100644 index ce2d4599..00000000 --- a/src/stratigraphy/sidebar/find_sidebars.py +++ /dev/null @@ -1 +0,0 @@ -"""This module contains functionalities to find sidebars in a pdf page.""" diff --git a/src/stratigraphy/sidebar/interval_block_group.py b/src/stratigraphy/sidebar/interval_block_group.py new file mode 100644 index 00000000..6911519c --- /dev/null +++ b/src/stratigraphy/sidebar/interval_block_group.py @@ -0,0 +1,18 @@ +"""Module that contains a helper class for associating depth intervals and text blocks.""" + +from dataclasses import dataclass + +from stratigraphy.text.textblock import TextBlock +from stratigraphy.util.interval import Interval + + +@dataclass +class IntervalBlockGroup: + """Helper class to represent a group of depth intervals and an associated group of text blocks. + + The class is used to simplify the code for obtaining an appropriate one-to-one correspondence between depth + intervals and material descriptions. 
+ """ + + depth_intervals: list[Interval] + blocks: list[TextBlock] diff --git a/src/stratigraphy/sidebar/layer_identifier_sidebar.py b/src/stratigraphy/sidebar/layer_identifier_sidebar.py index 6553b382..40dde80e 100644 --- a/src/stratigraphy/sidebar/layer_identifier_sidebar.py +++ b/src/stratigraphy/sidebar/layer_identifier_sidebar.py @@ -1,17 +1,15 @@ """Module for the layer identifier sidebars.""" -import re from dataclasses import dataclass import fitz -from stratigraphy.layer.layer import IntervalBlockGroup from stratigraphy.lines.line import TextLine from stratigraphy.text.textblock import TextBlock from stratigraphy.util.dataclasses import Line -from ..depthcolumn.depthcolumnentry import AToBDepthColumnEntry from ..util.interval import AToBInterval +from .interval_block_group import IntervalBlockGroup from .sidebar import Sidebar @@ -72,7 +70,7 @@ def identify_groups( result = [] for block in blocks: depth_intervals = [] - depth_interval = get_depth_interval_from_textblock(block) + depth_interval = AToBInterval.get_depth_interval_from_textblock(block) if depth_interval: depth_intervals.append(depth_interval) result.append(IntervalBlockGroup(depth_intervals=depth_intervals, blocks=[block])) @@ -140,108 +138,3 @@ def is_contained(self, rect: fitz.Rect) -> bool: and rect.y0 <= self.rect().y0 and self.rect().y1 <= rect.y1 ) - - -def find_layer_identifier_sidebar_entries(lines: list[TextLine]) -> list[LayerIdentifierEntry]: - r"""Find the layer identifier sidebar entries. - - Regex explanation: - - \b is a word boundary. This ensures that the match must start at the beginning of a word. - - [\da-z]+ matches one or more (+) alphanumeric characters (\d for digits and a-z for lowercase letters). - - \) matches a closing parenthesis. The backslash is necessary because parentheses are special characters - in regular expressions, so we need to escape it to match a literal parenthesis. 
- This regular expression will match strings like "1)", "2)", "a)", "b)", "1a4)", "6de)", etc. - - Args: - lines (list[TextLine]): The lines to search for layer identifier entries. - - Returns: - list[LayerIdentifierEntry]: The layer identifier sidebar entries. - """ - entries = [] - for line in sorted(lines, key=lambda line: line.rect.y0): - if len(line.words) > 0: - # Only match in the first word of every line, to avoid e.g. matching with "cm)" in a material description - # containing an expression like "(diameter max 6 cm)". - first_word = line.words[0] - regex = re.compile(r"\b[\da-z-]+\)") - match = regex.match(first_word.text) - if match and len(first_word.text) < 7: - entries.append(LayerIdentifierEntry(first_word.rect, first_word.text)) - return entries - - -def find_layer_identifier_sidebars(entries: list[LayerIdentifierEntry]) -> list[LayerIdentifierSidebar]: - """Find the layer identifier column given the index column entries. - - Note: Similar to find_depth_columns.find_depth_columns. Refactoring may be desired. - - Args: - entries (list[LayerIdentifierEntry]): The layer identifier column entries. - - Returns: - list[LayerIdentifierSidebar]: The found layer identifier sidebar. - """ - layer_identifier_sidebars = [LayerIdentifierSidebar([entries[0]])] - for entry in entries[1:]: - has_match = False - for column in layer_identifier_sidebars: - if column.can_be_appended(entry.rect): - column.entries.append(entry) - has_match = True - if not has_match: - layer_identifier_sidebars.append(LayerIdentifierSidebar([entry])) - - # only keep columns whose entries are not fully contained in a different column - layer_identifier_sidebars = [ - column - for column in layer_identifier_sidebars - if all(not other.strictly_contains(column) for other in layer_identifier_sidebars) - ] - # check if the column rect is a subset of another column rect. If so, merge the entries and sort them by y0. 
- for column in layer_identifier_sidebars: - for other in layer_identifier_sidebars: - if column != other and column.is_contained(other.rect()): - for entry in other.entries: - if entry not in column.entries: - column.entries.append(entry) - column.entries.sort(key=lambda entry: entry.rect.y0) - layer_identifier_sidebars.remove(other) - break - layer_identifier_sidebars = [column for column in layer_identifier_sidebars if len(column.entries) > 2] - return layer_identifier_sidebars - - -def get_depth_interval_from_textblock(block: TextBlock) -> AToBInterval | None: - """Extract depth interval from a material description block. - - For borehole profiles in the Deriaz layout, the depth interval is usually found in the text description - of the material. Often, these text descriptions contain a further separation into multiple sub layers. - These sub layers have their own depth intervals. This function extracts the overall depth interval, - spanning across all mentioned sub layers. - - Args: - block (TextBlock): The block to calculate the depth interval for. - - Returns: - AToBInterval | None: The depth interval. - """ - depth_entries = [] - for line in block.lines: - try: - layer_depth_entry = AToBDepthColumnEntry.from_text(line.text, line.rect, require_start_of_string=False) - # require_start_of_string = False because the depth interval may not always start at the beginning - # of the line e.g. "Remblais Heterogene: 0.00 - 0.5m" - if layer_depth_entry: - depth_entries.append(layer_depth_entry) - except ValueError: - pass - - if depth_entries: - # Merge the sub layers into one depth interval. 
- start = min([entry.start for entry in depth_entries], key=lambda start_entry: start_entry.value) - end = max([entry.end for entry in depth_entries], key=lambda end_entry: end_entry.value) - - return AToBInterval(AToBDepthColumnEntry(start, end)) - else: - return None diff --git a/src/stratigraphy/sidebar/layer_identifier_sidebar_extractor.py b/src/stratigraphy/sidebar/layer_identifier_sidebar_extractor.py new file mode 100644 index 00000000..d880b46c --- /dev/null +++ b/src/stratigraphy/sidebar/layer_identifier_sidebar_extractor.py @@ -0,0 +1,86 @@ +"""Module for finding LayerIdentifierSidebar instances in a borehole profile.""" + +import re + +from stratigraphy.lines.line import TextLine +from stratigraphy.sidebar import LayerIdentifierSidebar +from stratigraphy.sidebar.layer_identifier_sidebar import LayerIdentifierEntry + + +class LayerIdentifierSidebarExtractor: + """Class that finds LayerIdentifierSidebar instances in a borehole profile.""" + + @classmethod + def find_layer_identifier_sidebar_entries(cls, lines: list[TextLine]) -> list[LayerIdentifierEntry]: + r"""Find the layer identifier sidebar entries. + + Regex explanation: + - \b is a word boundary. This ensures that the match must start at the beginning of a word. + - [\da-z]+ matches one or more (+) alphanumeric characters (\d for digits and a-z for lowercase letters). + - \) matches a closing parenthesis. The backslash is necessary because parentheses are special characters + in regular expressions, so we need to escape it to match a literal parenthesis. + This regular expression will match strings like "1)", "2)", "a)", "b)", "1a4)", "6de)", etc. + + Args: + lines (list[TextLine]): The lines to search for layer identifier entries. + + Returns: + list[LayerIdentifierEntry]: The layer identifier sidebar entries. + """ + entries = [] + for line in sorted(lines, key=lambda line: line.rect.y0): + if len(line.words) > 0: + # Only match in the first word of every line, to avoid e.g. 
matching with "cm)" in a material + # description containing an expression like "(diameter max 6 cm)". + first_word = line.words[0] + regex = re.compile(r"\b[\da-z-]+\)") + match = regex.match(first_word.text) + if match and len(first_word.text) < 7: + entries.append(LayerIdentifierEntry(first_word.rect, first_word.text)) + return entries + + @classmethod + def from_lines(cls, lines: list[TextLine]) -> list[LayerIdentifierSidebar]: + """Find layer identifier sidebars from text lines. + + TODO: Similar to AToBSidebarExtractor.find_in_words(). Refactoring may be desired. + + Args: + lines (list[TextLine]): The text lines in the document + + Returns: + list[LayerIdentifierSidebar]: The found layer identifier sidebar. + """ + entries = cls.find_layer_identifier_sidebar_entries(lines) + if not entries: + return [] + + layer_identifier_sidebars = [LayerIdentifierSidebar([entries[0]])] + for entry in entries[1:]: + has_match = False + for column in layer_identifier_sidebars: + if column.can_be_appended(entry.rect): + column.entries.append(entry) + has_match = True + if not has_match: + layer_identifier_sidebars.append(LayerIdentifierSidebar([entry])) + + # only keep columns whose entries are not fully contained in a different column + layer_identifier_sidebars = [ + column + for column in layer_identifier_sidebars + if all(not other.strictly_contains(column) for other in layer_identifier_sidebars) + ] + # check if the column rect is a subset of another column rect. If so, merge the entries and sort them by + # y0. 
+ for column in layer_identifier_sidebars: + for other in layer_identifier_sidebars: + if column != other and column.is_contained(other.rect()): + for entry in other.entries: + if entry not in column.entries: + column.entries.append(entry) + column.entries.sort(key=lambda entry: entry.rect.y0) + layer_identifier_sidebars.remove(other) + break + layer_identifier_sidebars = [column for column in layer_identifier_sidebars if len(column.entries) > 2] + return layer_identifier_sidebars diff --git a/src/stratigraphy/sidebar/sidebar.py b/src/stratigraphy/sidebar/sidebar.py index 046d9094..b53781a4 100644 --- a/src/stratigraphy/sidebar/sidebar.py +++ b/src/stratigraphy/sidebar/sidebar.py @@ -9,8 +9,8 @@ import fitz from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry -from stratigraphy.layer.layer import IntervalBlockGroup from stratigraphy.lines.line import TextLine, TextWord +from stratigraphy.sidebar.interval_block_group import IntervalBlockGroup from stratigraphy.util.dataclasses import Line EntryT = TypeVar("EntryT", bound=DepthColumnEntry) diff --git a/src/stratigraphy/util/interval.py b/src/stratigraphy/util/interval.py index bd321d42..00a8d6ff 100644 --- a/src/stratigraphy/util/interval.py +++ b/src/stratigraphy/util/interval.py @@ -177,3 +177,38 @@ def matching_blocks( return [TextBlock(matched_lines)] else: return [] + + @classmethod + def get_depth_interval_from_textblock(cls, block: TextBlock) -> AToBInterval | None: + """Extract depth interval from a material description block. + + For borehole profiles in the Deriaz layout, the depth interval is usually found in the text description + of the material. Often, these text descriptions contain a further separation into multiple sub layers. + These sub layers have their own depth intervals. This function extracts the overall depth interval, + spanning across all mentioned sub layers. + + Args: + block (TextBlock): The block to calculate the depth interval for. 
+ + Returns: + AToBInterval | None: The depth interval. + """ + depth_entries = [] + for line in block.lines: + try: + layer_depth_entry = AToBDepthColumnEntry.from_text(line.text, line.rect, require_start_of_string=False) + # require_start_of_string = False because the depth interval may not always start at the beginning + # of the line e.g. "Remblais Heterogene: 0.00 - 0.5m" + if layer_depth_entry: + depth_entries.append(layer_depth_entry) + except ValueError: + pass + + if depth_entries: + # Merge the sub layers into one depth interval. + start = min([entry.start for entry in depth_entries], key=lambda start_entry: start_entry.value) + end = max([entry.end for entry in depth_entries], key=lambda end_entry: end_entry.value) + + return AToBInterval(AToBDepthColumnEntry(start, end)) + else: + return None From 04cc5ef4ba7c033a51a3abd1ed50cc7cefdfc7d3 Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Tue, 12 Nov 2024 17:03:39 +0100 Subject: [PATCH 14/20] LGVISIUM-102: docstrings + cleanup --- src/stratigraphy/sidebar/a_above_b_sidebar.py | 18 +++++++++--------- .../sidebar/a_above_b_sidebar_extractor.py | 12 ++++++------ .../sidebar/a_above_b_sidebar_validator.py | 2 +- .../sidebar/layer_identifier_sidebar.py | 2 +- src/stratigraphy/util/interval.py | 14 +++++++------- 5 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar.py b/src/stratigraphy/sidebar/a_above_b_sidebar.py index 1aa81582..7a13ad52 100644 --- a/src/stratigraphy/sidebar/a_above_b_sidebar.py +++ b/src/stratigraphy/sidebar/a_above_b_sidebar.py @@ -162,16 +162,16 @@ def identify_groups( Returns: list[IntervalBlockGroup]: A list of groups, where each group is a IntervalBlockGroup. - Example: + Example return value: [ - { - "depth_intervals": [BoundaryInterval(None, 0.1), BoundaryInterval(0.1, 0.3), ...], - "blocks": [DescriptionBlock(...), DescriptionBlock(...), ...] 
- }, - { - "depth_intervals": [BoundaryInterval(0.3, 0.7)], - "blocks": [DescriptionBlock(...), DescriptionBlock(...), ...] - }, + IntervalBlockGroup( + depth_intervals=[AAboveBInterval(None, 0.1), AAboveBInterval(0.1, 0.3), ...], + blocks=[TextBlock(...), TextBlock(...), ...] + ), + IntervalBlockGroup( + depth_intervals=[AAboveBInterval(0.3, 0.7)], + blocks=[TextBlock(...), TextBlock(...), ...] + ), ... ] """ diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py b/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py index 68245963..a8391e02 100644 --- a/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py +++ b/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py @@ -15,15 +15,15 @@ class AAboveBSidebarExtractor: def find_in_words( all_words: list[TextWord], used_entry_rects: list[fitz.Rect], sidebar_params: dict ) -> list[AAboveBSidebar]: - """Construct all possible AAboveBSidebar objects from the given DepthColumnEntry objects. + """Construct all possible AAboveBSidebar objects from the given words. Args: all_words (list[TextLine]): All words in the page. used_entry_rects (list[fitz.Rect]): Part of the document to ignore. - sidebar_params (dict): Parameters for the BoundaryDepthColumn objects. + sidebar_params (dict): Parameters for the AAboveBSidebar objects. Returns: - list[AAboveBSidebar]: Found BoundaryDepthColumn objects. + list[AAboveBSidebar]: Found AAboveBSidebar objects. 
""" entries = [ entry @@ -57,10 +57,10 @@ def find_in_words( if all(not other.strictly_contains(column) for other in numeric_columns) ] - boundary_depth_column_validator = AAboveBSidebarValidator(all_words, **sidebar_params) + sidebar_validator = AAboveBSidebarValidator(all_words, **sidebar_params) numeric_columns = [ - boundary_depth_column_validator.reduce_until_valid(column) + sidebar_validator.reduce_until_valid(column) for numeric_column in numeric_columns for column in numeric_column.break_on_double_descending() # when we have a perfect arithmetic progression, this is usually just a scale @@ -69,6 +69,6 @@ def find_in_words( ] return sorted( - [column for column in numeric_columns if column and boundary_depth_column_validator.is_valid(column)], + [column for column in numeric_columns if column and sidebar_validator.is_valid(column)], key=lambda column: len(column.entries), ) diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py b/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py index 0dfbfe9a..3dbd012b 100644 --- a/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py +++ b/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py @@ -104,7 +104,7 @@ def correct_OCR_mistakes(self, sidebar: AAboveBSidebar) -> AAboveBSidebar | None sidebar (AAboveBSidebar): The AAboveBSidebar to validate Returns: - BoundaryDepthColumn | None: The corrected depth column, or None if no correction was possible. + AAboveBSidebar | None: The corrected sidebar, or None if no correction was possible. 
""" new_columns = [AAboveBSidebar(entries=[])] for entry in sidebar.entries: diff --git a/src/stratigraphy/sidebar/layer_identifier_sidebar.py b/src/stratigraphy/sidebar/layer_identifier_sidebar.py index 40dde80e..e2467cf2 100644 --- a/src/stratigraphy/sidebar/layer_identifier_sidebar.py +++ b/src/stratigraphy/sidebar/layer_identifier_sidebar.py @@ -70,7 +70,7 @@ def identify_groups( result = [] for block in blocks: depth_intervals = [] - depth_interval = AToBInterval.get_depth_interval_from_textblock(block) + depth_interval = AToBInterval.get_depth_interval_from_lines(block.lines) if depth_interval: depth_intervals.append(depth_interval) result.append(IntervalBlockGroup(depth_intervals=depth_intervals, blocks=[block])) diff --git a/src/stratigraphy/util/interval.py b/src/stratigraphy/util/interval.py index 00a8d6ff..479e8ffb 100644 --- a/src/stratigraphy/util/interval.py +++ b/src/stratigraphy/util/interval.py @@ -179,22 +179,22 @@ def matching_blocks( return [] @classmethod - def get_depth_interval_from_textblock(cls, block: TextBlock) -> AToBInterval | None: - """Extract depth interval from a material description block. + def get_depth_interval_from_lines(cls, lines: list[TextLine]) -> AToBInterval | None: + """Extract depth interval from text lines. - For borehole profiles in the Deriaz layout, the depth interval is usually found in the text description - of the material. Often, these text descriptions contain a further separation into multiple sub layers. + For borehole profiles in the Deriaz layout, the depth interval is usually found in the text of the material + description. Often, these text descriptions contain a further separation into multiple sub layers. These sub layers have their own depth intervals. This function extracts the overall depth interval, spanning across all mentioned sub layers. Args: - block (TextBlock): The block to calculate the depth interval for. + lines (list[TextLine]): The lines to extract the depth interval from. 
Returns: - AToBInterval | None: The depth interval. + AToBInterval | None: The depth interval (if any) or None (if no depth interval was found). """ depth_entries = [] - for line in block.lines: + for line in lines: try: layer_depth_entry = AToBDepthColumnEntry.from_text(line.text, line.rect, require_start_of_string=False) # require_start_of_string = False because the depth interval may not always start at the beginning From 4223b968b6a0591bb0f1efe7093137059bfdb794 Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Tue, 12 Nov 2024 17:44:52 +0100 Subject: [PATCH 15/20] LGVISIUM-102: add TODOs --- .../depthcolumn/depthcolumnentry.py | 29 +++++++++---------- src/stratigraphy/extract.py | 1 + 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/stratigraphy/depthcolumn/depthcolumnentry.py b/src/stratigraphy/depthcolumn/depthcolumnentry.py index ac469bb0..f20200f8 100644 --- a/src/stratigraphy/depthcolumn/depthcolumnentry.py +++ b/src/stratigraphy/depthcolumn/depthcolumnentry.py @@ -3,18 +3,19 @@ from __future__ import annotations import re +from dataclasses import dataclass from typing import Any import fitz from stratigraphy.lines.line import TextWord +@dataclass class DepthColumnEntry: # noqa: D101 """Class to represent a depth column entry.""" - def __init__(self, rect: fitz.Rect, value: float): - self.rect = rect - self.value = value + rect: fitz.Rect + value: float def __repr__(self) -> str: return str(self.value) @@ -61,10 +62,10 @@ def find_in_words(cls, all_words: list[TextWord], include_splits: bool) -> list[ entries.append(DepthColumnEntry(word.rect, value)) elif include_splits: # support for e.g. 
"1.10-1.60m" extracted as a single word - layer_depth_column_entry = AToBDepthColumnEntry.from_text(input_string, word.rect) + a_to_b_depth_column_entry = AToBDepthColumnEntry.from_text(input_string, word.rect) entries.extend( - [layer_depth_column_entry.start, layer_depth_column_entry.end] - if layer_depth_column_entry + [a_to_b_depth_column_entry.start, a_to_b_depth_column_entry.end] + if a_to_b_depth_column_entry else [] ) except ValueError: @@ -72,12 +73,14 @@ def find_in_words(cls, all_words: list[TextWord], include_splits: bool) -> list[ return entries +@dataclass class AToBDepthColumnEntry: # noqa: D101 - """Class to represent a layer depth column entry.""" + """Class to represent a depth column entry of the form "1m - 3m".""" - def __init__(self, start: DepthColumnEntry, end: DepthColumnEntry): - self.start = start - self.end = end + # TODO do we need both this class as well as AToBInterval, or can we combine the two classes? + + start: DepthColumnEntry + end: DepthColumnEntry def __repr__(self) -> str: return f"{self.start.value}-{self.end.value}" @@ -89,11 +92,7 @@ def rect(self) -> fitz.Rect: def to_json(self) -> dict[str, Any]: """Convert the layer depth column entry to a JSON serializable format.""" - return { - "start": self.start.to_json(), - "end": self.end.to_json(), - "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1], - } + return {"start": self.start.to_json(), "end": self.end.to_json()} @classmethod def from_json(cls, data: dict) -> AToBDepthColumnEntry: diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 469eecf2..d8beeeaa 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -170,6 +170,7 @@ def process_page( rect=pair.block.rect, page=page_number, ), + # TODO don't automatically convert any interval to an AAboveBInterval depth_interval=AAboveBInterval(start=pair.depth_interval.start, end=pair.depth_interval.end) if pair.depth_interval else None, From 
107c5dd03e0df6a98d8542f6bd22bdea5e4861f0 Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Tue, 19 Nov 2024 11:31:08 +0100 Subject: [PATCH 16/20] LGVISIUM-102: update predictions.json docs --- README.md | 100 +------------------------- README.predictions-json.md | 143 +++++++++++++++++++++++++++++++++++++ 2 files changed, 146 insertions(+), 97 deletions(-) create mode 100644 README.predictions-json.md diff --git a/README.md b/README.md index b91a247f..cc879bdd 100644 --- a/README.md +++ b/README.md @@ -124,103 +124,9 @@ Use `boreholes-extract-all --help` to see all options for the extraction script. 4. **Check the results** -Once the script has finished running, you can check the results in the `data/output/draw` directory. The result is a `predictions.json` file as well as a png file for each page of each PDF in the specified input directory. - -### Output Structure -The `predictions.json` file contains the results of a data extraction process from PDF files. Each key in the JSON object is the name of a PDF file, and the value is a list of extracted items in a dictionary like object. The extracted items for now are the material descriptions in their correct order (given by their depths). 
- -Example: predictions.json -```json -{ - "685256002-bp.pdf": { # file name - "language": "de", - "metadata": { - "coordinates": null - }, - "layers": [ # a layer corresponds to a material layer in the borehole profile - { - "material_description": { # all information about the complete description of the material of the layer - "text": "grauer, siltig-sandiger Kies (Auffullung)", - "rect": [ - 232.78799438476562, - 130.18496704101562, - 525.6640014648438, - 153.54295349121094 - ], - "lines": [ - { - "text": "grauer, siltig-sandiger Kies (Auffullung)", - "rect": [ - 232.78799438476562, - 130.18496704101562, - 525.6640014648438, - 153.54295349121094 - ], - "page": 1 - } - ], - "page": 1 - }, - "depth_interval": { # information about the depth of the layer - "start": null, - "end": { - "value": 0.4, - "rect": [ - 125.25399780273438, - 140.2349853515625, - 146.10398864746094, - 160.84498596191406 - ], - "page": 1 - } - } - }, - ... - ], - "depths_materials_column_pairs": [ # information about where on the pdf the information for material description as well as depths are taken. - { - "depth_column": { - "rect": [ - 119.05999755859375, - 140.2349853515625, - 146.8470001220703, - 1014.4009399414062 - ], - "entries": [ - { - "value": 0.4, - "rect": [ - 125.25399780273438, - 140.2349853515625, - 146.10398864746094, - 160.84498596191406 - ], - "page": 1 - }, - { - "value": 0.6, - "rect": [ - 125.21800231933594, - 153.8349609375, - 146.0679931640625, - 174.44496154785156 - ], - "page": 1 - }, - ... - ] - } - } - ], - "page_dimensions": [ - { - "height": 1192.0999755859375, - "width": 842.1500244140625 - } - ] - }, -} -``` +The script produces output in two different formats: +- A file `data/output/predictions.json` that contains all extracted data in a machine-readable format. The structure of this file is documented in [README.predictions-json.md](README.predictions-json.md). 
+- A PNG image of each processed PDF page in the `data/output/draw` directory, where the extracted data is highlighted. # Developer Guidance ## Project Structure diff --git a/README.predictions-json.md b/README.predictions-json.md new file mode 100644 index 00000000..66a7ae14 --- /dev/null +++ b/README.predictions-json.md @@ -0,0 +1,143 @@ +# `predictions.json` output Structure +The `predictions.json` file contains the results of a data extraction process in a machine-readable format. By default, the file is written to `data/output/predictions.json`. + +Each key in the JSON object is the name of a PDF file. The extracted data is listed as an object with the following keys: +- `metadata` + - `elevation`: the detected elevation (if any) and the location in the PDF where they were extraction from. + - `coordinates`: the detected coordinates (if any) and the location in the PDF where they were extraction from. + - `language`: language that was detected for the document. + - `page_dimensions`: dimensions of each page in the PDF, measured in PDF points +- `layers`: a list of objects, where each object represents a layer of the borehole profile, using the following keys: + - `material_description`: the text of the material description, both as a single value as well as line-by-line, and the location in the PDF where the text resp. the lines where extracted from. + - `depth_interval`: the measured depth of the upper and lower limits of the layer, and the location in the PDF where they were extracted from. + - `id`: a unique identifier. +- `bounding_boxes`: a list of objects, one for each (part of a) borehole profile in the PDF, that list some bounding boxes that can be used for visualizations. Each object has the following keys: + - `sidebar_rect`: the area of the page the contains a "sidebar" (if any), which contains depths or other data displayed to the side of material descriptions. 
+ - `depth_column_entries`: list of locations of the entries in the depth column (if any). + - `material_description_rect`: the area of the page that contains all material descriptions. + - `page`: the number of the page of the PDF. +- `page_dimensions`: dimensions of each page in the PDF, measured in PDF points (repeated) +- `groundwater`: a list of objects, one for each groundwater measurement that was extracted from the PDF. Each object has the following keys. + - `date`: extracted date for the groundwater measurement (if any) as a string in YYYY-MM-DD format. + - `depth`: the measured depth (in m) of the groundwater measurement. + - `elevation`: the elevation (in m above sea level) of the groundwater measurement. + - `page` and `rect`: the location in the PDF where the groundwater measurement was extracted from. +- `file_name`: name of the file (repeated) + +All page numbers are counted starting at 1. + +All bounding boxes are measured with PDF points as the unit, and with the top-left of the page as the origin. 
+ +## Example output +```yaml +{ + "B366.pdf": { # file name + "metadata": { + "elevation": { + "elevation": 355.35, + "page": 1, + "rect": [27.49843978881836, 150.2817840576172, 159.42971801757812, 160.76754760742188] + }, + "coordinates": { + "E": 659490.0, + "N": 257200.0, + "rect": [28.263830184936523, 179.63882446289062, 150.3379364013672, 188.7487335205078], + "page": 1 + }, + "language": "de", + "page_dimensions": [ + { + "width": 591.956787109375, + "height": 1030.426025390625 + }, + { + "width": 588.009521484375, + "height": 792.114990234375 + } + ] + }, + "layers": [ + { + "material_description": { + "text": "beiger, massig-dichter, stark dolomitisierter Kalk, mit Muschelresten", + "lines": [ + { + "text": "beiger, massig-dichter, stark", + "page": 1, + "rect": [258.5303039550781, 345.9997253417969, 379.9410705566406, 356.1011657714844] + }, + { + "text": "dolomitisierter Kalk, mit", + "page": 1, + "rect": [258.2362060546875, 354.4559326171875, 363.0706787109375, 364.295654296875] + }, + { + "text": "Muschelresten", + "page": 1, + "rect": [258.48748779296875, 363.6712341308594, 313.03204345703125, 371.3343505859375] + } + ], + "page": 1, + "rect": [258.2362060546875, 345.9997253417969, 379.9410705566406, 371.3343505859375] + }, + "depth_interval": { + "start": { + "value": 1.5, + "rect": [200.63790893554688, 331.3035888671875, 207.83108520507812, 338.30450439453125] + }, + "end": { + "value": 6.0, + "rect": [201.62551879882812, 374.30560302734375, 210.0361328125, 380.828857421875] + } + }, + "id": "2b841b12-1f8d-4845-a873-0916b2a09420" + }, + # ... 
(more layers) + ], + "bounding_boxes": [ + { + "sidebar_rect": [198.11251831054688, 321.8956298828125, 210.75906372070312, 702.2628173828125], + "depth_column_entries": [ + [200.1201171875, 321.8956298828125, 208.59901428222656, 328.6802062988281], + [200.63790893554688, 331.3035888671875, 207.83108520507812, 338.30450439453125], + [201.62551879882812, 374.30560302734375, 210.0361328125, 380.828857421875], + [199.86251831054688, 434.51556396484375, 210.10894775390625, 441.4538879394531], + [198.11251831054688, 557.5472412109375, 210.35877990722656, 563.9244995117188], + [198.28451538085938, 582.0216674804688, 209.76953125, 588.7603759765625], + [198.7814178466797, 616.177001953125, 209.50042724609375, 622.502197265625], + [198.6378173828125, 663.2830810546875, 210.75906372070312, 669.5428466796875], + [198.26901245117188, 695.974609375, 209.12693786621094, 702.2628173828125] + ], + "material_description_rect": [256.777099609375, 345.9997253417969, 392.46051025390625, 728.2700805664062], + "page": 1 + }, + { + "sidebar_rect": null, + "depth_column_entries": [], + "material_description_rect": [192.3216094970703, 337.677978515625, 291.1827392578125, 633.6331176757812], + "page": 2 + } + ], + "page_dimensions": [ + [ + 591.956787109375, + 1030.426025390625 + ], + [ + 588.009521484375, + 792.114990234375 + ] + ], + "groundwater": [ + { + "date": "1979-11-29", + "depth": 19.28, + "elevation": 336.07, + "page": 1, + "rect": [61.23963928222656, 489.3185119628906, 94.0096435546875, 513.6478881835938] + } + ], + "file_name": "B366.pdf" + } +} +``` \ No newline at end of file From 78ef44f56a6376e796b96a388d9a1fa5534dba11 Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Tue, 19 Nov 2024 11:42:28 +0100 Subject: [PATCH 17/20] LGVISIUM-102: remove redundant file_name from predictions.json --- README.predictions-json.md | 4 +--- src/stratigraphy/util/predictions.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/README.predictions-json.md 
b/README.predictions-json.md index 66a7ae14..004f73bb 100644 --- a/README.predictions-json.md +++ b/README.predictions-json.md @@ -22,7 +22,6 @@ Each key in the JSON object is the name of a PDF file. The extracted data is lis - `depth`: the measured depth (in m) of the groundwater measurement. - `elevation`: the elevation (in m above sea level) of the groundwater measurement. - `page` and `rect`: the location in the PDF where the groundwater measurement was extracted from. -- `file_name`: name of the file (repeated) All page numbers are counted starting at 1. @@ -136,8 +135,7 @@ All bounding boxes are measured with PDF points as the unit, and with the top-le "page": 1, "rect": [61.23963928222656, 489.3185119628906, 94.0096435546875, 513.6478881835938] } - ], - "file_name": "B366.pdf" + ] } } ``` \ No newline at end of file diff --git a/src/stratigraphy/util/predictions.py b/src/stratigraphy/util/predictions.py index 37414381..7c7d7170 100644 --- a/src/stratigraphy/util/predictions.py +++ b/src/stratigraphy/util/predictions.py @@ -46,7 +46,6 @@ def to_json(self) -> dict: "bounding_boxes": [bboxes.to_json() for bboxes in self.bounding_boxes], "page_dimensions": self.metadata.page_dimensions, # TODO: Remove, already in metadata "groundwater": self.groundwater.to_json() if self.groundwater is not None else [], - "file_name": self.file_name, } From 67e85e03faa75839a844980ba87df3c0dd6960fc Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Tue, 19 Nov 2024 12:04:03 +0100 Subject: [PATCH 18/20] LGVISIUM-102: remove redundant page_dimensions from predictions.json --- README.predictions-json.md | 11 ----------- src/stratigraphy/util/predictions.py | 1 - tests/test_predictions.py | 2 +- 3 files changed, 1 insertion(+), 13 deletions(-) diff --git a/README.predictions-json.md b/README.predictions-json.md index 004f73bb..a72bb204 100644 --- a/README.predictions-json.md +++ b/README.predictions-json.md @@ -16,7 +16,6 @@ Each key in the JSON object is the name of a PDF file. 
The extracted data is lis - `depth_column_entries`: list of locations of the entries in the depth column (if any). - `material_description_rect`: the area of the page that contains all material descriptions. - `page`: the number of the page of the PDF. -- `page_dimensions`: dimensions of each page in the PDF, measured in PDF points (repeated) - `groundwater`: a list of objects, one for each groundwater measurement that was extracted from the PDF. Each object has the following keys. - `date`: extracted date for the groundwater measurement (if any) as a string in YYYY-MM-DD format. - `depth`: the measured depth (in m) of the groundwater measurement. @@ -117,16 +116,6 @@ All bounding boxes are measured with PDF points as the unit, and with the top-le "page": 2 } ], - "page_dimensions": [ - [ - 591.956787109375, - 1030.426025390625 - ], - [ - 588.009521484375, - 792.114990234375 - ] - ], "groundwater": [ { "date": "1979-11-29", diff --git a/src/stratigraphy/util/predictions.py b/src/stratigraphy/util/predictions.py index 7c7d7170..51e53f5f 100644 --- a/src/stratigraphy/util/predictions.py +++ b/src/stratigraphy/util/predictions.py @@ -44,7 +44,6 @@ def to_json(self) -> dict: "metadata": self.metadata.to_json(), "layers": [layer.to_json() for layer in self.layers_in_document.layers], "bounding_boxes": [bboxes.to_json() for bboxes in self.bounding_boxes], - "page_dimensions": self.metadata.page_dimensions, # TODO: Remove, already in metadata "groundwater": self.groundwater.to_json() if self.groundwater is not None else [], } diff --git a/tests/test_predictions.py b/tests/test_predictions.py index 0fcc99d2..2aa96a21 100644 --- a/tests/test_predictions.py +++ b/tests/test_predictions.py @@ -58,9 +58,9 @@ def test_to_json(sample_file_prediction: FilePredictions): result = sample_file_prediction.to_json() assert isinstance(result, dict) - assert result["file_name"] == "test_file" assert len(result["layers"]) == 2 assert result["metadata"]["coordinates"]["E"] == 2789456 + 
assert result["metadata"]["language"] == "en" def test_overall_file_predictions(): From 3b8f8a8a905badea93aadc39849dfa06b1665f47 Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Tue, 19 Nov 2024 12:10:33 +0100 Subject: [PATCH 19/20] LGVISIUM-102: remove redundant layer.id from predictions.json --- README.predictions-json.md | 4 +--- src/stratigraphy/layer/layer.py | 5 +---- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/README.predictions-json.md b/README.predictions-json.md index a72bb204..375151e4 100644 --- a/README.predictions-json.md +++ b/README.predictions-json.md @@ -10,7 +10,6 @@ Each key in the JSON object is the name of a PDF file. The extracted data is lis - `layers`: a list of objects, where each object represents a layer of the borehole profile, using the following keys: - `material_description`: the text of the material description, both as a single value as well as line-by-line, and the location in the PDF where the text resp. the lines where extracted from. - `depth_interval`: the measured depth of the upper and lower limits of the layer, and the location in the PDF where they were extracted from. - - `id`: a unique identifier. - `bounding_boxes`: a list of objects, one for each (part of a) borehole profile in the PDF, that list some bounding boxes that can be used for visualizations. Each object has the following keys: - `sidebar_rect`: the area of the page the contains a "sidebar" (if any), which contains depths or other data displayed to the side of material descriptions. - `depth_column_entries`: list of locations of the entries in the depth column (if any). @@ -87,8 +86,7 @@ All bounding boxes are measured with PDF points as the unit, and with the top-le "value": 6.0, "rect": [201.62551879882812, 374.30560302734375, 210.0361328125, 380.828857421875] } - }, - "id": "2b841b12-1f8d-4845-a873-0916b2a09420" + } }, # ... 
(more layers) ], diff --git a/src/stratigraphy/layer/layer.py b/src/stratigraphy/layer/layer.py index bcb445a4..a653d31f 100644 --- a/src/stratigraphy/layer/layer.py +++ b/src/stratigraphy/layer/layer.py @@ -1,7 +1,6 @@ """Layer class definition.""" -import uuid -from dataclasses import dataclass, field +from dataclasses import dataclass import fitz from stratigraphy.data_extractor.data_extractor import ExtractedFeature, FeatureOnPage @@ -17,7 +16,6 @@ class Layer(ExtractedFeature): material_description: FeatureOnPage[MaterialDescription] depth_interval: AAboveBInterval | None - id: uuid.UUID = field(default_factory=uuid.uuid4) def __str__(self) -> str: """Converts the object to a string. @@ -39,7 +37,6 @@ def to_json(self) -> dict: return { "material_description": self.material_description.to_json() if self.material_description else None, "depth_interval": self.depth_interval.to_json() if self.depth_interval else None, - "id": str(self.id), } @classmethod From 474111a1bd89e8a3a18c9f6aa020dd85d4124fe1 Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Thu, 21 Nov 2024 17:16:39 +0100 Subject: [PATCH 20/20] LGVISIUM-102: add ticket number for TODO --- README.predictions-json.md | 2 +- src/stratigraphy/extract.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.predictions-json.md b/README.predictions-json.md index 375151e4..306e2260 100644 --- a/README.predictions-json.md +++ b/README.predictions-json.md @@ -1,4 +1,4 @@ -# `predictions.json` output Structure +# `predictions.json` output structure The `predictions.json` file contains the results of a data extraction process in a machine-readable format. By default, the file is written to `data/output/predictions.json`. Each key in the JSON object is the name of a PDF file. 
The extracted data is listed as an object with the following keys: diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index d8beeeaa..1c99469f 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -170,7 +170,7 @@ def process_page( rect=pair.block.rect, page=page_number, ), - # TODO don't automatically convert any interval to an AAboveBInterval + # TODO LGVISIUM-104 don't automatically convert any interval to an AAboveBInterval depth_interval=AAboveBInterval(start=pair.depth_interval.start, end=pair.depth_interval.end) if pair.depth_interval else None,