diff --git a/README.md b/README.md index b91a247f..cc879bdd 100644 --- a/README.md +++ b/README.md @@ -124,103 +124,9 @@ Use `boreholes-extract-all --help` to see all options for the extraction script. 4. **Check the results** -Once the script has finished running, you can check the results in the `data/output/draw` directory. The result is a `predictions.json` file as well as a png file for each page of each PDF in the specified input directory. - -### Output Structure -The `predictions.json` file contains the results of a data extraction process from PDF files. Each key in the JSON object is the name of a PDF file, and the value is a list of extracted items in a dictionary like object. The extracted items for now are the material descriptions in their correct order (given by their depths). - -Example: predictions.json -```json -{ - "685256002-bp.pdf": { # file name - "language": "de", - "metadata": { - "coordinates": null - }, - "layers": [ # a layer corresponds to a material layer in the borehole profile - { - "material_description": { # all information about the complete description of the material of the layer - "text": "grauer, siltig-sandiger Kies (Auffullung)", - "rect": [ - 232.78799438476562, - 130.18496704101562, - 525.6640014648438, - 153.54295349121094 - ], - "lines": [ - { - "text": "grauer, siltig-sandiger Kies (Auffullung)", - "rect": [ - 232.78799438476562, - 130.18496704101562, - 525.6640014648438, - 153.54295349121094 - ], - "page": 1 - } - ], - "page": 1 - }, - "depth_interval": { # information about the depth of the layer - "start": null, - "end": { - "value": 0.4, - "rect": [ - 125.25399780273438, - 140.2349853515625, - 146.10398864746094, - 160.84498596191406 - ], - "page": 1 - } - } - }, - ... - ], - "depths_materials_column_pairs": [ # information about where on the pdf the information for material description as well as depths are taken. 
- { - "depth_column": { - "rect": [ - 119.05999755859375, - 140.2349853515625, - 146.8470001220703, - 1014.4009399414062 - ], - "entries": [ - { - "value": 0.4, - "rect": [ - 125.25399780273438, - 140.2349853515625, - 146.10398864746094, - 160.84498596191406 - ], - "page": 1 - }, - { - "value": 0.6, - "rect": [ - 125.21800231933594, - 153.8349609375, - 146.0679931640625, - 174.44496154785156 - ], - "page": 1 - }, - ... - ] - } - } - ], - "page_dimensions": [ - { - "height": 1192.0999755859375, - "width": 842.1500244140625 - } - ] - }, -} -``` +The script produces output in two different formats: +- A file `data/output/predictions.json` that contains all extracted data in a machine-readable format. The structure of this file is documented in [README.predictions-json.md](README.predictions-json.md). +- A PNG image of each processed PDF page in the `data/output/draw` directory, where the extracted data is highlighted. # Developer Guidance ## Project Structure diff --git a/README.predictions-json.md b/README.predictions-json.md new file mode 100644 index 00000000..306e2260 --- /dev/null +++ b/README.predictions-json.md @@ -0,0 +1,128 @@ +# `predictions.json` output structure +The `predictions.json` file contains the results of a data extraction process in a machine-readable format. By default, the file is written to `data/output/predictions.json`. + +Each key in the JSON object is the name of a PDF file. The extracted data is listed as an object with the following keys: +- `metadata` + - `elevation`: the detected elevation (if any) and the location in the PDF where it was extracted from. + - `coordinates`: the detected coordinates (if any) and the location in the PDF where they were extracted from. + - `language`: language that was detected for the document. 
+ - `page_dimensions`: dimensions of each page in the PDF, measured in PDF points +- `layers`: a list of objects, where each object represents a layer of the borehole profile, using the following keys: + - `material_description`: the text of the material description, both as a single value and line-by-line, and the location in the PDF where the text and each of its lines were extracted from. + - `depth_interval`: the measured depth of the upper and lower limits of the layer, and the location in the PDF where they were extracted from. +- `bounding_boxes`: a list of objects, one for each (part of a) borehole profile in the PDF, that list some bounding boxes that can be used for visualizations. Each object has the following keys: + - `sidebar_rect`: the area of the page that contains a "sidebar" (if any), which contains depths or other data displayed to the side of material descriptions. + - `depth_column_entries`: list of locations of the entries in the depth column (if any). + - `material_description_rect`: the area of the page that contains all material descriptions. + - `page`: the number of the page of the PDF. +- `groundwater`: a list of objects, one for each groundwater measurement that was extracted from the PDF. Each object has the following keys: + - `date`: extracted date for the groundwater measurement (if any) as a string in YYYY-MM-DD format. + - `depth`: the measured depth (in m) of the groundwater measurement. + - `elevation`: the elevation (in m above sea level) of the groundwater measurement. + - `page` and `rect`: the location in the PDF where the groundwater measurement was extracted from. + +All page numbers are counted starting at 1. + +All bounding boxes are measured with PDF points as the unit, and with the top-left of the page as the origin. 
+ +## Example output +```yaml +{ + "B366.pdf": { # file name + "metadata": { + "elevation": { + "elevation": 355.35, + "page": 1, + "rect": [27.49843978881836, 150.2817840576172, 159.42971801757812, 160.76754760742188] + }, + "coordinates": { + "E": 659490.0, + "N": 257200.0, + "rect": [28.263830184936523, 179.63882446289062, 150.3379364013672, 188.7487335205078], + "page": 1 + }, + "language": "de", + "page_dimensions": [ + { + "width": 591.956787109375, + "height": 1030.426025390625 + }, + { + "width": 588.009521484375, + "height": 792.114990234375 + } + ] + }, + "layers": [ + { + "material_description": { + "text": "beiger, massig-dichter, stark dolomitisierter Kalk, mit Muschelresten", + "lines": [ + { + "text": "beiger, massig-dichter, stark", + "page": 1, + "rect": [258.5303039550781, 345.9997253417969, 379.9410705566406, 356.1011657714844] + }, + { + "text": "dolomitisierter Kalk, mit", + "page": 1, + "rect": [258.2362060546875, 354.4559326171875, 363.0706787109375, 364.295654296875] + }, + { + "text": "Muschelresten", + "page": 1, + "rect": [258.48748779296875, 363.6712341308594, 313.03204345703125, 371.3343505859375] + } + ], + "page": 1, + "rect": [258.2362060546875, 345.9997253417969, 379.9410705566406, 371.3343505859375] + }, + "depth_interval": { + "start": { + "value": 1.5, + "rect": [200.63790893554688, 331.3035888671875, 207.83108520507812, 338.30450439453125] + }, + "end": { + "value": 6.0, + "rect": [201.62551879882812, 374.30560302734375, 210.0361328125, 380.828857421875] + } + } + }, + # ... 
(more layers) + ], + "bounding_boxes": [ + { + "sidebar_rect": [198.11251831054688, 321.8956298828125, 210.75906372070312, 702.2628173828125], + "depth_column_entries": [ + [200.1201171875, 321.8956298828125, 208.59901428222656, 328.6802062988281], + [200.63790893554688, 331.3035888671875, 207.83108520507812, 338.30450439453125], + [201.62551879882812, 374.30560302734375, 210.0361328125, 380.828857421875], + [199.86251831054688, 434.51556396484375, 210.10894775390625, 441.4538879394531], + [198.11251831054688, 557.5472412109375, 210.35877990722656, 563.9244995117188], + [198.28451538085938, 582.0216674804688, 209.76953125, 588.7603759765625], + [198.7814178466797, 616.177001953125, 209.50042724609375, 622.502197265625], + [198.6378173828125, 663.2830810546875, 210.75906372070312, 669.5428466796875], + [198.26901245117188, 695.974609375, 209.12693786621094, 702.2628173828125] + ], + "material_description_rect": [256.777099609375, 345.9997253417969, 392.46051025390625, 728.2700805664062], + "page": 1 + }, + { + "sidebar_rect": null, + "depth_column_entries": [], + "material_description_rect": [192.3216094970703, 337.677978515625, 291.1827392578125, 633.6331176757812], + "page": 2 + } + ], + "groundwater": [ + { + "date": "1979-11-29", + "depth": 19.28, + "elevation": 336.07, + "page": 1, + "rect": [61.23963928222656, 489.3185119628906, 94.0096435546875, 513.6478881835938] + } + ] + } +} +``` \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 4766964c..fe9026d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,6 @@ dependencies = [ "boto3", "pandas", "levenshtein", - "pathlib", "python-dotenv", "setuptools", "tqdm", diff --git a/src/stratigraphy/annotations/draw.py b/src/stratigraphy/annotations/draw.py index e387d1e5..de513c69 100644 --- a/src/stratigraphy/annotations/draw.py +++ b/src/stratigraphy/annotations/draw.py @@ -8,8 +8,7 @@ import pandas as pd from dotenv import load_dotenv from 
stratigraphy.data_extractor.data_extractor import FeatureOnPage -from stratigraphy.depthcolumn.depthcolumn import DepthColumn -from stratigraphy.depths_materials_column_pairs.depths_materials_column_pairs import DepthsMaterialsColumnPairs +from stratigraphy.depths_materials_column_pairs.bounding_boxes import BoundingBoxes from stratigraphy.groundwater.groundwater_extraction import Groundwater from stratigraphy.layer.layer import Layer from stratigraphy.metadata.coordinate_extraction import Coordinate @@ -55,7 +54,7 @@ def draw_predictions( for file_prediction in predictions.file_predictions_list: logger.info("Drawing predictions for file %s", file_prediction.file_name) - depths_materials_column_pairs = file_prediction.depths_materials_columns_pairs + bounding_boxes = file_prediction.bounding_boxes coordinates = file_prediction.metadata.coordinates elevation = file_prediction.metadata.elevation @@ -98,7 +97,7 @@ def draw_predictions( draw_depth_columns_and_material_rect( shape, page.derotation_matrix, - [pair for pair in depths_materials_column_pairs if pair.page == page_number], + [bboxes for bboxes in bounding_boxes if bboxes.page == page_number], ) draw_material_descriptions( shape, @@ -245,7 +244,7 @@ def draw_material_descriptions(shape: fitz.Shape, derotation_matrix: fitz.Matrix def draw_depth_columns_and_material_rect( - shape: fitz.Shape, derotation_matrix: fitz.Matrix, depths_materials_column_pairs: list[DepthsMaterialsColumnPairs] + shape: fitz.Shape, derotation_matrix: fitz.Matrix, bounding_boxes: list[BoundingBoxes] ): """Draw depth columns as well as the material rects on a pdf page. @@ -257,25 +256,22 @@ def draw_depth_columns_and_material_rect( Args: shape (fitz.Shape): The shape object for drawing. derotation_matrix (fitz.Matrix): The derotation matrix of the page. - depths_materials_column_pairs (list): List of depth column entries. + bounding_boxes (list[BoundingBoxes]): List of bounding boxes for depth column and material descriptions. 
""" - for pair in depths_materials_column_pairs: - depth_column: DepthColumn = pair.depth_column - material_description_rect = pair.material_description_rect - - if depth_column: # Draw rectangle for depth columns + for bboxes in bounding_boxes: + if bboxes.sidebar_bbox: # Draw rectangle for depth columns shape.draw_rect( - fitz.Rect(depth_column.rect()) * derotation_matrix, + fitz.Rect(bboxes.sidebar_bbox.rect) * derotation_matrix, ) shape.finish(color=fitz.utils.getColor("green")) - for depth_column_entry in depth_column.entries: # Draw rectangle for depth column entries + for depth_column_entry in bboxes.depth_column_entry_bboxes: # Draw rectangle for depth column entries shape.draw_rect( fitz.Rect(depth_column_entry.rect) * derotation_matrix, ) shape.finish(color=fitz.utils.getColor("purple")) shape.draw_rect( # Draw rectangle for material description column - fitz.Rect(material_description_rect) * derotation_matrix, + bboxes.material_description_bbox.rect * derotation_matrix, ) shape.finish(color=fitz.utils.getColor("red")) diff --git a/src/stratigraphy/depthcolumn/depthcolumn.py b/src/stratigraphy/depthcolumn/depthcolumn.py deleted file mode 100644 index 6e6eb97f..00000000 --- a/src/stratigraphy/depthcolumn/depthcolumn.py +++ /dev/null @@ -1,583 +0,0 @@ -"""This module contains the DepthColumn class, which is used to represent a depth column in a pdf page.""" - -from __future__ import annotations - -import abc -from dataclasses import dataclass - -import fitz -import numpy as np -from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry, LayerDepthColumnEntry -from stratigraphy.layer.layer_identifier_column import LayerIdentifierColumn -from stratigraphy.lines.line import TextLine, TextWord -from stratigraphy.text.find_description import get_description_blocks -from stratigraphy.text.textblock import TextBlock -from stratigraphy.util.dataclasses import Line -from stratigraphy.util.interval import BoundaryInterval, Interval, LayerInterval - - 
-class DepthColumn(metaclass=abc.ABCMeta): - """Abstract DepthColumn class.""" - - @abc.abstractmethod - def __init__(self): # noqa: D107 - pass - - @abc.abstractmethod - def depth_intervals(self) -> list[Interval]: - """Get the depth intervals of the depth column.""" - pass - - @abc.abstractmethod - def rects(self) -> list[fitz.Rect]: - """Get the rectangles of the depth column entries.""" - pass - - """Used for scoring how well a depth column corresponds to a material description bbox.""" - - def rect(self) -> fitz.Rect: - """Get the bounding box of the depth column entries.""" - x0 = min([rect.x0 for rect in self.rects()]) - x1 = max([rect.x1 for rect in self.rects()]) - y0 = min([rect.y0 for rect in self.rects()]) - y1 = max([rect.y1 for rect in self.rects()]) - return fitz.Rect(x0, y0, x1, y1) - - @property - def max_x0(self) -> float: - """Get the maximum x0 value of the depth column entries.""" - return max([rect.x0 for rect in self.rects()]) - - @property - def min_x1(self) -> float: - """Get the minimum x1 value of the depth column entries.""" - return min([rect.x1 for rect in self.rects()]) - - @abc.abstractmethod - def noise_count(self, all_words: list[TextWord]) -> int: - """Count the number of words that intersect with the depth column entries. - - Args: - all_words (list[TextWord]): A list of all text lines on the page. - - Returns: - int: The number of words that intersect with the depth column entries but are not part of it. - """ - pass - - @abc.abstractmethod - def identify_groups( - self, description_lines: list[TextLine], geometric_lines: list[Line], material_description_rect: fitz.Rect - ) -> list[IntervalBlockGroup]: - """Identifies groups of description blocks that correspond to depth intervals. - - Args: - description_lines (list[TextLine]): A list of text lines that are part of the description. - geometric_lines (list[Line]): A list of geometric lines that are part of the description. 
- material_description_rect (fitz.Rect): The bounding box of the material description. - - Returns: - list[IntervalBlockGroup]: A list of groups, where each group is a IntervalBlockGroup. - """ - pass - - @abc.abstractmethod - def to_json(self): - """Converts the object to a dictionary.""" - pass - - @classmethod - @abc.abstractmethod - def from_json(cls, json_depth_column: dict) -> DepthColumn: - """Converts a dictionary to an object.""" - pass - - -class DepthColumnFactory: - """Factory class for creating DepthColumn objects.""" - - @staticmethod - def create(data: dict) -> DepthColumn: - """Creates a DepthColumn object from a dictionary. - - Args: - data (dict): A dictionary representing the depth column. - - Returns: - DepthColumn: The depth column object. - """ - column_type = data.get("type") - if column_type == "BoundaryDepthColumn": - return BoundaryDepthColumn.from_json(data) - elif column_type == "LayerDepthColumn": - return LayerDepthColumn.from_json(data) - elif column_type == "LayerIdentifierColumn": - return LayerIdentifierColumn.from_json(data) - else: - raise ValueError(f"Unknown depth column type: {column_type}") - - -class LayerDepthColumn(DepthColumn): - """Represents a depth column where the upper and lower depths of each layer are explicitly specified. - - Example:: - 0 - 0.1m: xxx - 0.1 - 0.3m: yyy - 0.3 - 0.8m: zzz - ... - """ - - entries: list[LayerDepthColumnEntry] - - def __init__(self, entries=None): - super().__init__() - - if entries is not None: - self.entries = entries - else: - self.entries = [] - - def __repr__(self): - """Converts the object to a string. - - Returns: - str: The object as a string. - """ - return "LayerDepthColumn({})".format(", ".join([str(entry) for entry in self.entries])) - - def to_json(self) -> dict: - """Converts the object to a dictionary. - - Returns: - dict: The object as a dictionary. 
- """ - rect = self.rect() - return { - "rect": [rect.x0, rect.y0, rect.x1, rect.y1], - "entries": [entry.to_json() for entry in self.entries], - "type": "LayerDepthColumn", - } - - @classmethod - def from_json(cls, json_depth_column: dict) -> LayerDepthColumn: - """Converts a dictionary to an object. - - Args: - json_depth_column (dict): A dictionary representing the depth column. - - Returns: - LayerDepthColumn: The depth column object. - """ - entries_data = json_depth_column.get("entries", []) - entries = [LayerDepthColumnEntry.from_json(entry) for entry in entries_data] - return LayerDepthColumn(entries) - - def add_entry(self, entry: LayerDepthColumnEntry) -> LayerDepthColumn: - """Adds a depth column entry to the depth column. - - Args: - entry (LayerDepthColumnEntry): The depth column entry to add. - - Returns: - LayerDepthColumn: The depth column with the new entry. - """ - self.entries.append(entry) - return self - - def depth_intervals(self) -> list[Interval]: - return [LayerInterval(entry) for entry in self.entries] - - def rects(self) -> list[fitz.Rect]: - return [entry.rect for entry in self.entries] - - def noise_count(self, all_words: list[TextWord]) -> int: - # currently, we don't count noise for layer columns - return 0 - - def break_on_mismatch(self) -> list[LayerDepthColumn]: - """Breaks the depth column into segments where the depth intervals are not in an arithmetic progression. - - Returns: - list[LayerDepthColumn]: A list of depth column segments. 
- """ - segments = [] - segment_start = 0 - for index, current_entry in enumerate(self.entries): - if index >= 1 and current_entry.start.value < self.entries[index - 1].end.value: - # (_, big) || (small, _) - segments.append(self.entries[segment_start:index]) - segment_start = index - - final_segment = self.entries[segment_start:] - if final_segment: - segments.append(final_segment) - - return [LayerDepthColumn(segment) for segment in segments] - - def is_valid(self) -> bool: - """Checks if the depth column is valid. - - A depth column is valid if it is strictly increasing and the depth intervals are significant. - - Returns: - bool: True if the depth column is valid, False otherwise. - """ - if len(self.entries) <= 2: - return False - - # At least half of the "end" values must match the subsequent "start" value (e.g. 2-5m, 5-9m). - sequence_matches_count = 0 - for index, entry in enumerate(self.entries): - if index >= 1 and self.entries[index - 1].end.value == entry.start.value: - sequence_matches_count += 1 - - return sequence_matches_count / (len(self.entries) - 1) > 0.5 - - def identify_groups( - self, - description_lines: list[TextLine], - geometric_lines: list[Line], - material_description_rect: fitz.Rect, - **params, - ) -> list[IntervalBlockGroup]: - """Identifies groups of description blocks that correspond to depth intervals. - - Args: - description_lines (list[TextLine]): A list of text lines that are part of the description. - geometric_lines (list[Line]): A list of geometric lines that are part of the description. - material_description_rect (fitz.Rect): The bounding box of the material description. - params (dict): A dictionary of parameters used for line detection. - - Returns: - list[IntervalBlockGroup]: A list of groups, where each group is a IntervalBlockGroup. 
- """ - depth_intervals = self.depth_intervals() - - groups = [] - line_index = 0 - - for interval_index, interval in enumerate(depth_intervals): - # don't allow a layer above depth 0 - if interval.start is None and interval.end.value == 0: - continue - - next_interval = depth_intervals[interval_index + 1] if interval_index + 1 < len(depth_intervals) else None - - matched_blocks = interval.matching_blocks(description_lines, line_index, next_interval) - line_index += sum([len(block.lines) for block in matched_blocks]) - groups.append(IntervalBlockGroup(depth_intervals=[interval], blocks=matched_blocks)) - return groups - - -class BoundaryDepthColumn(DepthColumn): - """Represents a depth column. - - The depths of the boundaries between layers are labels, at a vertical position on - the page that is proportional to the depth. - - Example: - 0m - - 0.2m - - - 0.5m - ... - """ - - entries: list[DepthColumnEntry] - - def __init__(self, entries: list = None): - """Initializes a BoundaryDepthColumn object. - - Args: - entries (list, optional): Depth Column Entries for the depth column. Defaults to None. - """ - super().__init__() - - if entries is not None: - self.entries = entries - else: - self.entries = [] - - def rects(self) -> list[fitz.Rect]: - return [entry.rect for entry in self.entries] - - def __repr__(self): - return "DepthColumn({})".format(", ".join([str(entry) for entry in self.entries])) - - def to_json(self) -> dict: - """Converts the object to a dictionary. - - Returns: - dict: The object as a dictionary. - """ - rect = self.rect() - return { - "rect": [rect.x0, rect.y0, rect.x1, rect.y1], - "entries": [entry.to_json() for entry in self.entries], - "type": "BoundaryDepthColumn", - } - - @classmethod - def from_json(cls, json_depth_column: dict) -> BoundaryDepthColumn: - """Converts a dictionary to an object. - - Args: - json_depth_column (dict): A dictionary representing the depth column. - - Returns: - BoundaryDepthColumn: The depth column object. 
- """ - entries_data = json_depth_column.get("entries", []) - entries = [DepthColumnEntry.from_json(entry) for entry in entries_data] - return BoundaryDepthColumn(entries) - - def add_entry(self, entry: DepthColumnEntry) -> BoundaryDepthColumn: - """Adds a depth column entry to the depth column. - - Args: - entry (DepthColumnEntry): The depth column entry to add. - - Returns: - BoundaryDepthColumn: The depth column with the new entry. - """ - self.entries.append(entry) - return self - - """ - Check if the middle of the new rect is between the outer horizontal boundaries of the column, and if there is an - intersection with the minimal horizontal boundaries of the column. - """ - - def can_be_appended(self, rect: fitz.Rect) -> bool: - """Checks if a new depth column entry can be appended to the current depth column. - - The checks are: - - The width of the new rectangle is greater than the width of the current depth column. Or; - - The middle of the new rectangle is within the horizontal boundaries of the current depth column. - - The new rectangle intersects with the minimal horizontal boundaries of the current depth column. - - - Args: - rect (fitz.Rect): Rect of the depth column entry to be appended. - - Returns: - bool: True if the new depth column entry can be appended, False otherwise. 
- """ - new_middle = (rect.x0 + rect.x1) / 2 - if (self.rect().width < rect.width or self.rect().x0 < new_middle < self.rect().x1) and ( - rect.x0 <= self.min_x1 and self.max_x0 <= rect.x1 - ): - return True - return False - - def valid_initial_segment(self, rect: fitz.Rect) -> BoundaryDepthColumn: - for i in range(len(self.entries) - 1): - initial_segment = BoundaryDepthColumn(self.entries[: -i - 1]) - if initial_segment.can_be_appended(rect): - return initial_segment - return BoundaryDepthColumn() - - def strictly_contains(self, other: BoundaryDepthColumn) -> bool: - return len(other.entries) < len(self.entries) and all( - other_entry in self.entries for other_entry in other.entries - ) - - def is_strictly_increasing(self) -> bool: - return all(i.value < j.value for i, j in zip(self.entries, self.entries[1:], strict=False)) - - def depth_intervals(self) -> list[BoundaryInterval]: - """Creates a list of depth intervals from the depth column entries. - - The first depth interval has an open start value (i.e. None). - - Returns: - list[BoundaryInterval]: A list of depth intervals. - """ - depth_intervals = [BoundaryInterval(None, self.entries[0])] - for i in range(len(self.entries) - 1): - depth_intervals.append(BoundaryInterval(self.entries[i], self.entries[i + 1])) - depth_intervals.append( - BoundaryInterval(self.entries[len(self.entries) - 1], None) - ) # even though no open ended intervals are allowed, they are still useful for matching, - # especially for documents where the material description rectangle is too tall - # (and includes additional lines below the actual material descriptions). 
- return depth_intervals - - def significant_arithmetic_progression(self) -> bool: - # to allow for OCR errors or gaps in the progression, we only require a segment of length 6 that is an - # arithmetic progression - segment_length = 6 - if len(self.entries) < segment_length: - return self.is_arithmetic_progression() - else: - for i in range(len(self.entries) - segment_length + 1): - if BoundaryDepthColumn(self.entries[i : i + segment_length]).is_arithmetic_progression(): - return True - return False - - def is_arithmetic_progression(self) -> bool: - if len(self.entries) <= 2: - return True - - progression = np.array(range(len(self.entries))) - entries = np.array([entry.value for entry in self.entries]) - - # Avoid warnings in the np.corrcoef call, as the correlation coef is undefined if the standard deviation is 0. - if np.std(entries) == 0: - return False - - scale_pearson_correlation_coef = np.corrcoef(entries, progression)[0, 1].item() - return abs(scale_pearson_correlation_coef) >= 0.9999 - - def noise_count(self, all_words: list[TextWord]) -> int: - """Counts the number of words that intersect with the depth column entries. - - Returns the number of words that intersect with the depth column entries, but are not part of the depth column. - - Args: - all_words (list[TextWord]): A list of all text lines on the page. - - Returns: - int: The number of words that intersect with the depth column entries but are not part of it. - """ - - def significant_intersection(other_rect): - intersection = fitz.Rect(other_rect).intersect(self.rect()) - return intersection.is_valid and intersection.width > 0.25 * self.rect().width - - return len([word for word in all_words if significant_intersection(word.rect)]) - len(self.entries) - - def pearson_correlation_coef(self) -> float: - # We look at the lower y coordinate, because most often the baseline of the depth value text is aligned with - # the line of the corresponding layer boundary. 
- positions = np.array([entry.rect.y1 for entry in self.entries]) - entries = np.array([entry.value for entry in self.entries]) - - # Avoid warnings in the np.corrcoef call, as the correlation coef is undefined if the standard deviation is 0. - if np.std(entries) == 0 or np.std(positions) == 0: - return 0 - - return np.corrcoef(positions, entries)[0, 1].item() - - def remove_entry_by_correlation_gradient(self) -> BoundaryDepthColumn | None: - if len(self.entries) < 3: - return None - - new_columns = [ - BoundaryDepthColumn([entry for index, entry in enumerate(self.entries) if index != remove_index]) - for remove_index in range(len(self.entries)) - ] - return max(new_columns, key=lambda column: column.pearson_correlation_coef()) - - def break_on_double_descending(self) -> list[BoundaryDepthColumn]: - segments = [] - segment_start = 0 - for index, current_entry in enumerate(self.entries): - if ( - index >= 2 - and index + 1 < len(self.entries) - and current_entry.value < self.entries[index - 2].value - and current_entry.value < self.entries[index - 1].value - and self.entries[index + 1].value < self.entries[index - 2].value - and self.entries[index + 1].value < self.entries[index - 1].value - ): - # big big || small small - segments.append(self.entries[segment_start:index]) - segment_start = index - - final_segment = self.entries[segment_start:] - if final_segment: - segments.append(final_segment) - - return [BoundaryDepthColumn(segment) for segment in segments] - - def identify_groups( - self, - description_lines: list[TextLine], - geometric_lines: list[Line], - material_description_rect: fitz.Rect, - **params, - ) -> list[IntervalBlockGroup]: - """Identifies groups of description blocks that correspond to depth intervals. - - Note: includes a heuristic of whether there should be a group corresponding to a final depth interval - starting from the last depth entry without any end value. 
- - Args: - description_lines (list[TextLine]): A list of text lines that are part of the description. - geometric_lines (list[Line]): A list of geometric lines that are part of the description. - material_description_rect (fitz.Rect): The bounding box of the material description. - params (dict): A dictionary of parameters used for line detection. - - Returns: - list[IntervalBlockGroup]: A list of groups, where each group is a IntervalBlockGroup. - - Example: - [ - { - "depth_intervals": [BoundaryInterval(None, 0.1), BoundaryInterval(0.1, 0.3), ...], - "blocks": [DescriptionBlock(...), DescriptionBlock(...), ...] - }, - { - "depth_intervals": [BoundaryInterval(0.3, 0.7)], - "blocks": [DescriptionBlock(...), DescriptionBlock(...), ...] - }, - ... - ] - """ - depth_intervals = self.depth_intervals() - - groups = [] - - current_intervals = [] - current_blocks = [] - all_blocks = get_description_blocks( - description_lines, - geometric_lines, - material_description_rect, - params["block_line_ratio"], - left_line_length_threshold=params["left_line_length_threshold"], - target_layer_count=len(depth_intervals), - ) - - block_index = 0 - - for interval in depth_intervals: - # don't allow a layer above depth 0 - if interval.start is None and interval.end.value == 0: - continue - - pre, exact, post = interval.matching_blocks(all_blocks, block_index) - block_index += len(pre) + len(exact) + len(post) - - current_blocks.extend(pre) - if len(exact): - if len(current_intervals) > 0 or len(current_blocks) > 0: - groups.append(IntervalBlockGroup(depth_intervals=current_intervals, blocks=current_blocks)) - groups.append(IntervalBlockGroup(depth_intervals=[interval], blocks=exact)) - current_blocks = post - current_intervals = [] - else: - # The final open-ended interval should not be added, since borehole profiles do typically not come - # with open-ended intervals. 
- if interval.end is not None: - current_intervals.append(interval) - - if len(current_intervals) > 0 or len(current_blocks) > 0: - groups.append(IntervalBlockGroup(depth_intervals=current_intervals, blocks=current_blocks)) - - return groups - - -@dataclass -class IntervalBlockGroup: - """Helper class to represent a group of depth intervals and an associated group of text blocks. - - The class is used to simplify the code for obtaining an appropriate one-to-one correspondence between depth - intervals and material descriptions. - """ - - depth_intervals: list[Interval] - blocks: list[TextBlock] diff --git a/src/stratigraphy/depthcolumn/depthcolumnentry.py b/src/stratigraphy/depthcolumn/depthcolumnentry.py index 0a9faac4..f20200f8 100644 --- a/src/stratigraphy/depthcolumn/depthcolumnentry.py +++ b/src/stratigraphy/depthcolumn/depthcolumnentry.py @@ -1,72 +1,86 @@ """Contains dataclasses for entries in a depth column.""" +from __future__ import annotations + +import re +from dataclasses import dataclass from typing import Any import fitz +from stratigraphy.lines.line import TextWord +@dataclass class DepthColumnEntry: # noqa: D101 """Class to represent a depth column entry.""" - def __init__(self, rect: fitz.Rect, value: float, page_number: int): - self.rect = rect - self.value = value - self.page_number = page_number + rect: fitz.Rect + value: float def __repr__(self) -> str: return str(self.value) def to_json(self) -> dict[str, Any]: """Convert the depth column entry to a JSON serializable format.""" - return { - "value": self.value, - "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1], - "page": self.page_number, - } + return {"value": self.value, "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1]} @classmethod - def from_json(cls, json_depth_column_entry: dict) -> "DepthColumnEntry": + def from_json(cls, data: dict) -> DepthColumnEntry: """Converts a dictionary to an object. 
Args: - json_depth_column_entry (dict): A dictionary representing the depth column entry. + data (dict): A dictionary representing the depth column entry. Returns: DepthColumnEntry: The depth column entry object. """ - return cls( - rect=fitz.Rect(json_depth_column_entry["rect"]), - value=json_depth_column_entry["value"], - page_number=json_depth_column_entry["page"], - ) - - -class AnnotatedDepthColumnEntry(DepthColumnEntry): # noqa: D101 - """Class to represent a depth column entry obtained from LabelStudio. - - The annotation process in label studio does not come with rectangles for depth column entries. - Therefore, we set them to None. - """ - - def __init__(self, value): - super().__init__(None, value, None) - - def to_json(self) -> dict[str, Any]: - return { - "value": self.value, - "rect": self.rect, - "page": self.page_number, - } + return cls(rect=fitz.Rect(data["rect"]), value=data["value"]) + @classmethod + def find_in_words(cls, all_words: list[TextWord], include_splits: bool) -> list[DepthColumnEntry]: + """Find all depth column entries given a list of TextWord objects. -class LayerDepthColumnEntry: # noqa: D101 - """Class to represent a layer depth column entry.""" + Note: Only depths up to two digits before the decimal point are supported. - def __init__(self, start: DepthColumnEntry, end: DepthColumnEntry): - self.start = start - self.end = end + Args: + all_words (list[TextWord]): List of text words to extract depth column entries from. + include_splits (bool): Whether to include split entries. - assert start.page_number == end.page_number, "Start and end entries are on different pages." + Returns: + list[DepthColumnEntry]: The extracted depth column entries. + """ + entries = [] + for word in sorted(all_words, key=lambda word: word.rect.y0): + try: + input_string = word.text.strip().replace(",", ".") + regex = re.compile(r"^-?\.?([0-9]+(\.[0-9]+)?)[müMN\\.]*$") + # numbers such as '.40' are not supported. 
The reason is that sometimes the OCR + # recognizes a '-' as a '.' and we just omit the leading '.' to avoid this issue. + match = regex.match(input_string) + if match: + value = value_as_float(match.group(1)) + entries.append(DepthColumnEntry(word.rect, value)) + elif include_splits: + # support for e.g. "1.10-1.60m" extracted as a single word + a_to_b_depth_column_entry = AToBDepthColumnEntry.from_text(input_string, word.rect) + entries.extend( + [a_to_b_depth_column_entry.start, a_to_b_depth_column_entry.end] + if a_to_b_depth_column_entry + else [] + ) + except ValueError: + pass + return entries + + +@dataclass +class AToBDepthColumnEntry: # noqa: D101 + """Class to represent a depth column entry of the form "1m - 3m".""" + + # TODO do we need both this class as well as AToBInterval, or can we combine the two classes? + + start: DepthColumnEntry + end: DepthColumnEntry def __repr__(self) -> str: return f"{self.start.value}-{self.end.value}" @@ -78,23 +92,59 @@ def rect(self) -> fitz.Rect: def to_json(self) -> dict[str, Any]: """Convert the layer depth column entry to a JSON serializable format.""" - return { - "start": self.start.to_json(), - "end": self.end.to_json(), - "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1], - "page": self.start.page_number, - } + return {"start": self.start.to_json(), "end": self.end.to_json()} @classmethod - def from_json(cls, json_layer_depth_column_entry: dict) -> "LayerDepthColumnEntry": + def from_json(cls, data: dict) -> AToBDepthColumnEntry: """Converts a dictionary to an object. Args: - json_layer_depth_column_entry (dict): A dictionary representing the layer depth column entry. + data (dict): A dictionary representing the layer depth column entry. Returns: - LayerDepthColumnEntry: The layer depth column entry object. + AToBDepthColumnEntry: The A-to-B depth column entry object.
""" - start = DepthColumnEntry.from_json(json_layer_depth_column_entry["start"]) - end = DepthColumnEntry.from_json(json_layer_depth_column_entry["end"]) + start = DepthColumnEntry.from_json(data["start"]) + end = DepthColumnEntry.from_json(data["end"]) return cls(start, end) + + @classmethod + def from_text( + cls, text: str, rect: fitz.Rect, require_start_of_string: bool = True + ) -> AToBDepthColumnEntry | None: + """Attempts to extract an AToBDepthColumnEntry from a string. + + Args: + text (str): The string to extract the depth interval from. + rect (fitz.Rect): The rectangle of the text. + require_start_of_string (bool, optional): Whether the number to extract needs to be + at the start of a string. Defaults to True. + + Returns: + AToBDepthColumnEntry | None: The extracted AToBDepthColumnEntry or None if none is found. + """ + input_string = text.strip().replace(",", ".") + + query = r"-?([0-9]+(\.[0-9]+)?)[müMN\]*[\s-]+([0-9]+(\.[0-9]+)?)[müMN\\.]*" + if not require_start_of_string: + query = r".*?"
+ query + regex = re.compile(query) + match = regex.match(input_string) + if match: + value1 = value_as_float(match.group(1)) + first_half_rect = fitz.Rect(rect.x0, rect.y0, rect.x1 - rect.width / 2, rect.y1) + + value2 = value_as_float(match.group(3)) + second_half_rect = fitz.Rect(rect.x0 + rect.width / 2, rect.y0, rect.x1, rect.y1) + return AToBDepthColumnEntry( + DepthColumnEntry(first_half_rect, value1), + DepthColumnEntry(second_half_rect, value2), + ) + return None + + +def value_as_float(string_value: str) -> float: # noqa: D103 + """Converts a string to a float.""" + # OCR sometimes tends to miss the decimal comma + parsed_text = re.sub(r"^-?([0-9]+)([0-9]{2})", r"\1.\2", string_value) + return abs(float(parsed_text)) diff --git a/src/stratigraphy/depthcolumn/find_depth_columns.py b/src/stratigraphy/depthcolumn/find_depth_columns.py deleted file mode 100644 index d7aa8321..00000000 --- a/src/stratigraphy/depthcolumn/find_depth_columns.py +++ /dev/null @@ -1,245 +0,0 @@ -"""This module contains functionalities to find depth columns in a pdf page.""" - -import re - -import fitz -from stratigraphy.depthcolumn.boundarydepthcolumnvalidator import BoundaryDepthColumnValidator -from stratigraphy.depthcolumn.depthcolumn import BoundaryDepthColumn, LayerDepthColumn -from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry, LayerDepthColumnEntry -from stratigraphy.lines.line import TextWord -from stratigraphy.text.textblock import TextBlock - - -def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> list[DepthColumnEntry]: - """Find all depth column entries given a list of TextLine objects. - - Note: Only depths up to two digits before the decimal point are supported. - - Args: - all_words (list[TextWord]): List of text words to extract depth column entries from. - include_splits (bool): Whether to include split entries. - - Returns: - list[DepthColumnEntry]: The extracted depth column entries. 
- """ - entries = [] - for word in sorted(all_words, key=lambda word: word.rect.y0): - try: - input_string = word.text.strip().replace(",", ".") - regex = re.compile(r"^-?\.?([0-9]+(\.[0-9]+)?)[müMN\\.]*$") - # numbers such as '.40' are not supported. The reason is that sometimes the OCR - # recognizes a '-' as a '.' and we just ommit the leading '.' to avoid this issue. - match = regex.match(input_string) - if match: - value = value_as_float(match.group(1)) - entries.append(DepthColumnEntry(word.rect, value, word.page_number)) - elif include_splits: - # support for e.g. "1.10-1.60m" extracted as a single word - layer_depth_column_entry = extract_layer_depth_interval(input_string, word.rect, word.page_number) - entries.extend( - [layer_depth_column_entry.start, layer_depth_column_entry.end] if layer_depth_column_entry else [] - ) - except ValueError: - pass - return entries - - -def value_as_float(string_value: str) -> float: # noqa: D103 - """Converts a string to a float.""" - # OCR sometimes tends to miss the decimal comma - parsed_text = re.sub(r"^-?([0-9]+)([0-9]{2})", r"\1.\2", string_value) - return abs(float(parsed_text)) - - -def extract_layer_depth_interval( - text: str, rect: fitz.Rect, page_number: int, require_start_of_string: bool = True -) -> LayerDepthColumnEntry | None: - """Extracts a LayerDepthColumnEntry from a string. - - Args: - text (str): The string to extract the depth interval from. - rect (fitz.Rect): The rectangle of the text. - page_number (int): The page number of the text. - require_start_of_string (bool, optional): Whether the number to extract needs to be - at the start of a string. Defaults to True. - - Returns: - LayerDepthColumnEntry | None: The extracted LayerDepthColumnEntry or None if none is found. - """ - input_string = text.strip().replace(",", ".") - - query = r"-?([0-9]+(\.[0-9]+)?)[müMN\]*[\s-]+([0-9]+(\.[0-9]+)?)[müMN\\.]*" - if not require_start_of_string: - query = r".*?" 
+ query - regex = re.compile(query) - match = regex.match(input_string) - if match: - value1 = value_as_float(match.group(1)) - first_half_rect = fitz.Rect(rect.x0, rect.y0, rect.x1 - rect.width / 2, rect.y1) - - value2 = value_as_float(match.group(3)) - second_half_rect = fitz.Rect(rect.x0 + rect.width / 2, rect.y0, rect.x1, rect.y1) - return LayerDepthColumnEntry( - DepthColumnEntry(first_half_rect, value1, page_number), - DepthColumnEntry(second_half_rect, value2, page_number), - ) - return None - - -def find_layer_depth_columns(entries: list[DepthColumnEntry], all_words: list[TextWord]) -> list[LayerDepthColumn]: - """Finds all layer depth columns. - - Generates a list of LayerDepthColumnEntry objects by finding consecutive pairs of DepthColumnEntry objects. - Different columns are grouped together in LayerDepthColumn objects. Finally, a list of LayerDepthColumn objects, - one for each column, is returned. - - A layer corresponds to a material layer. The layer is defined using a start and end point (e.g. 1.10-1.60m). - The start and end points are represented as DepthColumnEntry objects. - - Args: - entries (list[DepthColumnEntry]): List of depth column entries. - all_words (list[TextWord]): List of all TextWord objects. - - Returns: - list[LayerDepthColumn]: List of all layer depth columns identified. 
- """ - - def find_pair(entry: DepthColumnEntry) -> DepthColumnEntry | None: # noqa: D103 - min_y0 = entry.rect.y0 - entry.rect.height / 2 - max_y0 = entry.rect.y0 + entry.rect.height / 2 - for other in entries: - if entry == other: - continue - if other.value <= entry.value: - continue - combined_width = entry.rect.width + other.rect.width - if not entry.rect.x0 <= other.rect.x0 <= entry.rect.x0 + combined_width: - continue - if not min_y0 <= other.rect.y0 <= max_y0: - continue - in_between_text = " ".join( - [ - word.text - for word in all_words - if entry.rect.x0 < word.rect.x0 < other.rect.x0 and min_y0 <= word.rect.y0 <= max_y0 - ] - ) - if re.fullmatch(r"\W*m?\W*", in_between_text): - return other - - pairs = [(entry, find_pair(entry)) for entry in entries] - - columns = [] - for first, second in pairs: - if second is not None: - entry = LayerDepthColumnEntry(first, second) - is_matched = False - for column in columns: - column_rect = column.rect() - new_start_middle = (entry.start.rect.x0 + entry.start.rect.x1) / 2 - if column_rect.x0 < new_start_middle < column_rect.x1: - is_matched = True - column.add_entry(entry) - - if not is_matched: - columns.append(LayerDepthColumn([entry])) - - return [ - column_segment - for column in columns - for column_segment in column.break_on_mismatch() - if column_segment.is_valid() - ] - - -def find_depth_columns( - entries: list[DepthColumnEntry], all_words: list[TextWord], page_number: int, depth_column_params: dict -) -> list[BoundaryDepthColumn]: - """Construct all possible BoundaryDepthColumn objects from the given DepthColumnEntry objects. - - Args: - entries (list[DepthColumnEntry]): All found depth column entries in the page. - all_words (list[TextLine]): All words in the page. - page_number (int): The page number of the entries. - depth_column_params (dict): Parameters for the BoundaryDepthColumn objects. - - Returns: - list[BoundaryDepthColumn]: Found BoundaryDepthColumn objects. 
- """ - numeric_columns: list[BoundaryDepthColumn] = [] - for entry in entries: - has_match = False - additional_columns = [] - for column in numeric_columns: - if column.can_be_appended(entry.rect): - has_match = True - column.add_entry(entry) - else: - valid_initial_segment = column.valid_initial_segment(entry.rect) - if len(valid_initial_segment.entries) > 0: - has_match = True - additional_columns.append(valid_initial_segment.add_entry(entry)) - - numeric_columns.extend(additional_columns) - if not has_match: - numeric_columns.append(BoundaryDepthColumn(entries=[entry])) - - # only keep columns that are not contained in a different column - numeric_columns = [ - column - for column in numeric_columns - if all(not other.strictly_contains(column) for other in numeric_columns) - ] - - boundary_depth_column_validator = BoundaryDepthColumnValidator(all_words, **depth_column_params) - - numeric_columns = [ - boundary_depth_column_validator.reduce_until_valid(column, page_number) - for numeric_column in numeric_columns - for column in numeric_column.break_on_double_descending() - # when we have a perfect arithmetic progression, this is usually just a scale - # that does not match the descriptions - if not column.significant_arithmetic_progression() - ] - - return sorted( - [column for column in numeric_columns if column and boundary_depth_column_validator.is_valid(column)], - key=lambda column: len(column.entries), - ) - - -def get_depth_interval_from_textblock(block: TextBlock) -> LayerDepthColumnEntry | None: - """Extract depth interval from a material description block. - - For borehole profiles in the Deriaz layout, the depth interval is usually found in the text description - of the material. Often, these text descriptions contain a further separation into multiple sub layers. - These sub layers have their own depth intervals. This function extracts the overall depth interval, - spanning across all mentioned sub layers. 
- - Args: - block (TextBlock): The block to calculate the depth interval for. - - Returns: - LayerDepthColumnEntry | None: The depth interval. - """ - depth_entries = [] - for line in block.lines: - try: - layer_depth_entry = extract_layer_depth_interval( - line.text, line.rect, line.page_number, require_start_of_string=False - ) - # require_start_of_string = False because the depth interval may not always start at the beginning - # of the line e.g. "Remblais Heterogene: 0.00 - 0.5m" - if layer_depth_entry: - depth_entries.append(layer_depth_entry) - except ValueError: - pass - - if depth_entries: - # Merge the sub layers into one depth interval. - start = min([entry.start for entry in depth_entries], key=lambda start_entry: start_entry.value) - end = max([entry.end for entry in depth_entries], key=lambda end_entry: end_entry.value) - - return LayerDepthColumnEntry(start, end) - else: - return None diff --git a/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py b/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py new file mode 100644 index 00000000..ba1ebcd6 --- /dev/null +++ b/src/stratigraphy/depths_materials_column_pairs/bounding_boxes.py @@ -0,0 +1,83 @@ +"""Classes for JSON-serializable bounding boxes of different parts of a borehole profile.""" + +from dataclasses import dataclass + +import fitz +from stratigraphy.depths_materials_column_pairs.material_description_rect_with_sidebar import ( + MaterialDescriptionRectWithSidebar, +) + + +@dataclass +class BoundingBox: + """A single bounding box, JSON serializable.""" + + rect: fitz.Rect + + def to_json(self) -> list[int]: + """Converts the object to a dictionary. + + Returns: + list[int]: The object as a list. 
+ """ + return [ + self.rect.x0, + self.rect.y0, + self.rect.x1, + self.rect.y1, + ] + + @classmethod + def from_json(cls, data) -> "BoundingBox": + return cls(rect=fitz.Rect(data)) + + +@dataclass +class BoundingBoxes: + """A class to represent the bounding boxes of sidebars and associated material descriptions.""" + + sidebar_bbox: BoundingBox | None + depth_column_entry_bboxes: list[BoundingBox] + material_description_bbox: BoundingBox + page: int + + def to_json(self) -> dict: + """Converts the object to a dictionary. + + Returns: + dict: The object as a dictionary. + """ + return { + "sidebar_rect": self.sidebar_bbox.to_json() if self.sidebar_bbox else None, + "depth_column_entries": [entry.to_json() for entry in self.depth_column_entry_bboxes], + "material_description_rect": self.material_description_bbox.to_json(), + "page": self.page, + } + + @classmethod + def from_json(cls, data) -> "BoundingBoxes": + """Convert a JSON data structure to a BoundingBoxes object.""" + return cls( + sidebar_bbox=BoundingBox.from_json(data["sidebar_rect"]) if "sidebar_rect" in data else None, + depth_column_entry_bboxes=[BoundingBox.from_json(entry) for entry in data["depth_column_entries"]], + material_description_bbox=BoundingBox.from_json(data["material_description_rect"]), + page=data["page"], + ) + + @classmethod + def from_material_description_rect_with_sidebar( + cls, pair: MaterialDescriptionRectWithSidebar, page_number: int + ) -> "BoundingBoxes": + """Convert a MaterialDescriptionRectWithSidebar instance to a BoundingBoxes object.""" + if pair.sidebar: + depth_column_bbox = BoundingBox(pair.sidebar.rect()) + depth_column_entry_bboxes = [BoundingBox(entry.rect) for entry in pair.sidebar.entries] + else: + depth_column_bbox = None + depth_column_entry_bboxes = [] + return BoundingBoxes( + sidebar_bbox=depth_column_bbox, + depth_column_entry_bboxes=depth_column_entry_bboxes, + material_description_bbox=BoundingBox(pair.material_description_rect), + page=page_number, + ) 
diff --git a/src/stratigraphy/depths_materials_column_pairs/depths_materials_column_pairs.py b/src/stratigraphy/depths_materials_column_pairs/depths_materials_column_pairs.py deleted file mode 100644 index 4afbbef1..00000000 --- a/src/stratigraphy/depths_materials_column_pairs/depths_materials_column_pairs.py +++ /dev/null @@ -1,60 +0,0 @@ -"""Definition of the DepthsMaterialsColumnPairs class.""" - -from dataclasses import dataclass - -import fitz -from stratigraphy.depthcolumn.depthcolumn import DepthColumn, DepthColumnFactory - - -@dataclass -class DepthsMaterialsColumnPairs: - """A class to represent pairs of depth columns and material descriptions.""" - - depth_column: DepthColumn | None - material_description_rect: fitz.Rect - page: int - - def __str__(self) -> str: - """Converts the object to a string. - - Returns: - str: The object as a string. - """ - return ( - f"DepthsMaterialsColumnPairs(depth_column={self.depth_column}," - f"material_description_rect={self.material_description_rect}, page={self.page})" - ) - - def to_json(self) -> dict: - """Converts the object to a dictionary. - - Returns: - dict: The object as a dictionary. - """ - return { - "depth_column": self.depth_column.to_json() if self.depth_column else None, - "material_description_rect": [ - self.material_description_rect.x0, - self.material_description_rect.y0, - self.material_description_rect.x1, - self.material_description_rect.y1, - ], - "page": self.page, - } - - @classmethod - def from_json(cls, json_depths_materials_column_pairs: dict) -> "DepthsMaterialsColumnPairs": - """Converts a dictionary to an object. - - Args: - json_depths_materials_column_pairs (dict): A dictionary representing the depths materials column pairs. - - Returns: - DepthsMaterialsColumnPairs: The depths materials column pairs object. 
- """ - depth_column_entry = json_depths_materials_column_pairs["depth_column"] - depth_column = DepthColumnFactory.create(depth_column_entry) if depth_column_entry else None - material_description_rect = fitz.Rect(json_depths_materials_column_pairs["material_description_rect"]) - page = json_depths_materials_column_pairs["page"] - - return cls(depth_column, material_description_rect, page) diff --git a/src/stratigraphy/depths_materials_column_pairs/material_description_rect_with_sidebar.py b/src/stratigraphy/depths_materials_column_pairs/material_description_rect_with_sidebar.py new file mode 100644 index 00000000..8d4aa39b --- /dev/null +++ b/src/stratigraphy/depths_materials_column_pairs/material_description_rect_with_sidebar.py @@ -0,0 +1,41 @@ +"""Definition of the MaterialDescriptionRectWithSidebar class.""" + +import math +from dataclasses import dataclass + +import fitz +from stratigraphy.lines.line import TextWord +from stratigraphy.sidebar import Sidebar + + +@dataclass +class MaterialDescriptionRectWithSidebar: + """A class to represent pairs of sidebar and material description rectangle.""" + + sidebar: Sidebar | None + material_description_rect: fitz.Rect + + def score_match(self, all_words: list[TextWord] | None = None) -> float: + """Scores the match between a sidebar and a material description. + + Args: + all_words (list[TextWord] | None, optional): List of the available text words. Defaults to None. + + Returns: + float: The score of the match. 
+ """ + rect = self.sidebar.rect() + top = rect.y0 + bottom = rect.y1 + right = rect.x1 + distance = ( + abs(top - self.material_description_rect.y0) + + abs(bottom - self.material_description_rect.y1) + + abs(right - self.material_description_rect.x0) + ) + + height = bottom - top + + noise_count = self.sidebar.noise_count(all_words) if all_words else 0 + + return (height - distance) * math.pow(0.8, noise_count) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index cdc7dcd8..1c99469f 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -1,30 +1,30 @@ """Contains the main extraction pipeline for stratigraphy.""" import logging -import math from dataclasses import dataclass import fitz from stratigraphy.data_extractor.data_extractor import FeatureOnPage -from stratigraphy.depthcolumn import find_depth_columns -from stratigraphy.depthcolumn.depthcolumn import DepthColumn -from stratigraphy.depths_materials_column_pairs.depths_materials_column_pairs import DepthsMaterialsColumnPairs +from stratigraphy.depths_materials_column_pairs.bounding_boxes import BoundingBox, BoundingBoxes +from stratigraphy.depths_materials_column_pairs.material_description_rect_with_sidebar import ( + MaterialDescriptionRectWithSidebar, +) from stratigraphy.layer.layer import IntervalBlockPair, Layer -from stratigraphy.layer.layer_identifier_column import ( - LayerIdentifierColumn, - find_layer_identifier_column, - find_layer_identifier_column_entries, +from stratigraphy.lines.line import TextLine +from stratigraphy.sidebar import ( + AAboveBSidebarExtractor, + AToBSidebarExtractor, + LayerIdentifierSidebarExtractor, + Sidebar, ) -from stratigraphy.lines.line import TextLine, TextWord from stratigraphy.text.find_description import ( get_description_blocks, - get_description_blocks_from_layer_identifier, get_description_lines, ) from stratigraphy.text.textblock import MaterialDescription, MaterialDescriptionLine, TextBlock, block_distance from 
stratigraphy.util.dataclasses import Line -from stratigraphy.util.interval import BoundaryInterval, Interval +from stratigraphy.util.interval import AAboveBInterval, Interval from stratigraphy.util.util import ( x_overlap, x_overlap_significant_smallest, @@ -38,7 +38,7 @@ class ProcessPageResult: """The result of processing a single page of a pdf.""" predictions: list[Layer] - depth_material_pairs: list[DepthsMaterialsColumnPairs] + bounding_boxes: list[BoundingBoxes] def process_page( @@ -61,87 +61,88 @@ def process_page( Returns: list[dict]: All list of the text of all description blocks. """ - # Detect Layer Index Columns - layer_identifier_entries = find_layer_identifier_column_entries(lines) - layer_identifier_columns = ( - find_layer_identifier_column(layer_identifier_entries) if layer_identifier_entries else [] - ) - pairs = [] - if layer_identifier_columns: - for layer_identifier_column in layer_identifier_columns: - material_description_rect = find_material_description_column( - lines, layer_identifier_column, language, **params["material_description"] - ) - if material_description_rect: - pairs.append((layer_identifier_column, material_description_rect)) + # Detect Layer Identifier Sidebars - # Obtain the best pair. In contrast to depth columns, there only ever is one layer index column per page. 
- if pairs: - pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1])) + layer_identifier_sidebars = LayerIdentifierSidebarExtractor.from_lines(lines) + material_descriptions_sidebar_pairs = [] + for layer_identifier_sidebar in layer_identifier_sidebars: + material_description_rect = find_material_description_column( + lines, layer_identifier_sidebar, language, **params["material_description"] + ) + if material_description_rect: + material_descriptions_sidebar_pairs.append( + MaterialDescriptionRectWithSidebar(layer_identifier_sidebar, material_description_rect) + ) - words = [word for line in lines for word in line.words] + if material_descriptions_sidebar_pairs: + material_descriptions_sidebar_pairs.sort(key=lambda pair: pair.score_match()) - # If there is a layer identifier column, then we use this directly. - # Else, we search for depth columns. We could also think of some scoring mechanism to decide which one to use. - if not pairs: - depth_column_entries = find_depth_columns.depth_column_entries(words, include_splits=True) - layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words) + # If there is a layer identifier sidebar, then we use this directly. + # Else, we search for sidebars with depths. + # We could also think of some scoring mechanism to decide which one to use. 
+ if not material_descriptions_sidebar_pairs: + words = [word for line in lines for word in line.words] + a_to_b_sidebars = AToBSidebarExtractor.find_in_words(words) used_entry_rects = [] - for column in layer_depth_columns: + for column in a_to_b_sidebars: for entry in column.entries: used_entry_rects.extend([entry.start.rect, entry.end.rect]) - depth_column_entries = [ - entry - for entry in find_depth_columns.depth_column_entries(words, include_splits=False) - if entry.rect not in used_entry_rects - ] - depth_columns: list[DepthColumn] = layer_depth_columns - depth_columns.extend( - find_depth_columns.find_depth_columns( - depth_column_entries, words, page_number, depth_column_params=params["depth_column_params"] + sidebars: list[Sidebar] = a_to_b_sidebars + sidebars.extend( + AAboveBSidebarExtractor.find_in_words( + words, used_entry_rects, sidebar_params=params["depth_column_params"] ) ) - for depth_column in depth_columns: + for sidebar in sidebars: material_description_rect = find_material_description_column( - lines, depth_column, language, **params["material_description"] + lines, sidebar, language, **params["material_description"] ) if material_description_rect: - pairs.append((depth_column, material_description_rect)) + material_descriptions_sidebar_pairs.append( + MaterialDescriptionRectWithSidebar(sidebar, material_description_rect) + ) # lowest score first - pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1], words)) + material_descriptions_sidebar_pairs.sort(key=lambda pair: pair.score_match(words)) to_delete = [] - for i, (_depth_column, material_description_rect) in enumerate(pairs): - if any(material_description_rect.intersects(other_rect) for _, other_rect in pairs[i + 1 :]): + for i, pair in enumerate(material_descriptions_sidebar_pairs): + if any( + pair.material_description_rect.intersects(other_pair.material_description_rect) + for other_pair in material_descriptions_sidebar_pairs[i + 1 :] + ): to_delete.append(i) - 
filtered_pairs = [item for index, item in enumerate(pairs) if index not in to_delete] + filtered_pairs = [item for index, item in enumerate(material_descriptions_sidebar_pairs) if index not in to_delete] pairs: list[IntervalBlockPair] = [] # list of matched depth intervals and text blocks - # groups is of the form: [{"depth_interval": BoundaryInterval, "block": TextBlock}] - if filtered_pairs: # match depth column items with material description - for depth_column, material_description_rect in filtered_pairs: - description_lines = get_description_lines(lines, material_description_rect) + if filtered_pairs: # match sidebars with material description + bounding_boxes = [ + BoundingBoxes.from_material_description_rect_with_sidebar(pair, page_number) for pair in filtered_pairs + ] + for pair in filtered_pairs: + description_lines = get_description_lines(lines, pair.material_description_rect) if len(description_lines) > 1: new_pairs = match_columns( - depth_column, description_lines, geometric_lines, material_description_rect, **params + pair.sidebar, description_lines, geometric_lines, pair.material_description_rect, **params ) pairs.extend(new_pairs) - filtered_depth_material_column_pairs = [ - DepthsMaterialsColumnPairs( - depth_column=depth_column, material_description_rect=material_description_rect, page=page_number - ) - for depth_column, material_description_rect in filtered_pairs - ] else: - filtered_depth_material_column_pairs = [] # Fallback when no depth column was found material_description_rect = find_material_description_column( - lines, depth_column=None, language=language, **params["material_description"] + lines, sidebar=None, language=language, **params["material_description"] ) + bounding_boxes = [] if material_description_rect: + bounding_boxes.append( + BoundingBoxes( + sidebar_bbox=None, + depth_column_entry_bboxes=[], + material_description_bbox=BoundingBox(material_description_rect), + page=page_number, + ) + ) description_lines = 
get_description_lines(lines, material_description_rect) description_blocks = get_description_blocks( description_lines, @@ -151,13 +152,6 @@ def process_page( params["left_line_length_threshold"], ) pairs.extend([IntervalBlockPair(block=block, depth_interval=None) for block in description_blocks]) - filtered_depth_material_column_pairs.extend( - [ - DepthsMaterialsColumnPairs( - depth_column=None, material_description_rect=material_description_rect, page=page_number - ) - ] - ) layer_predictions = [ Layer( @@ -176,61 +170,32 @@ def process_page( rect=pair.block.rect, page=page_number, ), - depth_interval=BoundaryInterval(start=pair.depth_interval.start, end=pair.depth_interval.end) + # TODO LGVISIUM-104 don't automatically convert any interval to an AAboveBInterval + depth_interval=AAboveBInterval(start=pair.depth_interval.start, end=pair.depth_interval.end) if pair.depth_interval else None, ) for pair in pairs ] layer_predictions = [layer for layer in layer_predictions if layer.description_nonempty()] - return ProcessPageResult(layer_predictions, filtered_depth_material_column_pairs) - - -def score_column_match( - depth_column: DepthColumn, material_description_rect: fitz.Rect, all_words: list[TextWord] | None = None -) -> float: - """Scores the match between a depth column and a material description. - - Args: - depth_column (DepthColumn): The depth column. - material_description_rect (fitz.Rect): The material description rectangle. - all_words (list[TextWord] | None, optional): List of the available text words. Defaults to None. - - Returns: - float: The score of the match. 
- """ - rect = depth_column.rect() - top = rect.y0 - bottom = rect.y1 - right = rect.x1 - distance = ( - abs(top - material_description_rect.y0) - + abs(bottom - material_description_rect.y1) - + abs(right - material_description_rect.x0) - ) - - height = bottom - top - - noise_count = depth_column.noise_count(all_words) if all_words else 0 - - return (height - distance) * math.pow(0.8, noise_count) + return ProcessPageResult(layer_predictions, bounding_boxes) def match_columns( - depth_column: DepthColumn | LayerIdentifierColumn, + sidebar: Sidebar, description_lines: list[TextLine], geometric_lines: list[Line], material_description_rect: fitz.Rect, **params: dict, ) -> list[IntervalBlockPair]: - """Match the depth column entries with the description lines. + """Match the layers that can be derived from the sidebar with the description lines. This function identifies groups of depth intervals and text blocks that are likely to match. - Makes a distinction between DepthColumn and LayerIdentifierColumn and obtains the corresponding text blocks - as well as their depth intervals where present. + The actual matching between text blocks and depth intervals is handled by the implementation of the actual Sidebar + instance (e.g. AAboveBSidebar, AToBSidebar). Args: - depth_column (DepthColumn | LayerIdentifierColumn): The depth column. + sidebar (Sidebar): The sidebar. description_lines (list[TextLine]): The description lines. geometric_lines (list[Line]): The geometric lines. material_description_rect (fitz.Rect): The material description rectangle. @@ -239,28 +204,11 @@ def match_columns( Returns: list[IntervalBlockPair]: The matched depth intervals and text blocks.
""" - if isinstance(depth_column, DepthColumn): - return [ - element - for group in depth_column.identify_groups( - description_lines, geometric_lines, material_description_rect, **params - ) - for element in transform_groups(group.depth_intervals, group.blocks, **params) - ] - elif isinstance(depth_column, LayerIdentifierColumn): - blocks = get_description_blocks_from_layer_identifier(depth_column.entries, description_lines) - pairs: list[IntervalBlockPair] = [] - for block in blocks: - depth_interval = find_depth_columns.get_depth_interval_from_textblock(block) - if depth_interval: - pairs.append(IntervalBlockPair(depth_interval=depth_interval, block=block)) - else: - pairs.append(IntervalBlockPair(depth_interval=None, block=block)) - return pairs - else: - raise ValueError( - f"depth_column must be a DepthColumn or a LayerIdentifierColumn object. Got {type(depth_column)}." - ) + return [ + element + for group in sidebar.identify_groups(description_lines, geometric_lines, material_description_rect, **params) + for element in transform_groups(group.depth_intervals, group.blocks, **params) + ] def transform_groups( @@ -280,20 +228,19 @@ def transform_groups( Returns: List[IntervalBlockPair]: Pairing of text blocks and depth intervals. """ - if len(depth_intervals) == 0: - return [] - elif len(depth_intervals) == 1: + if len(depth_intervals) <= 1: concatenated_block = TextBlock( [line for block in blocks for line in block.lines] ) # concatenate all text lines within a block; line separation flag does not matter here. 
- return [IntervalBlockPair(depth_interval=depth_intervals[0], block=concatenated_block)] + depth_interval = depth_intervals[0] if len(depth_intervals) else None + return [IntervalBlockPair(depth_interval=depth_interval, block=concatenated_block)] else: if len(blocks) < len(depth_intervals): blocks = split_blocks_by_textline_length(blocks, target_split_count=len(depth_intervals) - len(blocks)) if len(blocks) > len(depth_intervals): # create additional depth intervals with end & start value None to match the number of blocks - depth_intervals.extend([BoundaryInterval(None, None) for _ in range(len(blocks) - len(depth_intervals))]) + depth_intervals.extend([AAboveBInterval(None, None) for _ in range(len(blocks) - len(depth_intervals))]) return [ IntervalBlockPair(depth_interval=depth_interval, block=block) @@ -382,30 +329,28 @@ def split_blocks_by_textline_length(blocks: list[TextBlock], target_split_count: def find_material_description_column( - lines: list[TextLine], depth_column: DepthColumn | None, language: str, **params: dict + lines: list[TextLine], sidebar: Sidebar | None, language: str, **params: dict ) -> fitz.Rect | None: """Find the material description column given a depth column. Args: lines (list[TextLine]): The text lines of the page. - depth_column (DepthColumn | None): The depth column. + sidebar (Sidebar | None): The sidebar to be associated with the material descriptions. language (str): The language of the page. **params (dict): Additional parameters for the matching pipeline. Returns: fitz.Rect | None: The material description column. 
""" - if depth_column: - above_depth_column = [ - line - for line in lines - if x_overlap(line.rect, depth_column.rect()) and line.rect.y0 < depth_column.rect().y0 + if sidebar: + above_sidebar = [ + line for line in lines if x_overlap(line.rect, sidebar.rect()) and line.rect.y0 < sidebar.rect().y0 ] - min_y0 = max(line.rect.y0 for line in above_depth_column) if above_depth_column else -1 + min_y0 = max(line.rect.y0 for line in above_sidebar) if above_sidebar else -1 def check_y0_condition(y0): - return y0 > min_y0 and y0 < depth_column.rect().y1 + return y0 > min_y0 and y0 < sidebar.rect().y1 else: def check_y0_condition(y0): @@ -492,7 +437,10 @@ def is_below(best_x0, best_y1, line): if len(candidate_rects) == 0: return None - if depth_column: - return max(candidate_rects, key=lambda rect: score_column_match(depth_column, rect)) + if sidebar: + return max( + candidate_rects, + key=lambda rect: MaterialDescriptionRectWithSidebar(sidebar, rect).score_match(), + ) else: return candidate_rects[0] diff --git a/src/stratigraphy/layer/layer.py b/src/stratigraphy/layer/layer.py index ca24797a..a653d31f 100644 --- a/src/stratigraphy/layer/layer.py +++ b/src/stratigraphy/layer/layer.py @@ -1,13 +1,12 @@ """Layer class definition.""" -import uuid -from dataclasses import dataclass, field +from dataclasses import dataclass import fitz from stratigraphy.data_extractor.data_extractor import ExtractedFeature, FeatureOnPage from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry from stratigraphy.text.textblock import MaterialDescription, TextBlock -from stratigraphy.util.interval import AnnotatedInterval, BoundaryInterval, Interval +from stratigraphy.util.interval import AAboveBInterval, Interval from stratigraphy.util.util import parse_text @@ -16,8 +15,7 @@ class Layer(ExtractedFeature): """A class to represent predictions for a single layer.""" material_description: FeatureOnPage[MaterialDescription] - depth_interval: BoundaryInterval | AnnotatedInterval | None 
- id: uuid.UUID = field(default_factory=uuid.uuid4) + depth_interval: AAboveBInterval | None def __str__(self) -> str: """Converts the object to a string. @@ -39,7 +37,6 @@ def to_json(self) -> dict: return { "material_description": self.material_description.to_json() if self.material_description else None, "depth_interval": self.depth_interval.to_json() if self.depth_interval else None, - "id": str(self.id), } @classmethod @@ -58,25 +55,17 @@ def from_json(cls, data: dict) -> "Layer": start_data = depth_interval.get("start") end_data = depth_interval.get("end") start = ( - DepthColumnEntry( - value=start_data["value"], - rect=fitz.Rect(start_data["rect"]), - page_number=start_data["page"], - ) + DepthColumnEntry(value=start_data["value"], rect=fitz.Rect(start_data["rect"])) if start_data is not None else None ) end = ( - DepthColumnEntry( - value=end_data["value"], - rect=fitz.Rect(end_data["rect"]), - page_number=end_data["page"], - ) + DepthColumnEntry(value=end_data["value"], rect=fitz.Rect(end_data["rect"])) if end_data is not None else None ) - depth_interval_prediction = BoundaryInterval(start=start, end=end) + depth_interval_prediction = AAboveBInterval(start=start, end=end) else: depth_interval_prediction = None diff --git a/src/stratigraphy/layer/layer_identifier_column.py b/src/stratigraphy/layer/layer_identifier_column.py deleted file mode 100644 index 518d2aaf..00000000 --- a/src/stratigraphy/layer/layer_identifier_column.py +++ /dev/null @@ -1,246 +0,0 @@ -"""Module for the LayerIdentifierColumn class.""" - -import re - -import fitz -from stratigraphy.lines.line import TextLine - - -class LayerIdentifierEntry: - """Class for a layer identifier entry. - - Note: As of now this is very similar to DepthColumnEntry. Refactoring may be desired. 
- """ - - def __init__(self, rect: fitz.Rect, text: str): - self.rect = rect - self.text = text - - def __repr__(self): - return str(self.text) - - def to_json(self): - """Convert the layer identifier entry to a JSON serializable format. - - Returns: - dict: The JSON serializable format of the layer identifier entry. - """ - return { - "text": self.text, - "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1], - } - - -class LayerIdentifierColumn: - """Class for a layer identifier column.""" - - def __init__(self, entries: list[LayerIdentifierEntry]): - """Initialize the LayerIdentifierColumn object. - - Args: - entries (list[LayerIdentifierEntry]): The entries corresponding to the layer indices. - """ - self.entries: list[LayerIdentifierEntry] = entries - - @property - def max_x0(self) -> float: - """Get the maximum x0 value of the layer identifier column entries. - - Returns: - float: The maximum x0 value of the layer identifier column entries. - """ - return max([rect.x0 for rect in self.rects()]) - - @property - def min_x1(self) -> float: - """Get the minimum x1 value of the layer identifier column entries. - - Returns: - float: The minimum x1 value of the layer identifier column entries. - """ - return min([rect.x1 for rect in self.rects()]) - - def rect(self) -> fitz.Rect: - """Get the rectangle of the layer identifier column. - - Returns: - fitz.Rect: The rectangle of the layer identifier column. - """ - x0 = min([rect.x0 for rect in self.rects()]) - x1 = max([rect.x1 for rect in self.rects()]) - y0 = min([rect.y0 for rect in self.rects()]) - y1 = max([rect.y1 for rect in self.rects()]) - return fitz.Rect(x0, y0, x1, y1) - - def rects(self) -> list[fitz.Rect]: - """Get the rectangles of the layer identifier column entries. - - Returns: - list[fitz.Rect]: The rectangles of the layer identifier column entries. 
- """ - return [entry.rect for entry in self.entries] - - def add_entry(self, entry: LayerIdentifierEntry): - """Add a new layer identifier column entry to the layer identifier column. - - Args: - entry (LayerIdentifierEntry): The layer identifier column entry to be added. - """ - self.entries.append(entry) - - def can_be_appended(self, rect: fitz.Rect) -> bool: - """Checks if a new layer identifier column entry can be appended to the current layer identifier column. - - The checks are: - - The width of the new rectangle is greater than the width of the current layer identifier column. Or; - - The middle of the new rectangle is within the horizontal boundaries of the current layer identifier column. - - The new rectangle intersects with the minimal horizontal boundaries of the current layer identifier column. - - - Args: - rect (fitz.Rect): Rect of the layer identifier column entry to be appended. - - Returns: - bool: True if the new layer identifier column entry can be appended, False otherwise. - """ - new_middle = (rect.x0 + rect.x1) / 2 - if (self.rect().width < rect.width or self.rect().x0 < new_middle < self.rect().x1) and ( - rect.x0 <= self.min_x1 and self.max_x0 <= rect.x1 - ): - return True - return False - - def strictly_contains(self, other: "LayerIdentifierColumn") -> bool: - """Check if the layer identifier column strictly contains another layer identifier column. - - Args: - other (LayerIdentifierColumn): The other layer identifier column to check if it is strictly contained. - - Returns: - bool: True if the layer identifier column strictly contains the other layer identifier column, False - otherwise. - """ - return len(other.entries) < len(self.entries) and all( - other_entry in self.entries for other_entry in other.entries - ) - - def is_contained(self, rect: fitz.Rect) -> bool: - """Check if the layer identifier column is contained in another rectangle. 
- - Args: - rect (fitz.Rect): The rectangle to check if it contains the layer identifier column. - - Returns: - bool: True if the layer identifier column is contained in the rectangle, False otherwise. - """ - return ( - rect.x0 <= self.rect().x0 - and self.rect().x1 <= rect.x1 - and rect.y0 <= self.rect().y0 - and self.rect().y1 <= rect.y1 - ) - - def to_json(self): - """Convert the layer identifier column to a JSON serializable format. - - Returns: - dict: The JSON serializable format of the layer identifier column. - """ - rect = self.rect() - return { - "rect": [rect.x0, rect.y0, rect.x1, rect.y1], - "entries": [entry.to_json() for entry in self.entries], - "type": "LayerIdentifierColumn", - } - - @classmethod - def from_json(cls, data: dict) -> "LayerIdentifierColumn": - """Converts a dictionary to an object. - - Args: - data (dict): A dictionary containing 'entries' list with 'rect' and 'text' fields. - - Raises: - ValueError: If the input dictionary is missing required fields or has invalid data. - - Returns: - LayerIdentifierColumn: The layer identifier column object. - """ - if not isinstance(data, dict) or "entries" not in data: - raise ValueError("Invalid input: data must be a dictionary with 'entries' field") - - return LayerIdentifierColumn( - entries=[ - LayerIdentifierEntry(rect=fitz.Rect(entry["rect"]), text=entry["text"]) for entry in data["entries"] - ] - ) - - -def find_layer_identifier_column_entries(lines: list[TextLine]) -> list[LayerIdentifierEntry]: - r"""Find the layer identifier column entries. - - Regex explanation: - - \b is a word boundary. This ensures that the match must start at the beginning of a word. - - [\da-z]+ matches one or more (+) alphanumeric characters (\d for digits and a-z for lowercase letters). - - \) matches a closing parenthesis. The backslash is necessary because parentheses are special characters - in regular expressions, so we need to escape it to match a literal parenthesis. 
- This regular expression will match strings like "1)", "2)", "a)", "b)", "1a4)", "6de)", etc. - - Args: - lines (list[TextLine]): The lines to search for layer identifier columns. - - Returns: - list[LayerIdentifierEntry]: The layer identifier column entries. - """ - entries = [] - for line in sorted(lines, key=lambda line: line.rect.y0): - if len(line.words) > 0: - # Only match in the first word of every line, to avoid e.g. matching with "cm)" in a material description - # containing an expression like "(diameter max 6 cm)". - first_word = line.words[0] - regex = re.compile(r"\b[\da-z-]+\)") - match = regex.match(first_word.text) - if match and len(first_word.text) < 7: - entries.append(LayerIdentifierEntry(first_word.rect, first_word.text)) - return entries - - -def find_layer_identifier_column(entries: list[LayerIdentifierEntry]) -> list[LayerIdentifierColumn]: - """Find the layer identifier column given the index column entries. - - Note: Similar to find_depth_columns.find_depth_columns. Refactoring may be desired. - - Args: - entries (list[LayerIdentifierEntry]): The layer identifier column entries. - - Returns: - list[LayerIdentifierColumn]: The found layer identifier columns. - """ - layer_identifier_columns = [LayerIdentifierColumn([entries[0]])] - for entry in entries[1:]: - has_match = False - for column in layer_identifier_columns: - if column.can_be_appended(entry.rect): - column.add_entry(entry) - has_match = True - if not has_match: - layer_identifier_columns.append(LayerIdentifierColumn([entry])) - - # only keep columns whose entries are not fully contained in a different column - layer_identifier_columns = [ - column - for column in layer_identifier_columns - if all(not other.strictly_contains(column) for other in layer_identifier_columns) - ] - # check if the column rect is a subset of another column rect. If so, merge the entries and sort them by y0. 
- for column in layer_identifier_columns: - for other in layer_identifier_columns: - if column != other and column.is_contained(other.rect()): - for entry in other.entries: - if entry not in column.entries: - column.entries.append(entry) - column.entries.sort(key=lambda entry: entry.rect.y0) - layer_identifier_columns.remove(other) - break - layer_identifier_columns = [column for column in layer_identifier_columns if len(column.entries) > 2] - return layer_identifier_columns diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index 35a56c10..9b3ba10b 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -230,7 +230,7 @@ def start_pipeline( # Initialize common variables groundwater_entries = GroundwaterInDocument(filename=filename, groundwater=[]) layers_in_document = LayersInDocument([], filename) - depths_materials_columns_pairs = [] + bounding_boxes = [] if part == "all": # Extract the groundwater levels @@ -262,7 +262,7 @@ def start_pipeline( layer_predictions = process_page_results.predictions layers_in_document.layers.extend(layer_predictions) - depths_materials_columns_pairs.extend(process_page_results.depth_material_pairs) + bounding_boxes.extend(process_page_results.bounding_boxes) if draw_lines: # could be changed to if draw_lines and mflow_tracking: if not mlflow_tracking: @@ -282,7 +282,7 @@ def start_pipeline( metadata=metadata, groundwater=groundwater_entries, layers_in_document=layers_in_document, - depths_materials_columns_pairs=depths_materials_columns_pairs, + bounding_boxes=bounding_boxes, ) ) diff --git a/src/stratigraphy/sidebar/__init__.py b/src/stratigraphy/sidebar/__init__.py new file mode 100644 index 00000000..f4a9cf2b --- /dev/null +++ b/src/stratigraphy/sidebar/__init__.py @@ -0,0 +1,21 @@ +"""Modules for Sidebars, representing depths or other data displayed to the side of material descriptions.""" + +from .a_above_b_sidebar import AAboveBSidebar +from .a_above_b_sidebar_extractor import 
AAboveBSidebarExtractor +from .a_above_b_sidebar_validator import AAboveBSidebarValidator +from .a_to_b_sidebar import AToBSidebar +from .a_to_b_sidebar_extractor import AToBSidebarExtractor +from .layer_identifier_sidebar import LayerIdentifierSidebar +from .layer_identifier_sidebar_extractor import LayerIdentifierSidebarExtractor +from .sidebar import Sidebar + +__all__ = [ + "Sidebar", + "AAboveBSidebar", + "AAboveBSidebarExtractor", + "AAboveBSidebarValidator", + "AToBSidebar", + "AToBSidebarExtractor", + "LayerIdentifierSidebar", + "LayerIdentifierSidebarExtractor", +] diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar.py b/src/stratigraphy/sidebar/a_above_b_sidebar.py new file mode 100644 index 00000000..7a13ad52 --- /dev/null +++ b/src/stratigraphy/sidebar/a_above_b_sidebar.py @@ -0,0 +1,219 @@ +"""Module for the AAboveBSidebar, where the depths of layer interfaces are defined above/below each other.""" + +from __future__ import annotations + +from dataclasses import dataclass + +import fitz +import numpy as np + +from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry +from stratigraphy.lines.line import TextLine +from stratigraphy.text.find_description import get_description_blocks +from stratigraphy.util.dataclasses import Line +from stratigraphy.util.interval import AAboveBInterval + +from .interval_block_group import IntervalBlockGroup +from .sidebar import Sidebar + + +@dataclass +class AAboveBSidebar(Sidebar[DepthColumnEntry]): + """Represents a sidebar where the depths of the layer boundaries are displayed in a column, above each other. + + Usually, the vertical position of a depth label on the page is proportional to the depth value. + + Example: + 0m + + 0.2m + + + 0.5m + ... 
+ """ + + entries: list[DepthColumnEntry] + + def __repr__(self): + return "AAboveBSidebar({})".format(", ".join([str(entry) for entry in self.entries])) + + def valid_initial_segment(self, rect: fitz.Rect) -> AAboveBSidebar: + for i in range(len(self.entries) - 1): + initial_segment = AAboveBSidebar(self.entries[: -i - 1]) + if initial_segment.can_be_appended(rect): + return initial_segment + return AAboveBSidebar(entries=[]) + + def strictly_contains(self, other: AAboveBSidebar) -> bool: + return len(other.entries) < len(self.entries) and all( + other_entry in self.entries for other_entry in other.entries + ) + + def is_strictly_increasing(self) -> bool: + return all(i.value < j.value for i, j in zip(self.entries, self.entries[1:], strict=False)) + + def depth_intervals(self) -> list[AAboveBInterval]: + """Creates a list of depth intervals from the depth column entries. + + The first depth interval has an open start value (i.e. None). + + Returns: + list[AAboveBInterval]: A list of depth intervals. + """ + depth_intervals = [AAboveBInterval(None, self.entries[0])] + for i in range(len(self.entries) - 1): + depth_intervals.append(AAboveBInterval(self.entries[i], self.entries[i + 1])) + depth_intervals.append( + AAboveBInterval(self.entries[len(self.entries) - 1], None) + ) # even though no open ended intervals are allowed, they are still useful for matching, + # especially for documents where the material description rectangle is too tall + # (and includes additional lines below the actual material descriptions). 
+ return depth_intervals + + def significant_arithmetic_progression(self) -> bool: + # to allow for OCR errors or gaps in the progression, we only require a segment of length 6 that is an + # arithmetic progression + segment_length = 6 + if len(self.entries) < segment_length: + return self.is_arithmetic_progression() + else: + for i in range(len(self.entries) - segment_length + 1): + if AAboveBSidebar(self.entries[i : i + segment_length]).is_arithmetic_progression(): + return True + return False + + def is_arithmetic_progression(self) -> bool: + if len(self.entries) <= 2: + return True + + progression = np.array(range(len(self.entries))) + entries = np.array([entry.value for entry in self.entries]) + + # Avoid warnings in the np.corrcoef call, as the correlation coef is undefined if the standard deviation is 0. + if np.std(entries) == 0: + return False + + scale_pearson_correlation_coef = np.corrcoef(entries, progression)[0, 1].item() + return abs(scale_pearson_correlation_coef) >= 0.9999 + + def pearson_correlation_coef(self) -> float: + # We look at the lower y coordinate, because most often the baseline of the depth value text is aligned with + # the line of the corresponding layer boundary. + positions = np.array([entry.rect.y1 for entry in self.entries]) + entries = np.array([entry.value for entry in self.entries]) + + # Avoid warnings in the np.corrcoef call, as the correlation coef is undefined if the standard deviation is 0. 
+ if np.std(entries) == 0 or np.std(positions) == 0: + return 0 + + return np.corrcoef(positions, entries)[0, 1].item() + + def remove_entry_by_correlation_gradient(self) -> AAboveBSidebar | None: + if len(self.entries) < 3: + return None + + new_columns = [ + AAboveBSidebar([entry for index, entry in enumerate(self.entries) if index != remove_index]) + for remove_index in range(len(self.entries)) + ] + return max(new_columns, key=lambda column: column.pearson_correlation_coef()) + + def break_on_double_descending(self) -> list[AAboveBSidebar]: + segments = [] + segment_start = 0 + for index, current_entry in enumerate(self.entries): + if ( + index >= 2 + and index + 1 < len(self.entries) + and current_entry.value < self.entries[index - 2].value + and current_entry.value < self.entries[index - 1].value + and self.entries[index + 1].value < self.entries[index - 2].value + and self.entries[index + 1].value < self.entries[index - 1].value + ): + # big big || small small + segments.append(self.entries[segment_start:index]) + segment_start = index + + final_segment = self.entries[segment_start:] + if final_segment: + segments.append(final_segment) + + return [AAboveBSidebar(segment) for segment in segments] + + def identify_groups( + self, + description_lines: list[TextLine], + geometric_lines: list[Line], + material_description_rect: fitz.Rect, + **params, + ) -> list[IntervalBlockGroup]: + """Identifies groups of description blocks that correspond to depth intervals. + + Note: includes a heuristic of whether there should be a group corresponding to a final depth interval + starting from the last depth entry without any end value. + + Args: + description_lines (list[TextLine]): A list of text lines that are part of the description. + geometric_lines (list[Line]): A list of geometric lines that are part of the description. + material_description_rect (fitz.Rect): The bounding box of the material description. + params (dict): A dictionary of relevant parameters. 
+ + Returns: + list[IntervalBlockGroup]: A list of groups, where each group is a IntervalBlockGroup. + + Example return value: + [ + IntervalBlockGroup( + depth_intervals=[AAboveBInterval(None, 0.1), AAboveBInterval(0.1, 0.3), ...], + blocks=[TextBlock(...), TextBlock(...), ...] + ), + IntervalBlockGroup( + depth_intervals=[AAboveBInterval(0.3, 0.7)], + blocks=[TextBlock(...), TextBlock(...), ...] + ), + ... + ] + """ + depth_intervals = self.depth_intervals() + + groups = [] + + current_intervals = [] + current_blocks = [] + all_blocks = get_description_blocks( + description_lines, + geometric_lines, + material_description_rect, + params["block_line_ratio"], + left_line_length_threshold=params["left_line_length_threshold"], + target_layer_count=len(depth_intervals), + ) + + block_index = 0 + + for interval in depth_intervals: + # don't allow a layer above depth 0 + if interval.start is None and interval.end.value == 0: + continue + + pre, exact, post = interval.matching_blocks(all_blocks, block_index) + block_index += len(pre) + len(exact) + len(post) + + current_blocks.extend(pre) + if len(exact): + if len(current_intervals) > 0 or len(current_blocks) > 0: + groups.append(IntervalBlockGroup(depth_intervals=current_intervals, blocks=current_blocks)) + groups.append(IntervalBlockGroup(depth_intervals=[interval], blocks=exact)) + current_blocks = post + current_intervals = [] + else: + # The final open-ended interval should not be added, since borehole profiles do typically not come + # with open-ended intervals. 
+ if interval.end is not None: + current_intervals.append(interval) + + if len(current_intervals) > 0 or len(current_blocks) > 0: + groups.append(IntervalBlockGroup(depth_intervals=current_intervals, blocks=current_blocks)) + + return groups diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py b/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py new file mode 100644 index 00000000..a8391e02 --- /dev/null +++ b/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py @@ -0,0 +1,74 @@ +"""Module for finding AAboveBSidebar instances in a borehole profile.""" + +import fitz + +from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry +from stratigraphy.lines.line import TextWord +from stratigraphy.sidebar.a_above_b_sidebar import AAboveBSidebar +from stratigraphy.sidebar.a_above_b_sidebar_validator import AAboveBSidebarValidator + + +class AAboveBSidebarExtractor: + """Class that finds AAboveBSidebar instances in a borehole profile.""" + + @staticmethod + def find_in_words( + all_words: list[TextWord], used_entry_rects: list[fitz.Rect], sidebar_params: dict + ) -> list[AAboveBSidebar]: + """Construct all possible AAboveBSidebar objects from the given words. + + Args: + all_words (list[TextWord]): All words in the page. + used_entry_rects (list[fitz.Rect]): Part of the document to ignore. + sidebar_params (dict): Parameters for the AAboveBSidebar objects. + + Returns: + list[AAboveBSidebar]: Found AAboveBSidebar objects.
+ """ + entries = [ + entry + for entry in DepthColumnEntry.find_in_words(all_words, include_splits=False) + if entry.rect not in used_entry_rects + ] + + numeric_columns: list[AAboveBSidebar] = [] + for entry in entries: + has_match = False + additional_columns = [] + for column in numeric_columns: + if column.can_be_appended(entry.rect): + has_match = True + column.entries.append(entry) + else: + valid_initial_segment = column.valid_initial_segment(entry.rect) + if len(valid_initial_segment.entries) > 0: + has_match = True + valid_initial_segment.entries.append(entry) + additional_columns.append(valid_initial_segment) + + numeric_columns.extend(additional_columns) + if not has_match: + numeric_columns.append(AAboveBSidebar(entries=[entry])) + + # only keep columns that are not contained in a different column + numeric_columns = [ + column + for column in numeric_columns + if all(not other.strictly_contains(column) for other in numeric_columns) + ] + + sidebar_validator = AAboveBSidebarValidator(all_words, **sidebar_params) + + numeric_columns = [ + sidebar_validator.reduce_until_valid(column) + for numeric_column in numeric_columns + for column in numeric_column.break_on_double_descending() + # when we have a perfect arithmetic progression, this is usually just a scale + # that does not match the descriptions + if not column.significant_arithmetic_progression() + ] + + return sorted( + [column for column in numeric_columns if column and sidebar_validator.is_valid(column)], + key=lambda column: len(column.entries), + ) diff --git a/src/stratigraphy/depthcolumn/boundarydepthcolumnvalidator.py b/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py similarity index 71% rename from src/stratigraphy/depthcolumn/boundarydepthcolumnvalidator.py rename to src/stratigraphy/sidebar/a_above_b_sidebar_validator.py index 477007fa..3dbd012b 100644 --- a/src/stratigraphy/depthcolumn/boundarydepthcolumnvalidator.py +++ b/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py 
@@ -1,32 +1,33 @@ -"""This module contains logic to validate BoundaryDepthColumn instances.""" +"""This module contains logic to validate AAboveBSidebar instances.""" import dataclasses -from stratigraphy.depthcolumn.depthcolumn import BoundaryDepthColumn from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry from stratigraphy.lines.line import TextWord +from .a_above_b_sidebar import AAboveBSidebar + @dataclasses.dataclass -class BoundaryDepthColumnValidator: - """Validation logic for instances of the BoundaryDepthColumn class. +class AAboveBSidebarValidator: + """Validation logic for instances of the AAboveBSidebar class. Args: all_words (list[TextLine]): A list of all text lines on the page. - noise_count_threshold (float): Noise count threshold deciding how much noise is allowed in a column + noise_count_threshold (float): Noise count threshold deciding how much noise is allowed in a sidebar to be valid. noise_count_offset (int): Offset for the noise count threshold. Affects the noise count criterion. - Effective specifically for depth columns with very few entries. + Effective specifically for sidebars with very few entries. """ all_words: list[TextWord] noise_count_threshold: float noise_count_offset: int - def is_valid(self, column: BoundaryDepthColumn, corr_coef_threshold: float = 0.99) -> bool: - """Checks whether the depth column is valid. + def is_valid(self, sidebar: AAboveBSidebar, corr_coef_threshold: float = 0.99) -> bool: + """Checks whether the sidebar is valid. - The depth column is considered valid if: + The sidebar is considered valid if: - The number of entries is at least 3. - The number of words that intersect with the depth column entries is less than the noise count threshold time the number of entries minus the noise count offset. @@ -37,13 +38,13 @@ def is_valid(self, column: BoundaryDepthColumn, corr_coef_threshold: float = 0.9 even though they are. Args: - column (BoundaryDepthColumn): The depth column to validate. 
+ sidebar (AAboveBSidebar): The AAboveBSidebar to validate. corr_coef_threshold (float): The minimal correlation coefficient for the column to be deemed valid. Returns: bool: True if the depth column is valid, False otherwise. """ - if len(column.entries) < 3: + if len(sidebar.entries) < 3: return False # When too much other text is in the column, then it is probably not valid. @@ -51,39 +52,38 @@ def is_valid(self, column: BoundaryDepthColumn, corr_coef_threshold: float = 0.9 # than columns with more entries. The more entries we have, the less likely it is that we found them by chance. # TODO: Once evaluation data is of good enough qualities, we should optimize for the parameter below. if ( - column.noise_count(self.all_words) - > self.noise_count_threshold * (len(column.entries) - self.noise_count_offset) ** 2 + sidebar.noise_count(self.all_words) + > self.noise_count_threshold * (len(sidebar.entries) - self.noise_count_offset) ** 2 ): return False # Check if the entries are strictly increasing. - if not column.is_strictly_increasing(): + if not sidebar.is_strictly_increasing(): return False - corr_coef = column.pearson_correlation_coef() + corr_coef = sidebar.pearson_correlation_coef() return corr_coef and corr_coef > corr_coef_threshold - def reduce_until_valid(self, column: BoundaryDepthColumn, page_number: int) -> BoundaryDepthColumn: + def reduce_until_valid(self, column: AAboveBSidebar) -> AAboveBSidebar: """Removes entries from the depth column until it fulfills the is_valid condition. is_valid checks whether there is too much noise (i.e. other text) in the column and whether the entries are linearly correlated with their vertical position. Args: - column (BoundaryDepthColumn): The depth column to validate - page_number (int): The page number of the depth column + column (AAboveBSidebar): The depth column to validate Returns: - BoundaryDepthColumn: The current depth column with entries removed until it is valid. 
+ AAboveBSidebar: The current depth column with entries removed until it is valid. """ while column: if self.is_valid(column): return column - elif self.correct_OCR_mistakes(column, page_number) is not None: - return self.correct_OCR_mistakes(column, page_number) + elif self.correct_OCR_mistakes(column) is not None: + return self.correct_OCR_mistakes(column) else: column = column.remove_entry_by_correlation_gradient() - def correct_OCR_mistakes(self, column: BoundaryDepthColumn, page_number: int) -> BoundaryDepthColumn | None: + def correct_OCR_mistakes(self, sidebar: AAboveBSidebar) -> AAboveBSidebar | None: """Corrects OCR mistakes in the depth column entries. Loops through all values and corrects common OCR mistakes for the given entry. Then, the column with the @@ -101,16 +101,15 @@ def correct_OCR_mistakes(self, column: BoundaryDepthColumn, page_number: int) -> Note: Common mistakes should be extended as needed. Args: - column (BoundaryDepthColumn): The depth column to validate - page_number (int): The page number of the depth column + sidebar (AAboveBSidebar): The AAboveBSidebar to validate Returns: - BoundaryDepthColumn | None: The corrected depth column, or None if no correction was possible. + AAboveBSidebar | None: The corrected sidebar, or None if no correction was possible. 
""" - new_columns = [BoundaryDepthColumn()] - for entry in column.entries: + new_columns = [AAboveBSidebar(entries=[])] + for entry in sidebar.entries: new_columns = [ - BoundaryDepthColumn([*column.entries, DepthColumnEntry(entry.rect, new_value, page_number)]) + AAboveBSidebar([*column.entries, DepthColumnEntry(entry.rect, new_value)]) for column in new_columns for new_value in _value_alternatives(entry.value) ] diff --git a/src/stratigraphy/sidebar/a_to_b_sidebar.py b/src/stratigraphy/sidebar/a_to_b_sidebar.py new file mode 100644 index 00000000..d1e1fe19 --- /dev/null +++ b/src/stratigraphy/sidebar/a_to_b_sidebar.py @@ -0,0 +1,114 @@ +"""Module for the AToBSidebar, which contains depth intervals defined like "0.2m - 1.3m".""" + +from __future__ import annotations + +from dataclasses import dataclass + +import fitz + +from stratigraphy.depthcolumn.depthcolumnentry import AToBDepthColumnEntry +from stratigraphy.lines.line import TextLine +from stratigraphy.util.dataclasses import Line +from stratigraphy.util.interval import AToBInterval + +from .interval_block_group import IntervalBlockGroup +from .sidebar import Sidebar + + +@dataclass +class AToBSidebar(Sidebar[AToBDepthColumnEntry]): + """Represents a sidebar where the upper and lower depths of each layer are explicitly specified. + + Example:: + 0 - 0.1m: xxx + 0.1 - 0.3m: yyy + 0.3 - 0.8m: zzz + ... + """ + + entries: list[AToBDepthColumnEntry] + + def __repr__(self): + """Converts the object to a string. + + Returns: + str: The object as a string. + """ + return "AToBSidebar({})".format(", ".join([str(entry) for entry in self.entries])) + + def depth_intervals(self) -> list[AToBInterval]: + return [AToBInterval(entry) for entry in self.entries] + + def break_on_mismatch(self) -> list[AToBSidebar]: + """Breaks the sidebar into segments where the depths are not in an arithmetic progression. + + Returns: + list[AToBSidebar]: A list of depth column segments. 
+ """ + segments = [] + segment_start = 0 + for index, current_entry in enumerate(self.entries): + if index >= 1 and current_entry.start.value < self.entries[index - 1].end.value: + # (_, big) || (small, _) + segments.append(self.entries[segment_start:index]) + segment_start = index + + final_segment = self.entries[segment_start:] + if final_segment: + segments.append(final_segment) + + return [AToBSidebar(segment) for segment in segments] + + def is_valid(self) -> bool: + """Checks if the sidebar is valid. + + An AToBSidebar is valid if it is strictly increasing and the depth intervals are significant. + + Returns: + bool: True if the depth column is valid, False otherwise. + """ + if len(self.entries) <= 2: + return False + + # At least half of the "end" values must match the subsequent "start" value (e.g. 2-5m, 5-9m). + sequence_matches_count = 0 + for index, entry in enumerate(self.entries): + if index >= 1 and self.entries[index - 1].end.value == entry.start.value: + sequence_matches_count += 1 + + return sequence_matches_count / (len(self.entries) - 1) > 0.5 + + def identify_groups( + self, + description_lines: list[TextLine], + geometric_lines: list[Line], + material_description_rect: fitz.Rect, + **params, + ) -> list[IntervalBlockGroup]: + """Identifies groups of description blocks that correspond to depth intervals. + + Args: + description_lines (list[TextLine]): A list of text lines that are part of the description. + geometric_lines (list[Line]): A list of geometric lines that are part of the description. + material_description_rect (fitz.Rect): The bounding box of the material description. + params (dict): A dictionary of relevant parameters. + + Returns: + list[IntervalBlockGroup]: A list of groups, where each group is a IntervalBlockGroup. 
+ """ + depth_intervals = self.depth_intervals() + + groups = [] + line_index = 0 + + for interval_index, interval in enumerate(depth_intervals): + # don't allow a layer above depth 0 + if interval.start is None and interval.end.value == 0: + continue + + next_interval = depth_intervals[interval_index + 1] if interval_index + 1 < len(depth_intervals) else None + + matched_blocks = interval.matching_blocks(description_lines, line_index, next_interval) + line_index += sum([len(block.lines) for block in matched_blocks]) + groups.append(IntervalBlockGroup(depth_intervals=[interval], blocks=matched_blocks)) + return groups diff --git a/src/stratigraphy/sidebar/a_to_b_sidebar_extractor.py b/src/stratigraphy/sidebar/a_to_b_sidebar_extractor.py new file mode 100644 index 00000000..2751e4d7 --- /dev/null +++ b/src/stratigraphy/sidebar/a_to_b_sidebar_extractor.py @@ -0,0 +1,77 @@ +"""Module for finding AToBSidebar instances in a borehole profile.""" + +import re + +from stratigraphy.depthcolumn.depthcolumnentry import AToBDepthColumnEntry, DepthColumnEntry +from stratigraphy.lines.line import TextWord +from stratigraphy.sidebar import AToBSidebar + + +class AToBSidebarExtractor: + """Class that finds AToBSidebar instances in a borehole profile.""" + + @staticmethod + def find_in_words(all_words: list[TextWord]) -> list[AToBSidebar]: + """Finds all AToBSidebars. + + Generates a list of AToBDepthColumnEntry objects by finding consecutive pairs of DepthColumnEntry objects. + Different columns are grouped together in LayerDepthColumn objects. Finally, a list of AToBSidebars objects, + one for each column, is returned. + + A layer corresponds to a material layer. The layer is defined using a start and end point (e.g. 1.10-1.60m). + The start and end points are represented as DepthColumnEntry objects. + + Args: + all_words (list[TextWord]): List of all TextWord objects. + + Returns: + list[AToBSidebar]: List of all AToBSidebars identified. 
+ """ + entries = DepthColumnEntry.find_in_words(all_words, include_splits=True) + + def find_pair(entry: DepthColumnEntry) -> DepthColumnEntry | None: # noqa: D103 + min_y0 = entry.rect.y0 - entry.rect.height / 2 + max_y0 = entry.rect.y0 + entry.rect.height / 2 + for other in entries: + if entry == other: + continue + if other.value <= entry.value: + continue + combined_width = entry.rect.width + other.rect.width + if not entry.rect.x0 <= other.rect.x0 <= entry.rect.x0 + combined_width: + continue + if not min_y0 <= other.rect.y0 <= max_y0: + continue + in_between_text = " ".join( + [ + word.text + for word in all_words + if entry.rect.x0 < word.rect.x0 < other.rect.x0 and min_y0 <= word.rect.y0 <= max_y0 + ] + ) + if re.fullmatch(r"\W*m?\W*", in_between_text): + return other + + pairs = [(entry, find_pair(entry)) for entry in entries] + + sidebars = [] + for first, second in pairs: + if second is not None: + entry = AToBDepthColumnEntry(first, second) + is_matched = False + for sidebar in sidebars: + column_rect = sidebar.rect() + new_start_middle = (entry.start.rect.x0 + entry.start.rect.x1) / 2 + if column_rect.x0 < new_start_middle < column_rect.x1: + is_matched = True + sidebar.entries.append(entry) + + if not is_matched: + sidebars.append(AToBSidebar([entry])) + + return [ + sidebar_segment + for sidebar in sidebars + for sidebar_segment in sidebar.break_on_mismatch() + if sidebar_segment.is_valid() + ] diff --git a/src/stratigraphy/sidebar/interval_block_group.py b/src/stratigraphy/sidebar/interval_block_group.py new file mode 100644 index 00000000..6911519c --- /dev/null +++ b/src/stratigraphy/sidebar/interval_block_group.py @@ -0,0 +1,18 @@ +"""Module that contains a helper class for associating depth intervals and text blocks.""" + +from dataclasses import dataclass + +from stratigraphy.text.textblock import TextBlock +from stratigraphy.util.interval import Interval + + +@dataclass +class IntervalBlockGroup: + """Helper class to represent a group of 
depth intervals and an associated group of text blocks. + + The class is used to simplify the code for obtaining an appropriate one-to-one correspondence between depth + intervals and material descriptions. + """ + + depth_intervals: list[Interval] + blocks: list[TextBlock] diff --git a/src/stratigraphy/sidebar/layer_identifier_sidebar.py b/src/stratigraphy/sidebar/layer_identifier_sidebar.py new file mode 100644 index 00000000..e2467cf2 --- /dev/null +++ b/src/stratigraphy/sidebar/layer_identifier_sidebar.py @@ -0,0 +1,140 @@ +"""Module for the layer identifier sidebars.""" + +from dataclasses import dataclass + +import fitz + +from stratigraphy.lines.line import TextLine +from stratigraphy.text.textblock import TextBlock +from stratigraphy.util.dataclasses import Line + +from ..util.interval import AToBInterval +from .interval_block_group import IntervalBlockGroup +from .sidebar import Sidebar + + +class LayerIdentifierEntry: + """Class for a layer identifier entry. + + Note: As of now this is very similar to DepthColumnEntry. Refactoring may be desired. + """ + + def __init__(self, rect: fitz.Rect, text: str): + self.rect = rect + self.text = text + + def __repr__(self): + return str(self.text) + + +@dataclass +class LayerIdentifierSidebar(Sidebar[LayerIdentifierEntry]): + """Class for a layer identifier sidebar. + + Layer identifiers are labels that are particularly common in Deriaz layout borehole profiles. They can be + sequential such as in 1007.pdf - a), b), c), etc. - or contain some semantic meaning such as in 10781.pdf - + 5c12), 4a), etc. + """ + + entries: list[LayerIdentifierEntry] + + def identify_groups( + self, + description_lines: list[TextLine], + geometric_lines: list[Line], + material_description_rect: fitz.Rect, + **params, + ) -> list[IntervalBlockGroup]: + """Divide the description lines into blocks based on the layer identifier entries. + + Args: + description_lines (list[TextLine]): A list of text lines that are part of the description. 
+ geometric_lines (list[Line]): A list of geometric lines that are part of the description. + material_description_rect (fitz.Rect): The bounding box of the material description. + params (dict): A dictionary of relevant parameters. + + Returns: + list[IntervalBlockGroup]: A list of groups, where each group is a IntervalBlockGroup. + """ + blocks = [] + line_index = 0 + for layer_identifier_idx, _layer_index in enumerate(self.entries): + next_layer_identifier = ( + self.entries[layer_identifier_idx + 1] if layer_identifier_idx + 1 < len(self.entries) else None + ) + + matched_block = self.matching_blocks(description_lines, line_index, next_layer_identifier) + line_index += sum([len(block.lines) for block in matched_block]) + blocks.extend(matched_block) + + result = [] + for block in blocks: + depth_intervals = [] + depth_interval = AToBInterval.get_depth_interval_from_lines(block.lines) + if depth_interval: + depth_intervals.append(depth_interval) + result.append(IntervalBlockGroup(depth_intervals=depth_intervals, blocks=[block])) + + return result + + @staticmethod + def matching_blocks( + all_lines: list[TextLine], line_index: int, next_layer_identifier: LayerIdentifierEntry | None + ) -> list[TextBlock]: + """Adds lines to a block until the next layer identifier is reached. + + Args: + all_lines (list[TextLine]): All TextLine objects constituting the material description. + line_index (int): The index of the last line that is already assigned to a block. + next_layer_identifier (TextLine | None): The next layer identifier. + + Returns: + list[TextBlock]: The next block or an empty list if no lines are added. 
+ """ + y1_threshold = None + if next_layer_identifier: + next_interval_start_rect = next_layer_identifier.rect + y1_threshold = next_interval_start_rect.y0 + next_interval_start_rect.height / 2 + + matched_lines = [] + + for current_line in all_lines[line_index:]: + if y1_threshold is None or current_line.rect.y1 < y1_threshold: + matched_lines.append(current_line) + else: + break + + if matched_lines: + return [TextBlock(matched_lines)] + else: + return [] + + def strictly_contains(self, other: "LayerIdentifierSidebar") -> bool: + """Check if the layer identifier column strictly contains another layer identifier column. + + Args: + other (LayerIdentifierSidebar): The other layer identifier column to check if it is strictly contained. + + Returns: + bool: True if the layer identifier column strictly contains the other layer identifier column, False + otherwise. + """ + return len(other.entries) < len(self.entries) and all( + other_entry in self.entries for other_entry in other.entries + ) + + def is_contained(self, rect: fitz.Rect) -> bool: + """Check if the layer identifier column is contained in another rectangle. + + Args: + rect (fitz.Rect): The rectangle to check if it contains the layer identifier column. + + Returns: + bool: True if the layer identifier column is contained in the rectangle, False otherwise. 
+ """ + return ( + rect.x0 <= self.rect().x0 + and self.rect().x1 <= rect.x1 + and rect.y0 <= self.rect().y0 + and self.rect().y1 <= rect.y1 + ) diff --git a/src/stratigraphy/sidebar/layer_identifier_sidebar_extractor.py b/src/stratigraphy/sidebar/layer_identifier_sidebar_extractor.py new file mode 100644 index 00000000..d880b46c --- /dev/null +++ b/src/stratigraphy/sidebar/layer_identifier_sidebar_extractor.py @@ -0,0 +1,86 @@ +"""Module for finding LayerIdentifierSidebar instances in a borehole profile.""" + +import re + +from stratigraphy.lines.line import TextLine +from stratigraphy.sidebar import LayerIdentifierSidebar +from stratigraphy.sidebar.layer_identifier_sidebar import LayerIdentifierEntry + + +class LayerIdentifierSidebarExtractor: + """Class that finds LayerIdentifierSidebar instances in a borehole profile.""" + + @classmethod + def find_layer_identifier_sidebar_entries(cls, lines: list[TextLine]) -> list[LayerIdentifierEntry]: + r"""Find the layer identifier sidebar entries. + + Regex explanation: + - \b is a word boundary. This ensures that the match must start at the beginning of a word. + - [\da-z]+ matches one or more (+) alphanumeric characters (\d for digits and a-z for lowercase letters). + - \) matches a closing parenthesis. The backslash is necessary because parentheses are special characters + in regular expressions, so we need to escape it to match a literal parenthesis. + This regular expression will match strings like "1)", "2)", "a)", "b)", "1a4)", "6de)", etc. + + Args: + lines (list[TextLine]): The lines to search for layer identifier entries. + + Returns: + list[LayerIdentifierEntry]: The layer identifier sidebar entries. + """ + entries = [] + for line in sorted(lines, key=lambda line: line.rect.y0): + if len(line.words) > 0: + # Only match in the first word of every line, to avoid e.g. matching with "cm)" in a material + # description containing an expression like "(diameter max 6 cm)". 
+ first_word = line.words[0] + regex = re.compile(r"\b[\da-z-]+\)") + match = regex.match(first_word.text) + if match and len(first_word.text) < 7: + entries.append(LayerIdentifierEntry(first_word.rect, first_word.text)) + return entries + + @classmethod + def from_lines(cls, lines: list[TextLine]) -> list[LayerIdentifierSidebar]: + """Find layer identifier sidebars from text lines. + + TODO: Similar to AToBSidebarExtractor.find_in_words(). Refactoring may be desired. + + Args: + lines (list[TextLine]): The text lines in the document + + Returns: + list[LayerIdentifierSidebar]: The found layer identifier sidebar. + """ + entries = cls.find_layer_identifier_sidebar_entries(lines) + if not entries: + return [] + + layer_identifier_sidebars = [LayerIdentifierSidebar([entries[0]])] + for entry in entries[1:]: + has_match = False + for column in layer_identifier_sidebars: + if column.can_be_appended(entry.rect): + column.entries.append(entry) + has_match = True + if not has_match: + layer_identifier_sidebars.append(LayerIdentifierSidebar([entry])) + + # only keep columns whose entries are not fully contained in a different column + layer_identifier_sidebars = [ + column + for column in layer_identifier_sidebars + if all(not other.strictly_contains(column) for other in layer_identifier_sidebars) + ] + # check if the column rect is a subset of another column rect. If so, merge the entries and sort them by + # y0. 
+ for column in layer_identifier_sidebars: + for other in layer_identifier_sidebars: + if column != other and column.is_contained(other.rect()): + for entry in other.entries: + if entry not in column.entries: + column.entries.append(entry) + column.entries.sort(key=lambda entry: entry.rect.y0) + layer_identifier_sidebars.remove(other) + break + layer_identifier_sidebars = [column for column in layer_identifier_sidebars if len(column.entries) > 2] + return layer_identifier_sidebars diff --git a/src/stratigraphy/sidebar/sidebar.py b/src/stratigraphy/sidebar/sidebar.py new file mode 100644 index 00000000..b53781a4 --- /dev/null +++ b/src/stratigraphy/sidebar/sidebar.py @@ -0,0 +1,108 @@ +"""This module contains the Sidebar class, used to represent a depth column (or similar) of a borehole profile.""" + +from __future__ import annotations + +import abc +from dataclasses import dataclass +from typing import Generic, TypeVar + +import fitz + +from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry +from stratigraphy.lines.line import TextLine, TextWord +from stratigraphy.sidebar.interval_block_group import IntervalBlockGroup +from stratigraphy.util.dataclasses import Line + +EntryT = TypeVar("EntryT", bound=DepthColumnEntry) + + +@dataclass +class Sidebar(abc.ABC, Generic[EntryT]): + """Abstract Sidebar class, representing depths or other data displayed to the side of material descriptions.""" + + entries: list[EntryT] + + def rects(self) -> list[fitz.Rect]: + """Get the rectangles of the depth column entries.""" + return [entry.rect for entry in self.entries] + + def rect(self) -> fitz.Rect: + """Get the bounding box of the depth column entries.""" + x0 = min([rect.x0 for rect in self.rects()]) + x1 = max([rect.x1 for rect in self.rects()]) + y0 = min([rect.y0 for rect in self.rects()]) + y1 = max([rect.y1 for rect in self.rects()]) + return fitz.Rect(x0, y0, x1, y1) + + @property + def max_x0(self) -> float: + """Get the maximum x0 value of the depth 
column entries.""" + return max([rect.x0 for rect in self.rects()]) + + @property + def min_x1(self) -> float: + """Get the minimum x1 value of the depth column entries.""" + return min([rect.x1 for rect in self.rects()]) + + def noise_count(self, all_words: list[TextWord]) -> int: + """Counts the number of words that intersect with the depth column entries. + + Returns the number of words that intersect with the depth column entries, but are not part of the depth column. + + Args: + all_words (list[TextWord]): A list of all text lines on the page. + + Returns: + int: The number of words that intersect with the depth column entries but are not part of it. + """ + + def significant_intersection(other_rect): + intersection = fitz.Rect(other_rect).intersect(self.rect()) + return intersection.is_valid and intersection.width > 0.25 * self.rect().width + + return len([word for word in all_words if significant_intersection(word.rect)]) - len(self.entries) + + @abc.abstractmethod + def identify_groups( + self, + description_lines: list[TextLine], + geometric_lines: list[Line], + material_description_rect: fitz.Rect, + **params, + ) -> list[IntervalBlockGroup]: + """Identifies groups of description blocks that correspond to depth intervals. + + Args: + description_lines (list[TextLine]): A list of text lines that are part of the description. + geometric_lines (list[Line]): A list of geometric lines that are part of the description. + material_description_rect (fitz.Rect): The bounding box of the material description. + params (dict): A dictionary of relevant parameters. + + Returns: + list[IntervalBlockGroup]: A list of groups, where each group is a IntervalBlockGroup. + """ + pass + + def can_be_appended(self, rect: fitz.Rect) -> bool: + """Checks if a new depth column entry can be appended to the current depth column. 
+ + Check if the middle of the new rect is between the outer horizontal boundaries of the column, and if there is + an intersection with the minimal horizontal boundaries of the column. + + The checks are: + - The width of the new rectangle is greater than the width of the current depth column. Or; + - The middle of the new rectangle is within the horizontal boundaries of the current depth column. + - The new rectangle intersects with the minimal horizontal boundaries of the current depth column. + + Args: + rect (fitz.Rect): Rect of the depth column entry to be appended. + + Returns: + bool: True if the new depth column entry can be appended, False otherwise. + """ + new_middle = (rect.x0 + rect.x1) / 2 + if (self.rect().width < rect.width or self.rect().x0 < new_middle < self.rect().x1) and ( + rect.x0 <= self.min_x1 and self.max_x0 <= rect.x1 + ): + return True + return False diff --git a/src/stratigraphy/text/find_description.py b/src/stratigraphy/text/find_description.py index 48902ab4..57b3114f 100644 --- a/src/stratigraphy/text/find_description.py +++ b/src/stratigraphy/text/find_description.py @@ -1,7 +1,6 @@ """This module contains functions to find the description (blocks) of a material in a pdf page.""" import fitz -from stratigraphy.layer.layer_identifier_column import LayerIdentifierEntry from stratigraphy.lines.line import TextLine from stratigraphy.text.description_block_splitter import ( SplitDescriptionBlockByLeftHandSideSeparator, @@ -35,66 +34,6 @@ def get_description_lines(lines: list[TextLine], material_description_rect: fitz return sorted([line for line in filtered_lines if line], key=lambda line: line.rect.y0) -def get_description_blocks_from_layer_identifier( - layer_identifier_entries: list[LayerIdentifierEntry], description_lines: list[TextLine] -) -> list[TextBlock]: - """Divide the description lines into blocks based on the layer identifier entries. 
- - Args: - layer_identifier_entries (list[LayerIdentifierEntry]): The layer identifier entries. - description_lines (list[TextLine]): All lines constituting the material description. - - Returns: - list[TextBlock]: The blocks of the material description. - """ - blocks = [] - line_index = 0 - for layer_identifier_idx, _layer_index in enumerate(layer_identifier_entries): - next_layer_identifier = ( - layer_identifier_entries[layer_identifier_idx + 1] - if layer_identifier_idx + 1 < len(layer_identifier_entries) - else None - ) - - matched_block = matching_blocks(description_lines, line_index, next_layer_identifier) - line_index += sum([len(block.lines) for block in matched_block]) - blocks.extend(matched_block) - - return blocks - - -def matching_blocks( - all_lines: list[TextLine], line_index: int, next_layer_identifier: TextLine | None -) -> list[TextBlock]: - """Adds lines to a block until the next layer identifier is reached. - - Args: - all_lines (list[TextLine]): All TextLine objects constituting the material description. - line_index (int): The index of the last line that is already assigned to a block. - next_layer_identifier (TextLine | None): The next layer identifier. - - Returns: - list[TextBlock]: The next block or an empty list if no lines are added. 
- """ - y1_threshold = None - if next_layer_identifier: - next_interval_start_rect = next_layer_identifier.rect - y1_threshold = next_interval_start_rect.y0 + next_interval_start_rect.height / 2 - - matched_lines = [] - - for current_line in all_lines[line_index:]: - if y1_threshold is None or current_line.rect.y1 < y1_threshold: - matched_lines.append(current_line) - else: - break - - if matched_lines: - return [TextBlock(matched_lines)] - else: - return [] - - def get_description_blocks( description_lines: list[TextLine], geometric_lines: list[Line], diff --git a/src/stratigraphy/util/interval.py b/src/stratigraphy/util/interval.py index 793193c5..479e8ffb 100644 --- a/src/stratigraphy/util/interval.py +++ b/src/stratigraphy/util/interval.py @@ -7,9 +7,8 @@ import fitz from stratigraphy.depthcolumn.depthcolumnentry import ( - AnnotatedDepthColumnEntry, + AToBDepthColumnEntry, DepthColumnEntry, - LayerDepthColumnEntry, ) from stratigraphy.lines.line import TextLine from stratigraphy.text.textblock import TextBlock @@ -59,20 +58,8 @@ def to_json(self): } -class AnnotatedInterval: - """Class for annotated intervals.""" - - def __init__(self, start: float, end: float, background_rect: fitz.Rect): - self.start = AnnotatedDepthColumnEntry(start) - self.end = AnnotatedDepthColumnEntry(end) - self.background_rect = background_rect - - -class BoundaryInterval(Interval): - """Class for boundary intervals. - - Boundary intervals are intervals that are defined by a start and an end point. - """ +class AAboveBInterval(Interval): + """Class for depth intervals where the upper depth is located above the lower depth on the page.""" @property def line_anchor(self) -> fitz.Point | None: @@ -153,14 +140,10 @@ def matching_blocks( return pre, exact, post -class LayerInterval(Interval): - """Class for layer intervals. 
+class AToBInterval(Interval): + """Class for intervals that are defined in a single line like "1.00 - 2.30m".""" - A layer interval is an interval whose start and end-points are defined in a single entry. - E.g. 1.00 - 2.30m. - """ - - def __init__(self, layer_depth_column_entry: LayerDepthColumnEntry): + def __init__(self, layer_depth_column_entry: AToBDepthColumnEntry): self.entry = layer_depth_column_entry super().__init__(layer_depth_column_entry.start, layer_depth_column_entry.end) @@ -194,3 +177,38 @@ def matching_blocks( return [TextBlock(matched_lines)] else: return [] + + @classmethod + def get_depth_interval_from_lines(cls, lines: list[TextLine]) -> AToBInterval | None: + """Extract depth interval from text lines. + + For borehole profiles in the Deriaz layout, the depth interval is usually found in the text of the material + description. Often, these text descriptions contain a further separation into multiple sub layers. + These sub layers have their own depth intervals. This function extracts the overall depth interval, + spanning across all mentioned sub layers. + + Args: + lines (list[TextLine]): The lines to extract the depth interval from. + + Returns: + AToBInterval | None: The depth interval (if any) or None (if no depth interval was found). + """ + depth_entries = [] + for line in lines: + try: + layer_depth_entry = AToBDepthColumnEntry.from_text(line.text, line.rect, require_start_of_string=False) + # require_start_of_string = False because the depth interval may not always start at the beginning + # of the line e.g. "Remblais Heterogene: 0.00 - 0.5m" + if layer_depth_entry: + depth_entries.append(layer_depth_entry) + except ValueError: + pass + + if depth_entries: + # Merge the sub layers into one depth interval. 
+ start = min([entry.start for entry in depth_entries], key=lambda start_entry: start_entry.value) + end = max([entry.end for entry in depth_entries], key=lambda end_entry: end_entry.value) + + return AToBInterval(AToBDepthColumnEntry(start, end)) + else: + return None diff --git a/src/stratigraphy/util/predictions.py b/src/stratigraphy/util/predictions.py index f1d187c3..51e53f5f 100644 --- a/src/stratigraphy/util/predictions.py +++ b/src/stratigraphy/util/predictions.py @@ -5,7 +5,7 @@ from stratigraphy.benchmark.ground_truth import GroundTruth from stratigraphy.benchmark.metrics import OverallMetricsCatalog from stratigraphy.data_extractor.data_extractor import FeatureOnPage -from stratigraphy.depths_materials_column_pairs.depths_materials_column_pairs import DepthsMaterialsColumnPairs +from stratigraphy.depths_materials_column_pairs.bounding_boxes import BoundingBoxes from stratigraphy.evaluation.evaluation_dataclasses import OverallBoreholeMetadataMetrics from stratigraphy.evaluation.groundwater_evaluator import GroundwaterEvaluator from stratigraphy.evaluation.layer_evaluator import LayerEvaluator @@ -26,10 +26,10 @@ def __init__( file_name: str, metadata: BoreholeMetadata, groundwater: GroundwaterInDocument, - depths_materials_columns_pairs: list[DepthsMaterialsColumnPairs], + bounding_boxes: list[BoundingBoxes], ): self.layers_in_document: LayersInDocument = layers_in_document - self.depths_materials_columns_pairs: list[DepthsMaterialsColumnPairs] = depths_materials_columns_pairs + self.bounding_boxes: list[BoundingBoxes] = bounding_boxes self.file_name: str = file_name self.metadata: BoreholeMetadata = metadata self.groundwater: GroundwaterInDocument = groundwater @@ -43,12 +43,8 @@ def to_json(self) -> dict: return { "metadata": self.metadata.to_json(), "layers": [layer.to_json() for layer in self.layers_in_document.layers], - "depths_materials_column_pairs": [dmc_pair.to_json() for dmc_pair in self.depths_materials_columns_pairs] - if 
self.depths_materials_columns_pairs is not None - else [], - "page_dimensions": self.metadata.page_dimensions, # TODO: Remove, already in metadata + "bounding_boxes": [bboxes.to_json() for bboxes in self.bounding_boxes], "groundwater": self.groundwater.to_json() if self.groundwater is not None else [], - "file_name": self.file_name, } @@ -103,10 +99,7 @@ def from_json(cls, prediction_from_file: dict) -> "OverallFilePredictions": layers = [Layer.from_json(data) for data in file_data["layers"]] layers_in_doc = LayersInDocument(layers=layers, filename=file_name) - depths_materials_columns_pairs = [ - DepthsMaterialsColumnPairs.from_json(dmc_pair) - for dmc_pair in file_data["depths_materials_column_pairs"] - ] + bounding_boxes = [BoundingBoxes.from_json(bboxes) for bboxes in file_data["bounding_boxes"]] groundwater_entries = [FeatureOnPage.from_json(entry, Groundwater) for entry in file_data["groundwater"]] groundwater_in_document = GroundwaterInDocument(groundwater=groundwater_entries, filename=file_name) @@ -115,7 +108,7 @@ def from_json(cls, prediction_from_file: dict) -> "OverallFilePredictions": layers_in_document=layers_in_doc, file_name=file_name, metadata=metadata, - depths_materials_columns_pairs=depths_materials_columns_pairs, + bounding_boxes=bounding_boxes, groundwater=groundwater_in_document, ) ) diff --git a/tests/test_depthcolumn.py b/tests/test_depthcolumn.py index 484a8e7a..2ca98178 100644 --- a/tests/test_depthcolumn.py +++ b/tests/test_depthcolumn.py @@ -1,32 +1,31 @@ """Test suite for the find_depth_columns module.""" import fitz -from stratigraphy.depthcolumn.depthcolumn import BoundaryDepthColumn from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry +from stratigraphy.sidebar import AAboveBSidebar -def test_boundarydepthcolumn_isarithmeticprogression(): # noqa: D103 - """Test the is_arithmetic_progression method of the BoundaryDepthColumn class.""" - page_number = 1 - column = BoundaryDepthColumn( +def 
test_aabovebsidebar_isarithmeticprogression(): # noqa: D103 + """Test the is_arithmetic_progression method of the AAboveBSidebar class.""" + column = AAboveBSidebar( [ - DepthColumnEntry(fitz.Rect(), value=1, page_number=page_number), - DepthColumnEntry(fitz.Rect(), value=2, page_number=page_number), - DepthColumnEntry(fitz.Rect(), value=3, page_number=page_number), - DepthColumnEntry(fitz.Rect(), value=4, page_number=page_number), - DepthColumnEntry(fitz.Rect(), value=5, page_number=page_number), + DepthColumnEntry(fitz.Rect(), value=1), + DepthColumnEntry(fitz.Rect(), value=2), + DepthColumnEntry(fitz.Rect(), value=3), + DepthColumnEntry(fitz.Rect(), value=4), + DepthColumnEntry(fitz.Rect(), value=5), ] ) assert column.is_arithmetic_progression(), "The column should be recognized as arithmetic progression" - column = BoundaryDepthColumn( + column = AAboveBSidebar( [ - DepthColumnEntry(fitz.Rect(), value=17.6, page_number=page_number), - DepthColumnEntry(fitz.Rect(), value=18.15, page_number=page_number), - DepthColumnEntry(fitz.Rect(), value=18.65, page_number=page_number), - DepthColumnEntry(fitz.Rect(), value=19.3, page_number=page_number), - DepthColumnEntry(fitz.Rect(), value=19.9, page_number=page_number), - DepthColumnEntry(fitz.Rect(), value=20.5, page_number=page_number), + DepthColumnEntry(fitz.Rect(), value=17.6), + DepthColumnEntry(fitz.Rect(), value=18.15), + DepthColumnEntry(fitz.Rect(), value=18.65), + DepthColumnEntry(fitz.Rect(), value=19.3), + DepthColumnEntry(fitz.Rect(), value=19.9), + DepthColumnEntry(fitz.Rect(), value=20.5), ] ) assert not column.is_arithmetic_progression(), "The column should not be recognized as arithmetic progression" diff --git a/tests/test_find_depth_columns.py b/tests/test_find_depth_columns.py deleted file mode 100644 index d8fd9294..00000000 --- a/tests/test_find_depth_columns.py +++ /dev/null @@ -1,207 +0,0 @@ -"""Test suite for the find_depth_columns module.""" - -import fitz -import pytest -from 
stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry -from stratigraphy.depthcolumn.find_depth_columns import ( - depth_column_entries, - find_depth_columns, - find_layer_depth_columns, -) -from stratigraphy.lines.line import TextLine, TextWord - -PAGE_NUMBER = 1 -ALL_WORDS_FIND_DEPTH_COLUMN = [ - TextWord(fitz.Rect(0, 0, 5, 1), "10.00m", PAGE_NUMBER), - TextWord(fitz.Rect(20, 0, 30, 1), "Kies, Torf und Sand", PAGE_NUMBER), - TextWord(fitz.Rect(20, 2, 30, 3), "Kies, verwittert.", PAGE_NUMBER), - TextWord(fitz.Rect(0, 2, 5, 3), "20.0m", PAGE_NUMBER), - TextWord(fitz.Rect(0, 4, 5, 5), "30.0m", PAGE_NUMBER), - TextWord(fitz.Rect(0, 6, 5, 7), "40.0m", PAGE_NUMBER), - TextWord(fitz.Rect(0, 8, 5, 9), "50.0m", PAGE_NUMBER), -] -ALL_WORDS_FIND_LAYER_DEPTH_COLUMN = [ - TextWord(fitz.Rect(0, 0, 5, 1), "12.00-20.0m", PAGE_NUMBER), - TextWord(fitz.Rect(20, 0, 30, 1), "Kies, Torf und Sand", PAGE_NUMBER), - TextWord(fitz.Rect(20, 2, 30, 3), "Kies, verwittert.", PAGE_NUMBER), - TextWord(fitz.Rect(0, 2, 5, 3), "20.0-34.0m", PAGE_NUMBER), - TextWord(fitz.Rect(0, 4, 5, 5), "34.0 - 40.0m", PAGE_NUMBER), - TextWord(fitz.Rect(0, 6, 5, 7), "40.0-50m", PAGE_NUMBER), - TextWord(fitz.Rect(0, 8, 5, 9), "50.0-60m", PAGE_NUMBER), -] - - -def test_depth_column_entries(): # noqa: D103 - """Test the depth_column_entries function.""" - all_words = [ - TextWord(fitz.Rect(0, 0, 5, 1), "10.00m", PAGE_NUMBER), - TextWord(fitz.Rect(0, 2, 5, 3), "20.0m", PAGE_NUMBER), - TextWord(fitz.Rect(0, 4, 5, 5), "30.0m", PAGE_NUMBER), - TextWord(fitz.Rect(0, 6, 5, 7), "40.0m", PAGE_NUMBER), - ] - entries = depth_column_entries(all_words, include_splits=False) - assert len(entries) == 4, "There should be 4 entries" - assert pytest.approx(entries[0].value) == 10.0, "The first entry should have a value of 10.0" - assert pytest.approx(entries[1].value) == 20.0, "The second entry should have a value of 20.0" - assert pytest.approx(entries[2].value) == 30.0, "The third entry should have a value of 30.0" - 
assert pytest.approx(entries[3].value) == 40.0, "The fourth entry should have a value of 40.0" - - -def test_depth_column_entries_with_splits(): # noqa: D103 - """Test the depth_column_entries function with include_splits=True.""" - all_words = [ - TextLine([TextWord(fitz.Rect(0, 0, 10, 1), "10.00-20.0m", PAGE_NUMBER)]), - TextLine([TextWord(fitz.Rect(0, 2, 10, 3), "30.0-40.0m", PAGE_NUMBER)]), - ] - entries = depth_column_entries(all_words, include_splits=True) - assert len(entries) == 4, "There should be 4 entries" - assert entries[0].value == 10.0, "The first entry should have a value of 10.0" - assert entries[1].value == 20.0, "The second entry should have a value of 20.0" - assert entries[2].value == 30.0, "The third entry should have a value of 30.0" - assert entries[3].value == 40.0, "The fourth entry should have a value of 40.0" - - -def test_depth_column_entries_with_leading_character(): # noqa: D103 - """Test the depth_column_entries function with a leading character.""" - all_words = [ - TextWord(fitz.Rect(0, 0, 5, 1), "0.00m", PAGE_NUMBER), - TextWord(fitz.Rect(0, 2, 5, 3), ".2m", PAGE_NUMBER), # this is a test for an ocr error from '-2m' to '.2m' - TextWord(fitz.Rect(0, 4, 5, 5), "-3.0m", PAGE_NUMBER), - TextWord(fitz.Rect(0, 6, 5, 7), ".4.2m", PAGE_NUMBER), - ] - entries = depth_column_entries(all_words, include_splits=True) - assert len(entries) == 4, "There should be 4 entries" - assert entries[0].value == 0.0, "The first entry should have a value of 0" - assert entries[1].value == 2.0, "The second entry should have a value of 2.0" - assert entries[2].value == 3.0, "The third entry should have a value of 3.0" - assert entries[3].value == 4.2, "The fourth entry should have a value of 4.2" - - -def test_find_depth_columns_arithmetic_progression(): # noqa: D103 - """Test the find_depth_columns function with an arithmetic progression.""" - entries = [ - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 10.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 
20.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 30.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0, PAGE_NUMBER), - ] - columns = find_depth_columns( - entries, - ALL_WORDS_FIND_DEPTH_COLUMN, - PAGE_NUMBER, - depth_column_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, - ) - assert len(columns) == 0, "There should be 0 columns as the above is a perfect arithmetic progression" - - -def test_find_depth_columns(): # noqa: D103 - """Test the find_depth_columns function.""" - entries = [ - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0, PAGE_NUMBER), - ] - - columns = find_depth_columns( - entries, - ALL_WORDS_FIND_DEPTH_COLUMN, - PAGE_NUMBER, - depth_column_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, - ) - assert len(columns) == 1, "There should be 1 column" - assert len(columns[0].entries) == 5, "The column should have 5 entries" - assert pytest.approx(columns[0].entries[0].value) == 12.0, "The first entry should have a value of 12.0" - assert pytest.approx(columns[0].entries[1].value) == 20.0, "The second entry should have a value of 20.0" - assert pytest.approx(columns[0].entries[2].value) == 34.0, "The third entry should have a value of 34.0" - assert pytest.approx(columns[0].entries[3].value) == 40.0, "The fourth entry should have a value of 40.0" - assert pytest.approx(columns[0].entries[4].value) == 50.0, "The fourth entry should have a value of 50.0" - - -def test_two_columns_find_depth_columns(): # noqa: D103 - """Test the find_depth_columns function with two columns.""" - entries = [ # first depth column - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0, PAGE_NUMBER), - 
DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 12.0, PAGE_NUMBER), # second depth column - DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 20.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 34.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 40.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 50.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(20, 10, 25, 11), 61.0, PAGE_NUMBER), - ] - columns = find_depth_columns( - entries, - ALL_WORDS_FIND_DEPTH_COLUMN, - PAGE_NUMBER, - depth_column_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, - ) - assert len(columns) == 2, "There should be 2 columns" - assert len(columns[0].entries) == 5, "The first column should have 5 entries" - assert len(columns[1].entries) == 6, "The second column should have 6 entries" - - -def test_find_layer_depth_columns(): # noqa: D103 - """Test the find_layer_depth_columns function.""" - entries = [ - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0, PAGE_NUMBER), # layer 12.0-20.0m - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 20.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0, PAGE_NUMBER), # layer 20.0-34.0m - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 34.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0, PAGE_NUMBER), # layer 34.0-40.0m - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 40.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0, PAGE_NUMBER), # layer 40.0-50.0m - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 50.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0, PAGE_NUMBER), # layer 50.0-60.0m - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 60.0, PAGE_NUMBER), - ] - - columns = find_layer_depth_columns(entries, ALL_WORDS_FIND_DEPTH_COLUMN) - assert len(columns) == 1, "There 
should be 1 column" - assert len(columns[0].entries) == 5, "The column should have 5 entries" - assert columns[0].entries[0].start.value == 12.0, "The first entry should have a value of 12.0" - assert columns[0].entries[0].end.value == 20.0, "The first entry should have a value of 20.0" - assert columns[0].entries[1].start.value == 20.0, "The second entry should have a value of 20.0" - assert columns[0].entries[1].end.value == 34.0, "The second entry should have a value of 34.0" - assert columns[0].entries[2].start.value == 34.0, "The third entry should have a value of 34.0" - assert columns[0].entries[2].end.value == 40.0, "The third entry should have a value of 40.0" - assert columns[0].entries[3].start.value == 40.0, "The fourth entry should have a value of 40.0" - assert columns[0].entries[3].end.value == 50.0, "The fourth entry should have a value of 50.0" - assert columns[0].entries[4].start.value == 50.0, "The fourth entry should have a value of 50.0" - assert columns[0].entries[4].end.value == 60.0, "The fourth entry should have a value of 60.0" - - -def test_two_columns_find_layer_depth_columns(): # noqa: D103 - """Test the find_layer_depth_columns function with two columns.""" - entries = [ # first depth column - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0, PAGE_NUMBER), # layer 12.0-20.0m - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 20.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0, PAGE_NUMBER), # layer 20.0-34.0m - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 34.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0, PAGE_NUMBER), # layer 34.0-40.0m - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 40.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0, PAGE_NUMBER), # layer 40.0-50.0m - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 50.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0, PAGE_NUMBER), # layer 50.0-60.0m - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 60.0, PAGE_NUMBER), - # second depth column - 
DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 12.0, PAGE_NUMBER), # layer 12.0-20.0m - DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 20.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 20.0, PAGE_NUMBER), # layer 20.0-34.0m - DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 34.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 34.0, PAGE_NUMBER), # layer 34.0-40.0m - DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 40.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 40.0, PAGE_NUMBER), # layer 40.0-50.0m - DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 50.0, PAGE_NUMBER), - DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 50.0, PAGE_NUMBER), # layer 50.0-60.0m - DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 60.0, PAGE_NUMBER), - ] - columns = find_layer_depth_columns(entries, ALL_WORDS_FIND_LAYER_DEPTH_COLUMN) - assert len(columns) == 2, "There should be 2 columns" - assert len(columns[0].entries) == 5, "The first column should have 5 entries" - assert len(columns[1].entries) == 5, "The second column should have 5 entries" diff --git a/tests/test_find_sidebar.py b/tests/test_find_sidebar.py new file mode 100644 index 00000000..51178ec6 --- /dev/null +++ b/tests/test_find_sidebar.py @@ -0,0 +1,181 @@ +"""Test suite for the find_depth_columns module.""" + +import fitz +import pytest +from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry +from stratigraphy.lines.line import TextWord +from stratigraphy.sidebar import AAboveBSidebarExtractor, AToBSidebarExtractor + +PAGE_NUMBER = 1 + + +def test_depth_column_entries(): # noqa: D103 + """Test the DepthColumnEntry.find_in_words function.""" + all_words = [ + TextWord(fitz.Rect(0, 0, 5, 1), "10.00m", PAGE_NUMBER), + TextWord(fitz.Rect(0, 2, 5, 3), "20.0m", PAGE_NUMBER), + TextWord(fitz.Rect(0, 4, 5, 5), "30.0m", PAGE_NUMBER), + TextWord(fitz.Rect(0, 6, 5, 7), "40.0m", PAGE_NUMBER), + ] + entries = DepthColumnEntry.find_in_words(all_words, include_splits=False) + assert len(entries) == 4, "There should be 4 
entries" + assert pytest.approx(entries[0].value) == 10.0, "The first entry should have a value of 10.0" + assert pytest.approx(entries[1].value) == 20.0, "The second entry should have a value of 20.0" + assert pytest.approx(entries[2].value) == 30.0, "The third entry should have a value of 30.0" + assert pytest.approx(entries[3].value) == 40.0, "The fourth entry should have a value of 40.0" + + +def test_depth_column_entries_with_splits(): # noqa: D103 + """Test the DepthColumnEntry.find_in_words function with include_splits=True.""" + all_words = [ + TextWord(fitz.Rect(0, 0, 10, 1), "10.00-20.0m", PAGE_NUMBER), + TextWord(fitz.Rect(0, 2, 10, 3), "30.0-40.0m", PAGE_NUMBER), + ] + entries = DepthColumnEntry.find_in_words(all_words, include_splits=True) + assert len(entries) == 4, "There should be 4 entries" + assert entries[0].value == 10.0, "The first entry should have a value of 10.0" + assert entries[1].value == 20.0, "The second entry should have a value of 20.0" + assert entries[2].value == 30.0, "The third entry should have a value of 30.0" + assert entries[3].value == 40.0, "The fourth entry should have a value of 40.0" + + +def test_depth_column_entries_with_leading_character(): # noqa: D103 + """Test the depth_column_entries function with a leading character.""" + all_words = [ + TextWord(fitz.Rect(0, 0, 5, 1), "0.00m", PAGE_NUMBER), + TextWord(fitz.Rect(0, 2, 5, 3), ".2m", PAGE_NUMBER), # this is a test for an ocr error from '-2m' to '.2m' + TextWord(fitz.Rect(0, 4, 5, 5), "-3.0m", PAGE_NUMBER), + TextWord(fitz.Rect(0, 6, 5, 7), ".4.2m", PAGE_NUMBER), + ] + entries = DepthColumnEntry.find_in_words(all_words, include_splits=True) + assert len(entries) == 4, "There should be 4 entries" + assert entries[0].value == 0.0, "The first entry should have a value of 0" + assert entries[1].value == 2.0, "The second entry should have a value of 2.0" + assert entries[2].value == 3.0, "The third entry should have a value of 3.0" + assert entries[3].value == 4.2, "The 
fourth entry should have a value of 4.2" + + +def test_aabovebsidebarextractor_arithmetic_progression(): # noqa: D103 + all_words = [ + TextWord(fitz.Rect(0, 0, 5, 1), "10.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 2, 5, 3), "20.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 4, 5, 5), "30.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 6, 5, 7), "40.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 8, 5, 9), "50.0", PAGE_NUMBER), + ] + """Test the AAboveBSidebarExtractor with an arithmetic progression.""" + columns = AAboveBSidebarExtractor.find_in_words( + all_words, + used_entry_rects=[], + sidebar_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, + ) + assert len(columns) == 0, "There should be 0 columns as the above is a perfect arithmetic progression" + + +def test_aabovebsidebarextractor(): # noqa: D103 + """Test the AAboveBSidebarExtractor.""" + all_words = [ + TextWord(fitz.Rect(0, 0, 5, 1), "12.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 2, 5, 3), "20.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 4, 5, 5), "34.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 6, 5, 7), "40.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 8, 5, 9), "50.0", PAGE_NUMBER), + ] + columns = AAboveBSidebarExtractor.find_in_words( + all_words, + used_entry_rects=[], + sidebar_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, + ) + assert len(columns) == 1, "There should be 1 column" + assert len(columns[0].entries) == 5, "The column should have 5 entries" + assert pytest.approx(columns[0].entries[0].value) == 12.0, "The first entry should have a value of 12.0" + assert pytest.approx(columns[0].entries[1].value) == 20.0, "The second entry should have a value of 20.0" + assert pytest.approx(columns[0].entries[2].value) == 34.0, "The third entry should have a value of 34.0" + assert pytest.approx(columns[0].entries[3].value) == 40.0, "The fourth entry should have a value of 40.0" + assert pytest.approx(columns[0].entries[4].value) == 50.0, "The fourth entry should have a value of 50.0" + + +def 
test_aabovebsidebarextractor_two_column(): # noqa: D103 + """Test the AAboveBSidebarExtractor function with two columns.""" + all_words = [ # first depth column + TextWord(fitz.Rect(0, 0, 5, 1), "12.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 2, 5, 3), "20.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 4, 5, 5), "34.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 6, 5, 7), "40.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 8, 5, 9), "50.0", PAGE_NUMBER), + TextWord(fitz.Rect(20, 0, 25, 1), "12.0", PAGE_NUMBER), # second depth column + TextWord(fitz.Rect(20, 2, 25, 3), "20.0", PAGE_NUMBER), + TextWord(fitz.Rect(20, 4, 25, 5), "34.0", PAGE_NUMBER), + TextWord(fitz.Rect(20, 6, 25, 7), "40.0", PAGE_NUMBER), + TextWord(fitz.Rect(20, 8, 25, 9), "50.0", PAGE_NUMBER), + TextWord(fitz.Rect(20, 10, 25, 11), "61.0", PAGE_NUMBER), + ] + + columns = AAboveBSidebarExtractor.find_in_words( + all_words, + used_entry_rects=[], + sidebar_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, + ) + assert len(columns) == 2, "There should be 2 columns" + assert len(columns[0].entries) == 5, "The first column should have 5 entries" + assert len(columns[1].entries) == 6, "The second column should have 6 entries" + + +def test_atobsidebarextractor(): # noqa: D103 + """Test the AToBSidebarExtractor.""" + all_words = [ + TextWord(fitz.Rect(0, 0, 5, 1), "12.0", PAGE_NUMBER), # layer 12.0-20.0m + TextWord(fitz.Rect(0, 0, 5, 1), "20.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 2, 5, 3), "20.0", PAGE_NUMBER), # layer 20.0-34.0m + TextWord(fitz.Rect(0, 2, 5, 3), "34.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 4, 5, 5), "34.0", PAGE_NUMBER), # layer 34.0-40.0m + TextWord(fitz.Rect(0, 4, 5, 5), "40.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 6, 5, 7), "40.0", PAGE_NUMBER), # layer 40.0-50.0m + TextWord(fitz.Rect(0, 6, 5, 7), "50.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 8, 5, 9), "50.0", PAGE_NUMBER), # layer 50.0-60.0m + TextWord(fitz.Rect(0, 8, 5, 9), "60.0", PAGE_NUMBER), + ] + columns = 
AToBSidebarExtractor.find_in_words(all_words) + assert len(columns) == 1, "There should be 1 column" + assert len(columns[0].entries) == 5, "The column should have 5 entries" + assert columns[0].entries[0].start.value == 12.0, "The first entry should have a value of 12.0" + assert columns[0].entries[0].end.value == 20.0, "The first entry should have a value of 20.0" + assert columns[0].entries[1].start.value == 20.0, "The second entry should have a value of 20.0" + assert columns[0].entries[1].end.value == 34.0, "The second entry should have a value of 34.0" + assert columns[0].entries[2].start.value == 34.0, "The third entry should have a value of 34.0" + assert columns[0].entries[2].end.value == 40.0, "The third entry should have a value of 40.0" + assert columns[0].entries[3].start.value == 40.0, "The fourth entry should have a value of 40.0" + assert columns[0].entries[3].end.value == 50.0, "The fourth entry should have a value of 50.0" + assert columns[0].entries[4].start.value == 50.0, "The fourth entry should have a value of 50.0" + assert columns[0].entries[4].end.value == 60.0, "The fourth entry should have a value of 60.0" + + +def test_atobsidebarextractor_two_columns(): # noqa: D103 + """Test the AToBSidebarExtractor with two columns.""" + all_words = [ # first depth column + TextWord(fitz.Rect(0, 0, 5, 1), "12.0", PAGE_NUMBER), # layer 12.0-20.0m + TextWord(fitz.Rect(0, 0, 5, 1), "20.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 2, 5, 3), "20.0", PAGE_NUMBER), # layer 20.0-34.0m + TextWord(fitz.Rect(0, 2, 5, 3), "34.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 4, 5, 5), "34.0", PAGE_NUMBER), # layer 34.0-40.0m + TextWord(fitz.Rect(0, 4, 5, 5), "40.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 6, 5, 7), "40.0", PAGE_NUMBER), # layer 40.0-50.0m + TextWord(fitz.Rect(0, 6, 5, 7), "50.0", PAGE_NUMBER), + TextWord(fitz.Rect(0, 8, 5, 9), "50.0", PAGE_NUMBER), # layer 50.0-60.0m + TextWord(fitz.Rect(0, 8, 5, 9), "60.0", PAGE_NUMBER), + # second depth column + 
TextWord(fitz.Rect(20, 0, 25, 1), "12.0", PAGE_NUMBER), # layer 12.0-20.0m + TextWord(fitz.Rect(20, 0, 25, 1), "20.0", PAGE_NUMBER), + TextWord(fitz.Rect(20, 2, 25, 3), "20.0", PAGE_NUMBER), # layer 20.0-34.0m + TextWord(fitz.Rect(20, 2, 25, 3), "34.0", PAGE_NUMBER), + TextWord(fitz.Rect(20, 4, 25, 5), "34.0", PAGE_NUMBER), # layer 34.0-40.0m + TextWord(fitz.Rect(20, 4, 25, 5), "40.0", PAGE_NUMBER), + TextWord(fitz.Rect(20, 6, 25, 7), "40.0", PAGE_NUMBER), # layer 40.0-50.0m + TextWord(fitz.Rect(20, 6, 25, 7), "50.0", PAGE_NUMBER), + TextWord(fitz.Rect(20, 8, 25, 9), "50.0", PAGE_NUMBER), # layer 50.0-60.0m + TextWord(fitz.Rect(20, 8, 25, 9), "60.0", PAGE_NUMBER), + ] + columns = AToBSidebarExtractor.find_in_words(all_words) + assert len(columns) == 2, "There should be 2 columns" + assert len(columns[0].entries) == 5, "The first column should have 5 entries" + assert len(columns[1].entries) == 5, "The second column should have 5 entries" diff --git a/tests/test_interval.py b/tests/test_interval.py index 0d90cded..7d6d4118 100644 --- a/tests/test_interval.py +++ b/tests/test_interval.py @@ -1,35 +1,34 @@ """Test suite for the interval module.""" import fitz -from stratigraphy.depthcolumn.depthcolumnentry import DepthColumnEntry, LayerDepthColumnEntry -from stratigraphy.util.interval import BoundaryInterval, LayerInterval +from stratigraphy.depthcolumn.depthcolumnentry import AToBDepthColumnEntry, DepthColumnEntry +from stratigraphy.util.interval import AAboveBInterval, AToBInterval def test_line_anchor(): # noqa: D103 """Test the line anchor property of the BoundaryInterval and LayerInterval classes.""" - page_number = 1 - start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5, page_number) - end = DepthColumnEntry(fitz.Rect(0, 2, 1, 3), 10, page_number) - boundary_interval = BoundaryInterval(start, end) + start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5) + end = DepthColumnEntry(fitz.Rect(0, 2, 1, 3), 10) + boundary_interval = AAboveBInterval(start, end) assert 
boundary_interval.line_anchor == fitz.Point(1, 1.5), ( "The 'line anchor' for a BoundaryInterval should be halfway between the bottom-right of the start depth and " "the top-right of the end depth." ) - boundary_interval = BoundaryInterval(start, end=None) + boundary_interval = AAboveBInterval(start, end=None) assert boundary_interval.line_anchor == fitz.Point( 1, 1 ), "The 'line anchor' for a BoundaryInterval without end should be the bottom-right of the start depth." - boundary_interval = BoundaryInterval(start=None, end=end) + boundary_interval = AAboveBInterval(start=None, end=end) assert boundary_interval.line_anchor == fitz.Point( 1, 2 ), "The 'line anchor' for a BoundaryInterval without start should be the top-right of the end depth." - start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5, page_number) - end = DepthColumnEntry(fitz.Rect(2, 0, 3, 1), 10, page_number) - entry = LayerDepthColumnEntry(start, end) - layer_interval = LayerInterval(entry) + start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5) + end = DepthColumnEntry(fitz.Rect(2, 0, 3, 1), 10) + entry = AToBDepthColumnEntry(start, end) + layer_interval = AToBInterval(entry) assert layer_interval.line_anchor == fitz.Point( 3, 0.5 ), "The 'line anchor' for a LayerInterval should be the midpoint of the right-hand-side of the end rect." 
@@ -37,10 +36,9 @@ def test_line_anchor(): # noqa: D103 def test_background_rect(): # noqa: D103 """Test the background_rect property of the BoundaryInterval class.""" - page_number = 1 - start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5, page_number) - end = DepthColumnEntry(fitz.Rect(0, 2, 1, 3), 10, page_number) - boundary_interval = BoundaryInterval(start, end) + start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5) + end = DepthColumnEntry(fitz.Rect(0, 2, 1, 3), 10) + boundary_interval = AAboveBInterval(start, end) assert boundary_interval.background_rect == fitz.Rect( start.rect.x0, start.rect.y1, start.rect.x1, end.rect.y0 ), "The background rect should be (0, 1, 1, 2)" diff --git a/tests/test_predictions.py b/tests/test_predictions.py index c692f951..2aa96a21 100644 --- a/tests/test_predictions.py +++ b/tests/test_predictions.py @@ -49,7 +49,7 @@ def sample_file_prediction() -> FilePredictions: file_name="test_file", metadata=metadata, groundwater=groundwater_in_doc, - depths_materials_columns_pairs=[], + bounding_boxes=[], ) @@ -58,9 +58,9 @@ def test_to_json(sample_file_prediction: FilePredictions): result = sample_file_prediction.to_json() assert isinstance(result, dict) - assert result["file_name"] == "test_file" assert len(result["layers"]) == 2 assert result["metadata"]["coordinates"]["E"] == 2789456 + assert result["metadata"]["language"] == "en" def test_overall_file_predictions():