Skip to content

Commit

Permalink
LGVISIUM-79: differentiate between IntervalBlockGroup and IntervalBlockPair
Browse files Browse the repository at this point in the history
  • Loading branch information
stijnvermeeren-swisstopo committed Nov 5, 2024
1 parent 9319c44 commit d802102
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 47 deletions.
17 changes: 6 additions & 11 deletions src/stratigraphy/depthcolumn/depthcolumn.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def noise_count(self, all_words: list[TextWord]) -> int:
@abc.abstractmethod
def identify_groups(
self, description_lines: list[TextLine], geometric_lines: list[Line], material_description_rect: fitz.Rect
) -> list[dict]:
) -> list[IntervalBlockGroup]:
"""Identifies groups of description blocks that correspond to depth intervals.
Args:
Expand All @@ -76,8 +76,7 @@ def identify_groups(
material_description_rect (fitz.Rect): The bounding box of the material description.
Returns:
list[dict]: A list of groups, where each group is a dictionary
with the keys "depth_intervals" and "blocks".
list[IntervalBlockGroup]: A list of groups, where each group is an IntervalBlockGroup.
"""
pass

Expand Down Expand Up @@ -265,11 +264,7 @@ def identify_groups(

matched_blocks = interval.matching_blocks(description_lines, line_index, next_interval)
line_index += sum([len(block.lines) for block in matched_blocks])
groups.append(
# TODO: This seems to be the only case where a list is passed and most of the time it is a list of one
# element. Seem to need the function: transform_groups().
IntervalBlockGroup(depth_interval=[interval], block=matched_blocks)
)
groups.append(IntervalBlockGroup(depth_intervals=[interval], blocks=matched_blocks))
return groups


Expand Down Expand Up @@ -559,8 +554,8 @@ def identify_groups(
current_blocks.extend(pre)
if len(exact):
if len(current_intervals) > 0 or len(current_blocks) > 0:
groups.append(IntervalBlockGroup(depth_interval=current_intervals, block=current_blocks))
groups.append(IntervalBlockGroup(depth_interval=[interval], block=exact))
groups.append(IntervalBlockGroup(depth_intervals=current_intervals, blocks=current_blocks))
groups.append(IntervalBlockGroup(depth_intervals=[interval], blocks=exact))
current_blocks = post
current_intervals = []
else:
Expand All @@ -570,6 +565,6 @@ def identify_groups(
current_intervals.append(interval)

if len(current_intervals) > 0 or len(current_blocks) > 0:
groups.append(IntervalBlockGroup(depth_interval=current_intervals, block=current_blocks))
groups.append(IntervalBlockGroup(depth_intervals=current_intervals, blocks=current_blocks))

return groups
46 changes: 23 additions & 23 deletions src/stratigraphy/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from stratigraphy.depthcolumn import find_depth_columns
from stratigraphy.depthcolumn.depthcolumn import DepthColumn
from stratigraphy.depths_materials_column_pairs.depths_materials_column_pairs import DepthsMaterialsColumnPairs
from stratigraphy.layer.layer import IntervalBlockGroup, Layer
from stratigraphy.layer.layer import IntervalBlockPair, Layer
from stratigraphy.layer.layer_identifier_column import (
LayerIdentifierColumn,
find_layer_identifier_column,
Expand Down Expand Up @@ -119,16 +119,16 @@ def process_page(
to_delete.append(i)
filtered_pairs = [item for index, item in enumerate(pairs) if index not in to_delete]

groups: list[IntervalBlockGroup] = [] # list of matched depth intervals and text blocks
pairs: list[IntervalBlockPair] = [] # list of matched depth intervals and text blocks
# pairs is a list of IntervalBlockPair objects, each with an optional depth_interval and a block (TextBlock)
if filtered_pairs: # match depth column items with material description
for depth_column, material_description_rect in filtered_pairs:
description_lines = get_description_lines(lines, material_description_rect)
if len(description_lines) > 1:
new_groups = match_columns(
new_pairs = match_columns(
depth_column, description_lines, geometric_lines, material_description_rect, **params
)
groups.extend(new_groups)
pairs.extend(new_pairs)
filtered_depth_material_column_pairs = [
DepthsMaterialsColumnPairs(
depth_column=depth_column, material_description_rect=material_description_rect, page=page_number
Expand All @@ -150,7 +150,7 @@ def process_page(
params["block_line_ratio"],
params["left_line_length_threshold"],
)
groups.extend([IntervalBlockGroup(block=block, depth_interval=None) for block in description_blocks])
pairs.extend([IntervalBlockPair(block=block, depth_interval=None) for block in description_blocks])
filtered_depth_material_column_pairs.extend(
[
DepthsMaterialsColumnPairs(
Expand All @@ -163,24 +163,24 @@ def process_page(
Layer(
material_description=FeatureOnPage(
feature=MaterialDescription(
text=group.block.text,
text=pair.block.text,
lines=[
FeatureOnPage(
feature=MaterialDescriptionLine(text_line.text),
rect=text_line.rect,
page=text_line.page_number,
)
for text_line in group.block.lines
for text_line in pair.block.lines
],
),
rect=group.block.rect,
rect=pair.block.rect,
page=page_number,
),
depth_interval=BoundaryInterval(start=group.depth_interval.start, end=group.depth_interval.end)
if group.depth_interval
depth_interval=BoundaryInterval(start=pair.depth_interval.start, end=pair.depth_interval.end)
if pair.depth_interval
else None,
)
for group in groups
for pair in pairs
]
layer_predictions = [layer for layer in layer_predictions if layer.description_nonempty()]
return ProcessPageResult(layer_predictions, filtered_depth_material_column_pairs)
Expand Down Expand Up @@ -222,7 +222,7 @@ def match_columns(
geometric_lines: list[Line],
material_description_rect: fitz.Rect,
**params: dict,
) -> list[IntervalBlockGroup]:
) -> list[IntervalBlockPair]:
"""Match the depth column entries with the description lines.
This function identifies groups of depth intervals and text blocks that are likely to match.
Expand All @@ -237,26 +237,26 @@ def match_columns(
**params (dict): Additional parameters for the matching pipeline.
Returns:
list[IntervalBlockGroup]: The matched depth intervals and text blocks.
list[IntervalBlockPair]: The matched depth intervals and text blocks.
"""
if isinstance(depth_column, DepthColumn):
return [
element
for group in depth_column.identify_groups(
description_lines, geometric_lines, material_description_rect, **params
)
for element in transform_groups(group.depth_interval, group.block, **params)
for element in transform_groups(group.depth_intervals, group.blocks, **params)
]
elif isinstance(depth_column, LayerIdentifierColumn):
blocks = get_description_blocks_from_layer_identifier(depth_column.entries, description_lines)
groups: list[IntervalBlockGroup] = []
pairs: list[IntervalBlockPair] = []
for block in blocks:
depth_interval = find_depth_columns.get_depth_interval_from_textblock(block)
if depth_interval:
groups.append(IntervalBlockGroup(depth_interval=depth_interval, block=block))
pairs.append(IntervalBlockPair(depth_interval=depth_interval, block=block))
else:
groups.append(IntervalBlockGroup(depth_interval=None, block=block))
return groups
pairs.append(IntervalBlockPair(depth_interval=None, block=block))
return pairs
else:
raise ValueError(
f"depth_column must be a DepthColumn or a LayerIdentifierColumn object. Got {type(depth_column)}."
Expand All @@ -265,7 +265,7 @@ def match_columns(

def transform_groups(
depth_intervals: list[Interval], blocks: list[TextBlock], **params: dict
) -> list[IntervalBlockGroup]:
) -> list[IntervalBlockPair]:
"""Transforms the text blocks such that their number equals the number of depth intervals.
If there are more depth intervals than text blocks, text blocks are split. When there
Expand All @@ -278,15 +278,15 @@ def transform_groups(
**params (dict): Additional parameters for the matching pipeline.
Returns:
List[IntervalBlockGroup]: Pairing of text blocks and depth intervals.
List[IntervalBlockPair]: Pairing of text blocks and depth intervals.
"""
if len(depth_intervals) == 0:
return []
elif len(depth_intervals) == 1:
concatenated_block = TextBlock(
[line for block in blocks for line in block.lines]
) # concatenate all text lines within a block; line separation flag does not matter here.
return [IntervalBlockGroup(depth_interval=depth_intervals[0], block=concatenated_block)]
return [IntervalBlockPair(depth_interval=depth_intervals[0], block=concatenated_block)]
else:
if len(blocks) < len(depth_intervals):
blocks = split_blocks_by_textline_length(blocks, target_split_count=len(depth_intervals) - len(blocks))
Expand All @@ -296,15 +296,15 @@ def transform_groups(
depth_intervals.extend([BoundaryInterval(None, None) for _ in range(len(blocks) - len(depth_intervals))])

return [
IntervalBlockGroup(depth_interval=depth_interval, block=block)
IntervalBlockPair(depth_interval=depth_interval, block=block)
for depth_interval, block in zip(depth_intervals, blocks, strict=False)
]


def merge_blocks_by_vertical_spacing(blocks: list[TextBlock], target_merge_count: int) -> list[TextBlock]:
"""Merge textblocks without any geometric lines that separates them.
Note: Deprecated. Currently not in use any more. Kept here until we are sure that it is not needed anymore.
Note: Deprecated. Currently not in use anymore. Kept here until we are sure that it is not needed anymore.
The logic looks at the distances between the textblocks and merges them if they are closer
than a certain cutoff.
Expand Down
14 changes: 11 additions & 3 deletions src/stratigraphy/layer/layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,15 @@ class LayersInDocument:

@dataclass
class IntervalBlockGroup:
"""A class to represent a group of depth interval blocks."""
"""A class to represent a group of depth intervals and a group of associated text blocks."""

depth_interval: Interval | list[Interval] | None
block: TextBlock | list[TextBlock]
depth_intervals: list[Interval]
blocks: list[TextBlock]


@dataclass
class IntervalBlockPair:
    """A class to represent an optional depth interval and an associated text block."""

    # Depth interval matched to the block; None when no interval could be associated with it.
    depth_interval: Interval | None
    # Text block associated with the (optional) depth interval.
    block: TextBlock
11 changes: 1 addition & 10 deletions src/stratigraphy/text/textblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Self
from typing import Self

import fitz
import numpy as np
Expand Down Expand Up @@ -164,15 +164,6 @@ def _is_legend(self) -> bool:
y0_coordinates.append(line.rect.y0)
return number_horizontally_close > 1 or number_vertically_close > 2

def to_json(self) -> dict[str, Any]:
"""Convert the TextBlock object to a JSON serializable dictionary."""
return {
"text": self.text,
"rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1],
"lines": [line.to_json() for line in self.lines],
"page": self.page_number,
}


def _is_close(a: float, b: list, tolerance: float) -> bool:
return any(abs(a - c) < tolerance for c in b)
Expand Down

0 comments on commit d802102

Please sign in to comment.