From 474e0ea8ad1535c6cf95804cc086aa4074dd61a8 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Tue, 21 May 2024 15:01:05 +0200 Subject: [PATCH 1/9] Refactor detection of material blocks using LayerIdentifierColumn. --- src/stratigraphy/extract.py | 129 +++++++----------- .../util/layer_identifier_column.py | 29 +++- 2 files changed, 80 insertions(+), 78 deletions(-) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 6d035013..4aadc630 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -5,7 +5,6 @@ import fitz -from stratigraphy import DATAPATH from stratigraphy.util import find_depth_columns from stratigraphy.util.dataclasses import Line from stratigraphy.util.depthcolumn import DepthColumn @@ -16,6 +15,7 @@ ) from stratigraphy.util.interval import BoundaryInterval, Interval from stratigraphy.util.layer_identifier_column import ( + LayerIdentifierColumn, find_layer_identifier_column, find_layer_identifier_column_entries, ) @@ -71,86 +71,55 @@ def process_page(page: fitz.Page, geometric_lines, language: str, **params: dict lines.append(TextLine(current_line_words)) current_line_words = [] - depth_column_entries = find_depth_columns.depth_column_entries(words, include_splits=True) - layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words) - - used_entry_rects = [] - for column in layer_depth_columns: - for entry in column.entries: - used_entry_rects.extend([entry.start.rect, entry.end.rect]) - - depth_column_entries = [ - entry - for entry in find_depth_columns.depth_column_entries(words, include_splits=False) - if entry.rect not in used_entry_rects - ] - depth_columns: list[DepthColumn] = layer_depth_columns - depth_columns.extend( - find_depth_columns.find_depth_columns( - depth_column_entries, words, depth_column_params=params["depth_column_params"] - ) - ) - # Detect Layer Index Columns layer_identifier_entries = find_layer_identifier_column_entries(words) 
layer_identifier_columns = ( find_layer_identifier_column(layer_identifier_entries) if layer_identifier_entries else [] ) + pairs = [] if layer_identifier_columns: - layer_identifier_pairs = [] for layer_identifier_column in layer_identifier_columns: material_description_rect = find_material_description_column( lines, layer_identifier_column, language, **params["material_description"] ) if material_description_rect: - layer_identifier_pairs.append((layer_identifier_column, material_description_rect)) - - # Obtain the best pair. In contrast to depth columns, there only ever is one layer index column per page. - if layer_identifier_pairs: - layer_identifier_pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1])) - layer_identifier_column, material_description_rect = layer_identifier_pairs[-1] - # split the material description rect into blocks. - description_lines = get_description_lines(lines, material_description_rect) - blocks = get_description_blocks_from_layer_identifier(layer_identifier_column.entries, description_lines) - - predictions = [{"material_description": block.to_json()} for block in blocks] - predictions = parse_and_remove_empty_predictions(predictions) - - json_filtered_pairs = [ - { - "depth_column": None, - "material_description_rect": [ - material_description_rect.x0, - material_description_rect.y0, - material_description_rect.x1, - material_description_rect.y1, - ], - } - ] - - # Visualization: To be dropped before merging to main. 
- for layer_identifier_column in layer_identifier_columns: - fitz.utils.draw_rect( - page, layer_identifier_column.rect() * page.derotation_matrix, color=fitz.utils.getColor("blue") - ) - for block in blocks: - fitz.utils.draw_rect(page, block.rect * page.derotation_matrix, color=fitz.utils.getColor("red")) - fitz.utils.draw_rect( - page, material_description_rect * page.derotation_matrix, color=fitz.utils.getColor("blue") + pairs.append((layer_identifier_column, material_description_rect)) + + # Obtain the best pair. In contrast to depth columns, there only ever is one layer index column per page. + if pairs: + pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1])) + + # If there is a layer identifier column, then we use this directly. + # Else, we search for depth columns. We could also think of some scoring mechanism to decide which one to use. + if not pairs: + depth_column_entries = find_depth_columns.depth_column_entries(words, include_splits=True) + layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words) + + used_entry_rects = [] + for column in layer_depth_columns: + for entry in column.entries: + used_entry_rects.extend([entry.start.rect, entry.end.rect]) + + depth_column_entries = [ + entry + for entry in find_depth_columns.depth_column_entries(words, include_splits=False) + if entry.rect not in used_entry_rects + ] + depth_columns: list[DepthColumn] = layer_depth_columns + depth_columns.extend( + find_depth_columns.find_depth_columns( + depth_column_entries, words, depth_column_params=params["depth_column_params"] ) - page.parent.save(DATAPATH / "_temp" / "output.pdf", garbage=4, deflate=True, clean=True) - - return predictions, json_filtered_pairs - - pairs = [] - for depth_column in depth_columns: - material_description_rect = find_material_description_column( - lines, depth_column, language, **params["material_description"] ) - if material_description_rect: - pairs.append((depth_column, 
material_description_rect)) - # lowest score first - pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1], words)) + + for depth_column in depth_columns: + material_description_rect = find_material_description_column( + lines, depth_column, language, **params["material_description"] + ) + if material_description_rect: + pairs.append((depth_column, material_description_rect)) + # lowest score first + pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1], words)) to_delete = [] for i, (_depth_column, material_description_rect) in enumerate(pairs): @@ -257,7 +226,7 @@ def score_column_match( def match_columns( - depth_column: DepthColumn, + depth_column: DepthColumn | LayerIdentifierColumn, description_lines: list[TextLine], geometric_lines: list[Line], material_description_rect: fitz.Rect, @@ -269,7 +238,7 @@ def match_columns( In this process, the number of text blocks is adjusted to match the number of depth intervals. Args: - depth_column (DepthColumn): The depth column. + depth_column (DepthColumn | LayerIdentifierColumn): The depth column. description_lines (list[TextLine]): The description lines. geometric_lines (list[Line]): The geometric lines. material_description_rect (fitz.Rect): The material description rectangle. @@ -278,13 +247,19 @@ def match_columns( Returns: list: The matched depth intervals and text blocks. 
""" - return [ - element - for group in depth_column.identify_groups( - description_lines, geometric_lines, material_description_rect, **params - ) - for element in transform_groups(group["depth_intervals"], group["blocks"], **params) - ] + if isinstance(depth_column, DepthColumn): + return [ + element + for group in depth_column.identify_groups( + description_lines, geometric_lines, material_description_rect, **params + ) + for element in transform_groups(group["depth_intervals"], group["blocks"], **params) + ] + elif isinstance(depth_column, LayerIdentifierColumn): + blocks = get_description_blocks_from_layer_identifier(depth_column.entries, description_lines) + return [{"block": block} for block in blocks] + else: + raise ValueError("depth_column must be a DepthColumn or a LayerIdentifierColumn.") def transform_groups( diff --git a/src/stratigraphy/util/layer_identifier_column.py b/src/stratigraphy/util/layer_identifier_column.py index 5d8316bc..ce6f0356 100644 --- a/src/stratigraphy/util/layer_identifier_column.py +++ b/src/stratigraphy/util/layer_identifier_column.py @@ -7,6 +7,26 @@ from stratigraphy.util.line import TextWord +class LayerIdentifierEntry: + """Class for a layer identifier entry. + + Note: As of now this is very similar to DepthColumnEntry. Refactoring may be desired. + """ + + def __init__(self, rect: fitz.Rect, text: str): + self.rect = rect + self.text = text + + def __repr__(self): + return str(self.text) + + def to_json(self): + return { + "text": self.text, + "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1], + } + + class LayerIdentifierColumn: """Class for a layer identifier column.""" @@ -16,7 +36,7 @@ def __init__(self, entries: list[TextWord]): Args: entries (list[TextWord]): The entries corresponding to the layer indices. 
""" - self.entries = entries + self.entries = [LayerIdentifierEntry(entry.rect, entry.text) for entry in entries] @property def max_x0(self) -> float: @@ -92,6 +112,13 @@ def is_contained(self, rect: fitz.Rect) -> bool: and self.rect().y1 <= rect.y1 ) + def to_json(self): + rect = self.rect() + return { + "rect": [rect.x0, rect.y0, rect.x1, rect.y1], + "entries": [entry.to_json() for entry in self.entries], + } + def find_layer_identifier_column_entries(all_words: list[TextWord]) -> list: r"""Find the layer identifier column entries. From 8e866366fb4c06bcf4a1082beab099e6784fbd37 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Tue, 21 May 2024 15:06:02 +0200 Subject: [PATCH 2/9] Documentation and minor refactor. --- src/stratigraphy/extract.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 4aadc630..2f23c8d6 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -235,7 +235,8 @@ def match_columns( """Match the depth column entries with the description lines. This function identifies groups of depth intervals and text blocks that are likely to match. - In this process, the number of text blocks is adjusted to match the number of depth intervals. + Makes a distinction between DepthColumn and LayerIdentifierColumn and obtains the corresponding text blocks + as well as their depth intervals where present. Args: depth_column (DepthColumn | LayerIdentifierColumn): The depth column. @@ -259,7 +260,9 @@ def match_columns( blocks = get_description_blocks_from_layer_identifier(depth_column.entries, description_lines) return [{"block": block} for block in blocks] else: - raise ValueError("depth_column must be a DepthColumn or a LayerIdentifierColumn.") + raise ValueError( + f"depth_column must be a DepthColumn or a LayerIdentifierColumn object. Got {type(depth_column)}." 
+ ) def transform_groups( From 44a492e687fcfd0b3272356fa161b6512add970e Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Wed, 22 May 2024 10:16:06 +0200 Subject: [PATCH 3/9] Depth interval recognition for Deriaz layouts. --- src/stratigraphy/extract.py | 9 ++- .../util/layer_identifier_column.py | 65 +++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 2f23c8d6..d48f9bf5 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -258,7 +258,14 @@ def match_columns( ] elif isinstance(depth_column, LayerIdentifierColumn): blocks = get_description_blocks_from_layer_identifier(depth_column.entries, description_lines) - return [{"block": block} for block in blocks] + groups = [] + for block in blocks: + depth_interval = depth_column.get_depth_interval(block) + if depth_interval: + groups.append({"depth_interval": depth_interval, "block": block}) + else: + groups.append({"block": block}) + return groups else: raise ValueError( f"depth_column must be a DepthColumn or a LayerIdentifierColumn object. Got {type(depth_column)}." diff --git a/src/stratigraphy/util/layer_identifier_column.py b/src/stratigraphy/util/layer_identifier_column.py index ce6f0356..85533338 100644 --- a/src/stratigraphy/util/layer_identifier_column.py +++ b/src/stratigraphy/util/layer_identifier_column.py @@ -3,8 +3,11 @@ import re import fitz +import numpy as np from stratigraphy.util.line import TextWord +from stratigraphy.util.depthcolumn import DepthColumnEntry, LayerDepthColumnEntry +from stratigraphy.util.textblock import TextBlock class LayerIdentifierEntry: @@ -112,6 +115,68 @@ def is_contained(self, rect: fitz.Rect) -> bool: and self.rect().y1 <= rect.y1 ) + def get_depth_interval(self, block: TextBlock) -> LayerDepthColumnEntry: + """Extract depth interval from a material description block. 
+ + For borehole profiles in the Deriaz layout, the depth interval is usually found in the text description + of the material. Often, these text descriptions contain a further separation into multiple sub layers. + These sub layers have their own depth intervals. This function extracts the overall depth interval, + spanning across all mentioned sub layers. + + Note: There is a lot of similarity with find_depth_columns.find_depth_columns. Refactoring may be desired. + + Args: + block (TextBlock): The block to calculate the depth interval for. + + Returns: + LayerDepthColumnEntry: The depth interval. + """ + + def value_as_float(string_value: str) -> float: # noqa: D103 + # OCR sometimes tends to miss the decimal comma + parsed_text = re.sub(r"-?([0-9]+)([0-9]{2})", r"\1.\2", string_value) + # remove final "." + parsed_text = re.sub(r"\D+$", "", parsed_text) + return abs(float(parsed_text)) + + # The regular expression pattern + pattern = re.compile(r"-?\d+[\.,]?\d*\s*[müMN\\.]*\s*-\s*\d+[\.,]?\d*\s*[müMN\\.]?") + + number_pairs = [] + rects = [] + for line in block.lines: + match = pattern.findall(line.text) + if match: + for m in match: + try: + numbers = m.split("-") + # Remove any trailing 'm' and leading/trailing whitespace, and replace commas with periods + numbers = [n.strip().replace("m", "").replace(",", ".") for n in numbers] + number_pairs.append([value_as_float(n) for n in numbers]) + if any(n is None for n in number_pairs[-1]): + number_pairs.pop() + continue + first_half_rect = fitz.Rect( + line.rect.x0, line.rect.y0, line.rect.x1 - line.rect.width / 2, line.rect.y1 + ) + second_half_rect = fitz.Rect( + line.rect.x0 + line.rect.width / 2, line.rect.y0, line.rect.x1, line.rect.y1 + ) + rects.append([first_half_rect, second_half_rect]) + except ValueError: + pass + + if number_pairs: + start_idx = np.argmin([pair[0] for pair in number_pairs]) + end_idx = np.argmax([pair[1] for pair in number_pairs]) + + start = DepthColumnEntry(rects[start_idx][0], 
number_pairs[start_idx][0]) + end = DepthColumnEntry(rects[end_idx][1], number_pairs[end_idx][1]) + + return LayerDepthColumnEntry(start, end) + else: + return None + def to_json(self): rect = self.rect() return { From 4ac569d843e4aff12449ed64b00541ea57072bfd Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Wed, 22 May 2024 11:19:13 +0200 Subject: [PATCH 4/9] Refactor: Refactored the way layer depth intervals are extracted from textblocks for Deriaz borehole profiles. --- src/stratigraphy/util/find_depth_columns.py | 64 ++++++++++++------- .../util/layer_identifier_column.py | 61 ++++++------------ src/stratigraphy/util/line.py | 6 ++ 3 files changed, 65 insertions(+), 66 deletions(-) diff --git a/src/stratigraphy/util/find_depth_columns.py b/src/stratigraphy/util/find_depth_columns.py index 78557966..9dfed63b 100644 --- a/src/stratigraphy/util/find_depth_columns.py +++ b/src/stratigraphy/util/find_depth_columns.py @@ -6,7 +6,7 @@ from stratigraphy.util.depthcolumn import BoundaryDepthColumn, LayerDepthColumn from stratigraphy.util.depthcolumnentry import DepthColumnEntry, LayerDepthColumnEntry -from stratigraphy.util.line import TextWord +from stratigraphy.util.line import TextWord, TextLine def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> list[DepthColumnEntry]: @@ -21,12 +21,6 @@ def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> lis Returns: list[DepthColumnEntry]: The extracted depth column entries. 
""" - - def value_as_float(string_value: str) -> float: # noqa: D103 - # OCR sometimes tends to miss the decimal comma - parsed_text = re.sub(r"^-?([0-9]+)([0-9]{2})", r"\1.\2", string_value) - return abs(float(parsed_text)) - entries = [] for word in sorted(all_words, key=lambda word: word.rect.y0): try: @@ -37,27 +31,51 @@ def value_as_float(string_value: str) -> float: # noqa: D103 value = value_as_float(match.group(1)) entries.append(DepthColumnEntry(word.rect, value)) elif include_splits: - # support for e.g. "1.10-1.60m" extracted as a single word - regex2 = re.compile(r"^-?([0-9]+(\.[0-9]+)?)[müMN\\.]*\W+([0-9]+(\.[0-9]+)?)[müMN\\.]*$") - match2 = regex2.match(input_string) - - if match2: - value1 = value_as_float(match2.group(1)) - first_half_rect = fitz.Rect( - word.rect.x0, word.rect.y0, word.rect.x1 - word.rect.width / 2, word.rect.y1 - ) - entries.append(DepthColumnEntry(first_half_rect, value1)) - - value2 = value_as_float(match2.group(3)) - second_half_rect = fitz.Rect( - word.rect.x0 + word.rect.width / 2, word.rect.y0, word.rect.x1, word.rect.y1 - ) - entries.append(DepthColumnEntry(second_half_rect, value2)) + entries.extend(extract_layer_depth_interval_entries(input_string, word)) except ValueError: pass return entries +def value_as_float(string_value: str) -> float: # noqa: D103 + # OCR sometimes tends to miss the decimal comma + parsed_text = re.sub(r"^-?([0-9]+)([0-9]{2})", r"\1.\2", string_value) + return abs(float(parsed_text)) + + +def extract_layer_depth_interval_entries( + input_string: str, line: TextWord|TextLine, require_start_of_string: bool = True +) -> list[DepthColumnEntry]: + """Extracts two DepthColumnEntry obejcts from a string to represent a layer depth interval. + + Args: + input_string (str): The string to extract the depth interval from. + line (TextLine): The line object containing the rectangle of the string. + require_start_of_string (bool, optional): Whether the number to extract needs to be + at the start of a string. 
Defaults to True. + + Returns: + list[DepthColumnEntry]: The extracted depth column entries. Either two entries or an empty list. + """ + input_string = line.text.strip().replace(",", ".") + entries = [] + query = r"-?([0-9]+(\.[0-9]+)?)[müMN\]*[\s-]+([0-9]+(\.[0-9]+)?)[müMN\\.]*" + if not require_start_of_string: + query = r".*?" + query + regex = re.compile(query) + match = regex.match(input_string) + if match: + value1 = value_as_float(match.group(1)) + first_half_rect = fitz.Rect(line.rect.x0, line.rect.y0, line.rect.x1 - line.rect.width / 2, line.rect.y1) + entries.append(DepthColumnEntry(first_half_rect, value1)) + + value2 = value_as_float(match.group(3)) + second_half_rect = fitz.Rect(line.rect.x0 + line.rect.width / 2, line.rect.y0, line.rect.x1, line.rect.y1) + entries.append(DepthColumnEntry(second_half_rect, value2)) + + return entries + + def find_layer_depth_columns(entries: list[DepthColumnEntry], all_words: list[TextWord]) -> list[LayerDepthColumn]: """Finds all layer depth columns. diff --git a/src/stratigraphy/util/layer_identifier_column.py b/src/stratigraphy/util/layer_identifier_column.py index 85533338..1118c0f1 100644 --- a/src/stratigraphy/util/layer_identifier_column.py +++ b/src/stratigraphy/util/layer_identifier_column.py @@ -6,7 +6,9 @@ import numpy as np from stratigraphy.util.line import TextWord -from stratigraphy.util.depthcolumn import DepthColumnEntry, LayerDepthColumnEntry +from stratigraphy.util.depthcolumn import LayerDepthColumnEntry +from stratigraphy.util.depthcolumn import LayerDepthColumnEntry +from stratigraphy.util.find_depth_columns import extract_layer_depth_interval_entries from stratigraphy.util.textblock import TextBlock @@ -123,55 +125,28 @@ def get_depth_interval(self, block: TextBlock) -> LayerDepthColumnEntry: These sub layers have their own depth intervals. This function extracts the overall depth interval, spanning across all mentioned sub layers. 
- Note: There is a lot of similarity with find_depth_columns.find_depth_columns. Refactoring may be desired. - Args: block (TextBlock): The block to calculate the depth interval for. Returns: LayerDepthColumnEntry: The depth interval. """ - - def value_as_float(string_value: str) -> float: # noqa: D103 - # OCR sometimes tends to miss the decimal comma - parsed_text = re.sub(r"-?([0-9]+)([0-9]{2})", r"\1.\2", string_value) - # remove final "." - parsed_text = re.sub(r"\D+$", "", parsed_text) - return abs(float(parsed_text)) - - # The regular expression pattern - pattern = re.compile(r"-?\d+[\.,]?\d*\s*[müMN\\.]*\s*-\s*\d+[\.,]?\d*\s*[müMN\\.]?") - - number_pairs = [] - rects = [] + depth_entries = [] for line in block.lines: - match = pattern.findall(line.text) - if match: - for m in match: - try: - numbers = m.split("-") - # Remove any trailing 'm' and leading/trailing whitespace, and replace commas with periods - numbers = [n.strip().replace("m", "").replace(",", ".") for n in numbers] - number_pairs.append([value_as_float(n) for n in numbers]) - if any(n is None for n in number_pairs[-1]): - number_pairs.pop() - continue - first_half_rect = fitz.Rect( - line.rect.x0, line.rect.y0, line.rect.x1 - line.rect.width / 2, line.rect.y1 - ) - second_half_rect = fitz.Rect( - line.rect.x0 + line.rect.width / 2, line.rect.y0, line.rect.x1, line.rect.y1 - ) - rects.append([first_half_rect, second_half_rect]) - except ValueError: - pass - - if number_pairs: - start_idx = np.argmin([pair[0] for pair in number_pairs]) - end_idx = np.argmax([pair[1] for pair in number_pairs]) - - start = DepthColumnEntry(rects[start_idx][0], number_pairs[start_idx][0]) - end = DepthColumnEntry(rects[end_idx][1], number_pairs[end_idx][1]) + try: + new_entries = extract_layer_depth_interval_entries(line.text, line, require_start_of_string=False) + if new_entries: + depth_entries.append(new_entries) + except ValueError: + pass + + if depth_entries: + # Merge the sub layers into one depth interval. 
+ start_idx = np.argmin([entry[0].value for entry in depth_entries]) + end_idx = np.argmax([entry[1].value for entry in depth_entries]) + + start = depth_entries[start_idx][0] + end = depth_entries[end_idx][1] return LayerDepthColumnEntry(start, end) else: diff --git a/src/stratigraphy/util/line.py b/src/stratigraphy/util/line.py index b88d0215..8fe0f9c9 100644 --- a/src/stratigraphy/util/line.py +++ b/src/stratigraphy/util/line.py @@ -24,6 +24,12 @@ def __init__(self, rect: fitz.Rect, text: str): def __repr__(self) -> str: return f"TextWord({self.rect}, {self.text})" + def to_json(self): + return { + "text": self.text, + "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1], + } + class TextLine: """Class to represent TextLine objects. From a17689aeab429877588ef47f85f73218916bf2ed Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Wed, 22 May 2024 11:40:54 +0200 Subject: [PATCH 5/9] satisfy linter. --- src/stratigraphy/util/find_depth_columns.py | 4 ++-- src/stratigraphy/util/layer_identifier_column.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/stratigraphy/util/find_depth_columns.py b/src/stratigraphy/util/find_depth_columns.py index 9dfed63b..f606ea90 100644 --- a/src/stratigraphy/util/find_depth_columns.py +++ b/src/stratigraphy/util/find_depth_columns.py @@ -6,7 +6,7 @@ from stratigraphy.util.depthcolumn import BoundaryDepthColumn, LayerDepthColumn from stratigraphy.util.depthcolumnentry import DepthColumnEntry, LayerDepthColumnEntry -from stratigraphy.util.line import TextWord, TextLine +from stratigraphy.util.line import TextLine, TextWord def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> list[DepthColumnEntry]: @@ -44,7 +44,7 @@ def value_as_float(string_value: str) -> float: # noqa: D103 def extract_layer_depth_interval_entries( - input_string: str, line: TextWord|TextLine, require_start_of_string: bool = True + input_string: str, line: TextWord | TextLine, require_start_of_string: bool = 
True ) -> list[DepthColumnEntry]: """Extracts two DepthColumnEntry obejcts from a string to represent a layer depth interval. diff --git a/src/stratigraphy/util/layer_identifier_column.py b/src/stratigraphy/util/layer_identifier_column.py index 1118c0f1..ea43293f 100644 --- a/src/stratigraphy/util/layer_identifier_column.py +++ b/src/stratigraphy/util/layer_identifier_column.py @@ -5,10 +5,9 @@ import fitz import numpy as np -from stratigraphy.util.line import TextWord -from stratigraphy.util.depthcolumn import LayerDepthColumnEntry from stratigraphy.util.depthcolumn import LayerDepthColumnEntry from stratigraphy.util.find_depth_columns import extract_layer_depth_interval_entries +from stratigraphy.util.line import TextWord from stratigraphy.util.textblock import TextBlock From cfe31f630e01a195b75de92c9978a8683e9bf0ff Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Wed, 22 May 2024 14:16:14 +0200 Subject: [PATCH 6/9] Refactoring; Docstring documentation. --- src/stratigraphy/util/find_depth_columns.py | 17 +++++++++-------- .../util/layer_identifier_column.py | 12 +++++++----- src/stratigraphy/util/line.py | 6 ------ 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/src/stratigraphy/util/find_depth_columns.py b/src/stratigraphy/util/find_depth_columns.py index f606ea90..792124bb 100644 --- a/src/stratigraphy/util/find_depth_columns.py +++ b/src/stratigraphy/util/find_depth_columns.py @@ -6,7 +6,7 @@ from stratigraphy.util.depthcolumn import BoundaryDepthColumn, LayerDepthColumn from stratigraphy.util.depthcolumnentry import DepthColumnEntry, LayerDepthColumnEntry -from stratigraphy.util.line import TextLine, TextWord +from stratigraphy.util.line import TextWord def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> list[DepthColumnEntry]: @@ -31,7 +31,8 @@ def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> lis value = value_as_float(match.group(1)) entries.append(DepthColumnEntry(word.rect, value)) 
elif include_splits: - entries.extend(extract_layer_depth_interval_entries(input_string, word)) + # support for e.g. "1.10-1.60m" extracted as a single word + entries.extend(extract_layer_depth_interval_entries(input_string, word.rect)) except ValueError: pass return entries @@ -44,20 +45,20 @@ def value_as_float(string_value: str) -> float: # noqa: D103 def extract_layer_depth_interval_entries( - input_string: str, line: TextWord | TextLine, require_start_of_string: bool = True + text: str, rect: fitz.Rect, require_start_of_string: bool = True ) -> list[DepthColumnEntry]: """Extracts two DepthColumnEntry obejcts from a string to represent a layer depth interval. Args: - input_string (str): The string to extract the depth interval from. - line (TextLine): The line object containing the rectangle of the string. + text (str): The string to extract the depth interval from. + rect (fitz.Rect): The rectangle of the text. require_start_of_string (bool, optional): Whether the number to extract needs to be at the start of a string. Defaults to True. Returns: list[DepthColumnEntry]: The extracted depth column entries. Either two entries or an empty list. 
""" - input_string = line.text.strip().replace(",", ".") + input_string = text.strip().replace(",", ".") entries = [] query = r"-?([0-9]+(\.[0-9]+)?)[müMN\]*[\s-]+([0-9]+(\.[0-9]+)?)[müMN\\.]*" if not require_start_of_string: @@ -66,11 +67,11 @@ def extract_layer_depth_interval_entries( match = regex.match(input_string) if match: value1 = value_as_float(match.group(1)) - first_half_rect = fitz.Rect(line.rect.x0, line.rect.y0, line.rect.x1 - line.rect.width / 2, line.rect.y1) + first_half_rect = fitz.Rect(rect.x0, rect.y0, rect.x1 - rect.width / 2, rect.y1) entries.append(DepthColumnEntry(first_half_rect, value1)) value2 = value_as_float(match.group(3)) - second_half_rect = fitz.Rect(line.rect.x0 + line.rect.width / 2, line.rect.y0, line.rect.x1, line.rect.y1) + second_half_rect = fitz.Rect(rect.x0 + rect.width / 2, rect.y0, rect.x1, rect.y1) entries.append(DepthColumnEntry(second_half_rect, value2)) return entries diff --git a/src/stratigraphy/util/layer_identifier_column.py b/src/stratigraphy/util/layer_identifier_column.py index ea43293f..d0bb3990 100644 --- a/src/stratigraphy/util/layer_identifier_column.py +++ b/src/stratigraphy/util/layer_identifier_column.py @@ -34,13 +34,13 @@ def to_json(self): class LayerIdentifierColumn: """Class for a layer identifier column.""" - def __init__(self, entries: list[TextWord]): + def __init__(self, words: list[TextWord]): """Initialize the LayerIdentifierColumn object. Args: - entries (list[TextWord]): The entries corresponding to the layer indices. + words (list[TextWord]): The entries corresponding to the layer indices. """ - self.entries = [LayerIdentifierEntry(entry.rect, entry.text) for entry in entries] + self.entries = [LayerIdentifierEntry(word.rect, word.text) for word in words] @property def max_x0(self) -> float: @@ -71,7 +71,7 @@ def add_entry(self, entry: TextWord): Args: entry (TextWord): The layer identifier column entry to be added. 
""" - self.entries.append(entry) + self.entries.append(LayerIdentifierEntry(entry.rect, entry.text)) def can_be_appended(self, rect: fitz.Rect) -> bool: """Checks if a new layer identifier column entry can be appended to the current layer identifier column. @@ -133,7 +133,9 @@ def get_depth_interval(self, block: TextBlock) -> LayerDepthColumnEntry: depth_entries = [] for line in block.lines: try: - new_entries = extract_layer_depth_interval_entries(line.text, line, require_start_of_string=False) + new_entries = extract_layer_depth_interval_entries(line.text, line.rect, require_start_of_string=False) + # require_start_of_string = False because the depth interval may not always start at the beginning + # of the line e.g. "Remblais Heterogene: 0.00 - 0.5m" if new_entries: depth_entries.append(new_entries) except ValueError: diff --git a/src/stratigraphy/util/line.py b/src/stratigraphy/util/line.py index 8fe0f9c9..b88d0215 100644 --- a/src/stratigraphy/util/line.py +++ b/src/stratigraphy/util/line.py @@ -24,12 +24,6 @@ def __init__(self, rect: fitz.Rect, text: str): def __repr__(self) -> str: return f"TextWord({self.rect}, {self.text})" - def to_json(self): - return { - "text": self.text, - "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1], - } - class TextLine: """Class to represent TextLine objects. 
From ddf82d1fc15920d4125fa295c7a12e11cb86129c Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Wed, 22 May 2024 16:48:08 +0200 Subject: [PATCH 7/9] Refactor: Refactored extract_layer_depth_interval --- src/stratigraphy/util/find_depth_columns.py | 23 +++++++++++-------- .../util/layer_identifier_column.py | 16 +++++-------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/src/stratigraphy/util/find_depth_columns.py b/src/stratigraphy/util/find_depth_columns.py index 792124bb..5ba730a0 100644 --- a/src/stratigraphy/util/find_depth_columns.py +++ b/src/stratigraphy/util/find_depth_columns.py @@ -32,7 +32,10 @@ def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> lis entries.append(DepthColumnEntry(word.rect, value)) elif include_splits: # support for e.g. "1.10-1.60m" extracted as a single word - entries.extend(extract_layer_depth_interval_entries(input_string, word.rect)) + layer_depth_column_entry = extract_layer_depth_interval(input_string, word.rect) + entries.extend( + [layer_depth_column_entry.start, layer_depth_column_entry.end] if layer_depth_column_entry else [] + ) except ValueError: pass return entries @@ -44,10 +47,10 @@ def value_as_float(string_value: str) -> float: # noqa: D103 return abs(float(parsed_text)) -def extract_layer_depth_interval_entries( +def extract_layer_depth_interval( text: str, rect: fitz.Rect, require_start_of_string: bool = True -) -> list[DepthColumnEntry]: - """Extracts two DepthColumnEntry obejcts from a string to represent a layer depth interval. +) -> LayerDepthColumnEntry | None: + """Extracts a LayerDepthColumnEntry from a string. Args: text (str): The string to extract the depth interval from. @@ -56,10 +59,10 @@ def extract_layer_depth_interval_entries( at the start of a string. Defaults to True. Returns: - list[DepthColumnEntry]: The extracted depth column entries. Either two entries or an empty list. 
+ LayerDepthColumnEntry | None: The extracted LayerDepthColumnEntry or None if none is found. """ input_string = text.strip().replace(",", ".") - entries = [] + query = r"-?([0-9]+(\.[0-9]+)?)[müMN\]*[\s-]+([0-9]+(\.[0-9]+)?)[müMN\\.]*" if not require_start_of_string: query = r".*?" + query @@ -68,13 +71,13 @@ def extract_layer_depth_interval_entries( if match: value1 = value_as_float(match.group(1)) first_half_rect = fitz.Rect(rect.x0, rect.y0, rect.x1 - rect.width / 2, rect.y1) - entries.append(DepthColumnEntry(first_half_rect, value1)) value2 = value_as_float(match.group(3)) second_half_rect = fitz.Rect(rect.x0 + rect.width / 2, rect.y0, rect.x1, rect.y1) - entries.append(DepthColumnEntry(second_half_rect, value2)) - - return entries + return LayerDepthColumnEntry( + DepthColumnEntry(first_half_rect, value1), DepthColumnEntry(second_half_rect, value2) + ) + return None def find_layer_depth_columns(entries: list[DepthColumnEntry], all_words: list[TextWord]) -> list[LayerDepthColumn]: diff --git a/src/stratigraphy/util/layer_identifier_column.py b/src/stratigraphy/util/layer_identifier_column.py index d0bb3990..327d6507 100644 --- a/src/stratigraphy/util/layer_identifier_column.py +++ b/src/stratigraphy/util/layer_identifier_column.py @@ -3,10 +3,9 @@ import re import fitz -import numpy as np from stratigraphy.util.depthcolumn import LayerDepthColumnEntry -from stratigraphy.util.find_depth_columns import extract_layer_depth_interval_entries +from stratigraphy.util.find_depth_columns import extract_layer_depth_interval from stratigraphy.util.line import TextWord from stratigraphy.util.textblock import TextBlock @@ -133,21 +132,18 @@ def get_depth_interval(self, block: TextBlock) -> LayerDepthColumnEntry: depth_entries = [] for line in block.lines: try: - new_entries = extract_layer_depth_interval_entries(line.text, line.rect, require_start_of_string=False) + layer_depth_entry = extract_layer_depth_interval(line.text, line.rect, require_start_of_string=False) # 
require_start_of_string = False because the depth interval may not always start at the beginning # of the line e.g. "Remblais Heterogene: 0.00 - 0.5m" - if new_entries: - depth_entries.append(new_entries) + if layer_depth_entry: + depth_entries.append(layer_depth_entry) except ValueError: pass if depth_entries: # Merge the sub layers into one depth interval. - start_idx = np.argmin([entry[0].value for entry in depth_entries]) - end_idx = np.argmax([entry[1].value for entry in depth_entries]) - - start = depth_entries[start_idx][0] - end = depth_entries[end_idx][1] + start = min([entry.start for entry in depth_entries], key=lambda start_entry: start_entry.value) + end = max([entry.end for entry in depth_entries], key=lambda end_entry: end_entry.value) return LayerDepthColumnEntry(start, end) else: From edb3e04284ceb230450f1975371a1b69b01b0fb6 Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Wed, 22 May 2024 17:00:18 +0200 Subject: [PATCH 8/9] only check for layer identifiers in the first word of each line --- src/stratigraphy/extract.py | 2 +- .../util/layer_identifier_column.py | 41 ++++++++++--------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index d48f9bf5..cb9732aa 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -72,7 +72,7 @@ def process_page(page: fitz.Page, geometric_lines, language: str, **params: dict current_line_words = [] # Detect Layer Index Columns - layer_identifier_entries = find_layer_identifier_column_entries(words) + layer_identifier_entries = find_layer_identifier_column_entries(lines) layer_identifier_columns = ( find_layer_identifier_column(layer_identifier_entries) if layer_identifier_entries else [] ) diff --git a/src/stratigraphy/util/layer_identifier_column.py b/src/stratigraphy/util/layer_identifier_column.py index d0bb3990..b3ad0f59 100644 --- a/src/stratigraphy/util/layer_identifier_column.py +++ 
b/src/stratigraphy/util/layer_identifier_column.py @@ -7,7 +7,7 @@ from stratigraphy.util.depthcolumn import LayerDepthColumnEntry from stratigraphy.util.find_depth_columns import extract_layer_depth_interval_entries -from stratigraphy.util.line import TextWord +from stratigraphy.util.line import TextLine from stratigraphy.util.textblock import TextBlock @@ -34,13 +34,13 @@ def to_json(self): class LayerIdentifierColumn: """Class for a layer identifier column.""" - def __init__(self, words: list[TextWord]): + def __init__(self, entries: list[LayerIdentifierEntry]): """Initialize the LayerIdentifierColumn object. Args: - words (list[TextWord]): The entries corresponding to the layer indices. + entries (list[LayerIdentifierEntry]): The entries corresponding to the layer indices. """ - self.entries = [LayerIdentifierEntry(word.rect, word.text) for word in words] + self.entries = entries @property def max_x0(self) -> float: @@ -65,13 +65,13 @@ def rect(self) -> fitz.Rect: def rects(self) -> list[fitz.Rect]: return [entry.rect for entry in self.entries] - def add_entry(self, entry: TextWord): + def add_entry(self, entry: LayerIdentifierEntry): """Add a new layer identifier column entry to the layer identifier column. Args: - entry (TextWord): The layer identifier column entry to be added. + entry (LayerIdentifierEntry): The layer identifier column entry to be added. """ - self.entries.append(LayerIdentifierEntry(entry.rect, entry.text)) + self.entries.append(entry) def can_be_appended(self, rect: fitz.Rect) -> bool: """Checks if a new layer identifier column entry can be appended to the current layer identifier column. @@ -161,7 +161,7 @@ def to_json(self): } -def find_layer_identifier_column_entries(all_words: list[TextWord]) -> list: +def find_layer_identifier_column_entries(lines: list[TextLine]) -> list[LayerIdentifierEntry]: r"""Find the layer identifier column entries. 
Regex explanation: @@ -172,30 +172,31 @@ def find_layer_identifier_column_entries(all_words: list[TextWord]) -> list: This regular expression will match strings like "1)", "2)", "a)", "b)", "1a4)", "6de)", etc. Args: - all_words (list[TextWord]): The words to search for layer identifier columns. + lines (list[TextLine]): The lines to search for layer identifier columns. Returns: - list: The layer identifier column entries. + list[LayerIdentifierEntry]: The layer identifier column entries. """ entries = [] - for word in sorted(all_words, key=lambda word: word.rect.y0): - # TODO There are quite a few false positives such as "(ca. 10 cm)" where "cm)" would be matched currently. - # Could we avoid some of those examples by requiring that the word is at the start of a line and/or there are - # no other words immediately to the left of it? - regex = re.compile(r"\b[\da-z-]+\)") - match = regex.match(word.text) - if match and len(word.text) < 7: - entries.append(word) + for line in sorted(lines, key=lambda line: line.rect.y0): + if len(line.words) > 0: + # Only match in the first word of every line, to avoid e.g. matching with "cm)" in a material description + # containing an expression like "(diameter max 6 cm)". + first_word = line.words[0] + regex = re.compile(r"\b[\da-z-]+\)") + match = regex.match(first_word.text) + if match and len(first_word.text) < 7: + entries.append(LayerIdentifierEntry(first_word.rect, first_word.text)) return entries -def find_layer_identifier_column(entries: list[TextWord]) -> list[LayerIdentifierColumn]: +def find_layer_identifier_column(entries: list[LayerIdentifierEntry]) -> list[LayerIdentifierColumn]: """Find the layer identifier column given the index column entries. Note: Similar to find_depth_columns.find_depth_columns. Refactoring may be desired. Args: - entries (list[TextWord]): The layer identifier column entries. + entries (list[LayerIdentifierEntry]): The layer identifier column entries. 
Returns: list[LayerIdentifierColumn]: The found layer identifier columns. From c8a6a8b1e255b9a27db308b99a99add4d94cbe52 Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Wed, 22 May 2024 17:11:37 +0200 Subject: [PATCH 9/9] undo accidental rename --- src/stratigraphy/util/find_depth_columns.py | 4 ++-- src/stratigraphy/util/layer_identifier_column.py | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/stratigraphy/util/find_depth_columns.py b/src/stratigraphy/util/find_depth_columns.py index 75a3c2b4..5ba730a0 100644 --- a/src/stratigraphy/util/find_depth_columns.py +++ b/src/stratigraphy/util/find_depth_columns.py @@ -32,7 +32,7 @@ def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> lis entries.append(DepthColumnEntry(word.rect, value)) elif include_splits: # support for e.g. "1.10-1.60m" extracted as a single word - layer_depth_column_entry = extract_layer_depth_interval_entries(input_string, word.rect) + layer_depth_column_entry = extract_layer_depth_interval(input_string, word.rect) entries.extend( [layer_depth_column_entry.start, layer_depth_column_entry.end] if layer_depth_column_entry else [] ) @@ -47,7 +47,7 @@ def value_as_float(string_value: str) -> float: # noqa: D103 return abs(float(parsed_text)) -def extract_layer_depth_interval_entries( +def extract_layer_depth_interval( text: str, rect: fitz.Rect, require_start_of_string: bool = True ) -> LayerDepthColumnEntry | None: """Extracts a LayerDepthColumnEntry from a string. 
diff --git a/src/stratigraphy/util/layer_identifier_column.py b/src/stratigraphy/util/layer_identifier_column.py index dba9b258..17a56a86 100644 --- a/src/stratigraphy/util/layer_identifier_column.py +++ b/src/stratigraphy/util/layer_identifier_column.py @@ -5,7 +5,7 @@ import fitz from stratigraphy.util.depthcolumn import LayerDepthColumnEntry -from stratigraphy.util.find_depth_columns import extract_layer_depth_interval_entries +from stratigraphy.util.find_depth_columns import extract_layer_depth_interval from stratigraphy.util.line import TextLine from stratigraphy.util.textblock import TextBlock @@ -132,9 +132,7 @@ def get_depth_interval(self, block: TextBlock) -> LayerDepthColumnEntry: depth_entries = [] for line in block.lines: try: - layer_depth_entry = extract_layer_depth_interval_entries( - line.text, line.rect, require_start_of_string=False - ) + layer_depth_entry = extract_layer_depth_interval(line.text, line.rect, require_start_of_string=False) # require_start_of_string = False because the depth interval may not always start at the beginning # of the line e.g. "Remblais Heterogene: 0.00 - 0.5m" if layer_depth_entry: