diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 6d035013..cb9732aa 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -5,7 +5,6 @@ import fitz -from stratigraphy import DATAPATH from stratigraphy.util import find_depth_columns from stratigraphy.util.dataclasses import Line from stratigraphy.util.depthcolumn import DepthColumn @@ -16,6 +15,7 @@ ) from stratigraphy.util.interval import BoundaryInterval, Interval from stratigraphy.util.layer_identifier_column import ( + LayerIdentifierColumn, find_layer_identifier_column, find_layer_identifier_column_entries, ) @@ -71,86 +71,55 @@ def process_page(page: fitz.Page, geometric_lines, language: str, **params: dict lines.append(TextLine(current_line_words)) current_line_words = [] - depth_column_entries = find_depth_columns.depth_column_entries(words, include_splits=True) - layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words) - - used_entry_rects = [] - for column in layer_depth_columns: - for entry in column.entries: - used_entry_rects.extend([entry.start.rect, entry.end.rect]) - - depth_column_entries = [ - entry - for entry in find_depth_columns.depth_column_entries(words, include_splits=False) - if entry.rect not in used_entry_rects - ] - depth_columns: list[DepthColumn] = layer_depth_columns - depth_columns.extend( - find_depth_columns.find_depth_columns( - depth_column_entries, words, depth_column_params=params["depth_column_params"] - ) - ) - # Detect Layer Index Columns - layer_identifier_entries = find_layer_identifier_column_entries(words) + layer_identifier_entries = find_layer_identifier_column_entries(lines) layer_identifier_columns = ( find_layer_identifier_column(layer_identifier_entries) if layer_identifier_entries else [] ) + pairs = [] if layer_identifier_columns: - layer_identifier_pairs = [] for layer_identifier_column in layer_identifier_columns: material_description_rect = 
find_material_description_column( lines, layer_identifier_column, language, **params["material_description"] ) if material_description_rect: - layer_identifier_pairs.append((layer_identifier_column, material_description_rect)) - - # Obtain the best pair. In contrast to depth columns, there only ever is one layer index column per page. - if layer_identifier_pairs: - layer_identifier_pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1])) - layer_identifier_column, material_description_rect = layer_identifier_pairs[-1] - # split the material description rect into blocks. - description_lines = get_description_lines(lines, material_description_rect) - blocks = get_description_blocks_from_layer_identifier(layer_identifier_column.entries, description_lines) - - predictions = [{"material_description": block.to_json()} for block in blocks] - predictions = parse_and_remove_empty_predictions(predictions) - - json_filtered_pairs = [ - { - "depth_column": None, - "material_description_rect": [ - material_description_rect.x0, - material_description_rect.y0, - material_description_rect.x1, - material_description_rect.y1, - ], - } - ] - - # Visualization: To be dropped before merging to main. - for layer_identifier_column in layer_identifier_columns: - fitz.utils.draw_rect( - page, layer_identifier_column.rect() * page.derotation_matrix, color=fitz.utils.getColor("blue") - ) - for block in blocks: - fitz.utils.draw_rect(page, block.rect * page.derotation_matrix, color=fitz.utils.getColor("red")) - fitz.utils.draw_rect( - page, material_description_rect * page.derotation_matrix, color=fitz.utils.getColor("blue") + pairs.append((layer_identifier_column, material_description_rect)) + + # Obtain the best pair. In contrast to depth columns, there only ever is one layer index column per page. + if pairs: + pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1])) + + # If there is a layer identifier column, then we use this directly. 
+ # Else, we search for depth columns. We could also think of some scoring mechanism to decide which one to use. + if not pairs: + depth_column_entries = find_depth_columns.depth_column_entries(words, include_splits=True) + layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words) + + used_entry_rects = [] + for column in layer_depth_columns: + for entry in column.entries: + used_entry_rects.extend([entry.start.rect, entry.end.rect]) + + depth_column_entries = [ + entry + for entry in find_depth_columns.depth_column_entries(words, include_splits=False) + if entry.rect not in used_entry_rects + ] + depth_columns: list[DepthColumn] = layer_depth_columns + depth_columns.extend( + find_depth_columns.find_depth_columns( + depth_column_entries, words, depth_column_params=params["depth_column_params"] ) - page.parent.save(DATAPATH / "_temp" / "output.pdf", garbage=4, deflate=True, clean=True) - - return predictions, json_filtered_pairs - - pairs = [] - for depth_column in depth_columns: - material_description_rect = find_material_description_column( - lines, depth_column, language, **params["material_description"] ) - if material_description_rect: - pairs.append((depth_column, material_description_rect)) - # lowest score first - pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1], words)) + + for depth_column in depth_columns: + material_description_rect = find_material_description_column( + lines, depth_column, language, **params["material_description"] + ) + if material_description_rect: + pairs.append((depth_column, material_description_rect)) + # lowest score first + pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1], words)) to_delete = [] for i, (_depth_column, material_description_rect) in enumerate(pairs): @@ -257,7 +226,7 @@ def score_column_match( def match_columns( - depth_column: DepthColumn, + depth_column: DepthColumn | LayerIdentifierColumn, description_lines: list[TextLine], geometric_lines: 
list[Line], material_description_rect: fitz.Rect, @@ -266,10 +235,11 @@ def match_columns( """Match the depth column entries with the description lines. This function identifies groups of depth intervals and text blocks that are likely to match. - In this process, the number of text blocks is adjusted to match the number of depth intervals. + Makes a distinction between DepthColumn and LayerIdentifierColumn and obtains the corresponding text blocks + as well as their depth intervals where present. Args: - depth_column (DepthColumn): The depth column. + depth_column (DepthColumn | LayerIdentifierColumn): The depth column. description_lines (list[TextLine]): The description lines. geometric_lines (list[Line]): The geometric lines. material_description_rect (fitz.Rect): The material description rectangle. @@ -278,13 +248,28 @@ def match_columns( Returns: list: The matched depth intervals and text blocks. """ - return [ - element - for group in depth_column.identify_groups( - description_lines, geometric_lines, material_description_rect, **params + if isinstance(depth_column, DepthColumn): + return [ + element + for group in depth_column.identify_groups( + description_lines, geometric_lines, material_description_rect, **params + ) + for element in transform_groups(group["depth_intervals"], group["blocks"], **params) + ] + elif isinstance(depth_column, LayerIdentifierColumn): + blocks = get_description_blocks_from_layer_identifier(depth_column.entries, description_lines) + groups = [] + for block in blocks: + depth_interval = depth_column.get_depth_interval(block) + if depth_interval: + groups.append({"depth_interval": depth_interval, "block": block}) + else: + groups.append({"block": block}) + return groups + else: + raise ValueError( + f"depth_column must be a DepthColumn or a LayerIdentifierColumn object. Got {type(depth_column)}." 
) - for element in transform_groups(group["depth_intervals"], group["blocks"], **params) - ] def transform_groups( diff --git a/src/stratigraphy/util/find_depth_columns.py b/src/stratigraphy/util/find_depth_columns.py index 78557966..5ba730a0 100644 --- a/src/stratigraphy/util/find_depth_columns.py +++ b/src/stratigraphy/util/find_depth_columns.py @@ -21,12 +21,6 @@ def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> lis Returns: list[DepthColumnEntry]: The extracted depth column entries. """ - - def value_as_float(string_value: str) -> float: # noqa: D103 - # OCR sometimes tends to miss the decimal comma - parsed_text = re.sub(r"^-?([0-9]+)([0-9]{2})", r"\1.\2", string_value) - return abs(float(parsed_text)) - entries = [] for word in sorted(all_words, key=lambda word: word.rect.y0): try: @@ -38,26 +32,54 @@ def value_as_float(string_value: str) -> float: # noqa: D103 entries.append(DepthColumnEntry(word.rect, value)) elif include_splits: # support for e.g. "1.10-1.60m" extracted as a single word - regex2 = re.compile(r"^-?([0-9]+(\.[0-9]+)?)[müMN\\.]*\W+([0-9]+(\.[0-9]+)?)[müMN\\.]*$") - match2 = regex2.match(input_string) - - if match2: - value1 = value_as_float(match2.group(1)) - first_half_rect = fitz.Rect( - word.rect.x0, word.rect.y0, word.rect.x1 - word.rect.width / 2, word.rect.y1 - ) - entries.append(DepthColumnEntry(first_half_rect, value1)) - - value2 = value_as_float(match2.group(3)) - second_half_rect = fitz.Rect( - word.rect.x0 + word.rect.width / 2, word.rect.y0, word.rect.x1, word.rect.y1 - ) - entries.append(DepthColumnEntry(second_half_rect, value2)) + layer_depth_column_entry = extract_layer_depth_interval(input_string, word.rect) + entries.extend( + [layer_depth_column_entry.start, layer_depth_column_entry.end] if layer_depth_column_entry else [] + ) except ValueError: pass return entries +def value_as_float(string_value: str) -> float: # noqa: D103 + # OCR sometimes tends to miss the decimal comma + parsed_text = 
re.sub(r"^-?([0-9]+)([0-9]{2})", r"\1.\2", string_value) + return abs(float(parsed_text)) + + +def extract_layer_depth_interval( + text: str, rect: fitz.Rect, require_start_of_string: bool = True +) -> LayerDepthColumnEntry | None: + """Extracts a LayerDepthColumnEntry from a string. + + Args: + text (str): The string to extract the depth interval from. + rect (fitz.Rect): The rectangle of the text. + require_start_of_string (bool, optional): Whether the number to extract needs to be + at the start of a string. Defaults to True. + + Returns: + LayerDepthColumnEntry | None: The extracted LayerDepthColumnEntry or None if none is found. + """ + input_string = text.strip().replace(",", ".") + + query = r"-?([0-9]+(\.[0-9]+)?)[müMN\\.]*[\s-]+([0-9]+(\.[0-9]+)?)[müMN\\.]*" + if not require_start_of_string: + query = r".*?" + query + regex = re.compile(query) + match = regex.match(input_string) + if match: + value1 = value_as_float(match.group(1)) + first_half_rect = fitz.Rect(rect.x0, rect.y0, rect.x1 - rect.width / 2, rect.y1) + + value2 = value_as_float(match.group(3)) + second_half_rect = fitz.Rect(rect.x0 + rect.width / 2, rect.y0, rect.x1, rect.y1) + return LayerDepthColumnEntry( + DepthColumnEntry(first_half_rect, value1), DepthColumnEntry(second_half_rect, value2) + ) + return None + + def find_layer_depth_columns(entries: list[DepthColumnEntry], all_words: list[TextWord]) -> list[LayerDepthColumn]: """Finds all layer depth columns. 
diff --git a/src/stratigraphy/util/layer_identifier_column.py b/src/stratigraphy/util/layer_identifier_column.py index 5d8316bc..17a56a86 100644 --- a/src/stratigraphy/util/layer_identifier_column.py +++ b/src/stratigraphy/util/layer_identifier_column.py @@ -4,17 +4,40 @@ import fitz -from stratigraphy.util.line import TextWord +from stratigraphy.util.depthcolumn import LayerDepthColumnEntry +from stratigraphy.util.find_depth_columns import extract_layer_depth_interval +from stratigraphy.util.line import TextLine +from stratigraphy.util.textblock import TextBlock + + +class LayerIdentifierEntry: + """Class for a layer identifier entry. + + Note: As of now this is very similar to DepthColumnEntry. Refactoring may be desired. + """ + + def __init__(self, rect: fitz.Rect, text: str): + self.rect = rect + self.text = text + + def __repr__(self): + return str(self.text) + + def to_json(self): + return { + "text": self.text, + "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1], + } class LayerIdentifierColumn: """Class for a layer identifier column.""" - def __init__(self, entries: list[TextWord]): + def __init__(self, entries: list[LayerIdentifierEntry]): """Initialize the LayerIdentifierColumn object. Args: - entries (list[TextWord]): The entries corresponding to the layer indices. + entries (list[LayerIdentifierEntry]): The entries corresponding to the layer indices. """ self.entries = entries @@ -41,11 +64,11 @@ def rect(self) -> fitz.Rect: def rects(self) -> list[fitz.Rect]: return [entry.rect for entry in self.entries] - def add_entry(self, entry: TextWord): + def add_entry(self, entry: LayerIdentifierEntry): """Add a new layer identifier column entry to the layer identifier column. Args: - entry (TextWord): The layer identifier column entry to be added. + entry (LayerIdentifierEntry): The layer identifier column entry to be added. 
""" self.entries.append(entry) @@ -92,8 +115,49 @@ def is_contained(self, rect: fitz.Rect) -> bool: and self.rect().y1 <= rect.y1 ) + def get_depth_interval(self, block: TextBlock) -> LayerDepthColumnEntry: + """Extract depth interval from a material description block. + + For borehole profiles in the Deriaz layout, the depth interval is usually found in the text description + of the material. Often, these text descriptions contain a further separation into multiple sub layers. + These sub layers have their own depth intervals. This function extracts the overall depth interval, + spanning across all mentioned sub layers. + + Args: + block (TextBlock): The block to calculate the depth interval for. -def find_layer_identifier_column_entries(all_words: list[TextWord]) -> list: + Returns: + LayerDepthColumnEntry: The depth interval. + """ + depth_entries = [] + for line in block.lines: + try: + layer_depth_entry = extract_layer_depth_interval(line.text, line.rect, require_start_of_string=False) + # require_start_of_string = False because the depth interval may not always start at the beginning + # of the line e.g. "Remblais Heterogene: 0.00 - 0.5m" + if layer_depth_entry: + depth_entries.append(layer_depth_entry) + except ValueError: + pass + + if depth_entries: + # Merge the sub layers into one depth interval. + start = min([entry.start for entry in depth_entries], key=lambda start_entry: start_entry.value) + end = max([entry.end for entry in depth_entries], key=lambda end_entry: end_entry.value) + + return LayerDepthColumnEntry(start, end) + else: + return None + + def to_json(self): + rect = self.rect() + return { + "rect": [rect.x0, rect.y0, rect.x1, rect.y1], + "entries": [entry.to_json() for entry in self.entries], + } + + +def find_layer_identifier_column_entries(lines: list[TextLine]) -> list[LayerIdentifierEntry]: r"""Find the layer identifier column entries. 
Regex explanation: @@ -104,30 +168,31 @@ def find_layer_identifier_column_entries(all_words: list[TextWord]) -> list: This regular expression will match strings like "1)", "2)", "a)", "b)", "1a4)", "6de)", etc. Args: - all_words (list[TextWord]): The words to search for layer identifier columns. + lines (list[TextLine]): The lines to search for layer identifier columns. Returns: - list: The layer identifier column entries. + list[LayerIdentifierEntry]: The layer identifier column entries. """ entries = [] - for word in sorted(all_words, key=lambda word: word.rect.y0): - # TODO There are quite a few false positives such as "(ca. 10 cm)" where "cm)" would be matched currently. - # Could we avoid some of those examples by requiring that the word is at the start of a line and/or there are - # no other words immediately to the left of it? - regex = re.compile(r"\b[\da-z-]+\)") - match = regex.match(word.text) - if match and len(word.text) < 7: - entries.append(word) + for line in sorted(lines, key=lambda line: line.rect.y0): + if len(line.words) > 0: + # Only match in the first word of every line, to avoid e.g. matching with "cm)" in a material description + # containing an expression like "(diameter max 6 cm)". + first_word = line.words[0] + regex = re.compile(r"\b[\da-z-]+\)") + match = regex.match(first_word.text) + if match and len(first_word.text) < 7: + entries.append(LayerIdentifierEntry(first_word.rect, first_word.text)) return entries -def find_layer_identifier_column(entries: list[TextWord]) -> list[LayerIdentifierColumn]: +def find_layer_identifier_column(entries: list[LayerIdentifierEntry]) -> list[LayerIdentifierColumn]: """Find the layer identifier column given the index column entries. Note: Similar to find_depth_columns.find_depth_columns. Refactoring may be desired. Args: - entries (list[TextWord]): The layer identifier column entries. + entries (list[LayerIdentifierEntry]): The layer identifier column entries. 
Returns: list[LayerIdentifierColumn]: The found layer identifier columns.