From 573b9f85e455fa43727f3d2fd24d4cf67ff50cb6 Mon Sep 17 00:00:00 2001
From: Renato Durrer
Date: Tue, 2 Apr 2024 18:19:45 +0200
Subject: [PATCH 01/19] Create CLI interface.

---
 environment-dev.yml      |  3 ++-
 environment-prod.yml     |  1 +
 pyproject.toml           |  3 +++
 src/stratigraphy/main.py | 49 ++++++++++++++++++++++++++++++++--------
 4 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/environment-dev.yml b/environment-dev.yml
index d77a88ca..552f7558 100644
--- a/environment-dev.yml
+++ b/environment-dev.yml
@@ -10,7 +10,7 @@ dependencies:
   - pathlib==1.0.1
   - opencv==4.9.0
   - python-dotenv==1.0.1
-  - pytest==8.1.1
+  - click==8.1.7
   - pip
   # dev dependencies
   - matplotlib==3.8.0
@@ -18,6 +18,7 @@ dependencies:
   - jupyterlab==4.1.3
   - black==24.2.0
   - pre-commit==3.6.2
+  - pytest==8.1.1
   - pip:
       # prod pip dependencies; needs to be a strict copy of environment-prod.yml
       - amazon-textract-textractor
diff --git a/environment-prod.yml b/environment-prod.yml
index 02e3b443..da30c50e 100644
--- a/environment-prod.yml
+++ b/environment-prod.yml
@@ -9,6 +9,7 @@ dependencies:
   - pathlib==1.0.1
   - opencv==4.9.0
   - python-dotenv==1.0.1
+  - click==8.1.7
   - pip
   - pip:
       - amazon-textract-textractor
diff --git a/pyproject.toml b/pyproject.toml
index bb3efa8d..ca0fbde4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,6 +7,9 @@ requires-python = ">=3.10"
 dependencies = [
 ]
 
+[project.scripts]
+boreholes-extract-materials = "stratigraphy.main:start_pipeline"
+
 [tool.ruff.lint]
 select = [
     # pydocstyle
diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py
index f28463cd..1c437d83 100644
--- a/src/stratigraphy/main.py
+++ b/src/stratigraphy/main.py
@@ -6,6 +6,7 @@
 import os
 from pathlib import Path
 
+import click
 import fitz
 from dotenv import load_dotenv
 
@@ -502,9 +503,40 @@ def perform_matching(directory: Path, **params: dict) -> dict:
     return output
 
 
-if __name__ == "__main__":
-    # setup mlflow tracking; should be started before any other code
-    # such that tracking is enabled in other parts of the code.
+@click.command()
+@click.option(
+    "--input_directory",
+    type=click.Path(exists=True, path_type=Path),
+    default=DATAPATH / "Benchmark",
+    help="Path to the input directory.",
+)
+@click.option(
+    "--ground_truth_path",
+    type=click.Path(exists=True, path_type=Path),
+    default=DATAPATH / "Benchmark" / "ground_truth.json",
+    help="Path to the ground truth file.",
+)
+@click.option(
+    "--out_directory",
+    type=click.Path(path_type=Path),
+    default=DATAPATH / "Benchmark" / "evaluation",
+    help="Path to the output directory.",
+)
+@click.option(
+    "--predictions_path",
+    type=click.Path(path_type=Path),
+    default=DATAPATH / "Benchmark" / "extract" / "predictions.json",
+    help="Path to the predictions file.",
+)
+def start_pipeline(input_directory: Path, ground_truth_path: Path, out_directory: Path, predictions_path: Path):
+    """Description.
+
+    Args:
+        input_directory (Path): _description_
+        ground_truth_path (Path): _description_
+        out_directory (Path): _description_
+        predictions_path (Path): _description_
+    """
     if mlflow_tracking:
         import mlflow
 
@@ -513,13 +545,8 @@ def perform_matching(directory: Path, **params: dict) -> dict:
         mlflow.log_params(flatten(line_detection_params))
         mlflow.log_params(flatten(matching_params))
 
-    # instantiate all paths
-    input_directory = DATAPATH / "Benchmark"
-    ground_truth_path = input_directory / "ground_truth.json"
-    out_directory = input_directory / "evaluation"
-    predictions_path = input_directory / "extract" / "predictions.json"
     temp_directory = DATAPATH / "_temp"  # temporary directory to dump files for mlflow artifact logging
-
+    # check if directories exist and create them when necessary
     # check if directories exist and create them when necessary
     out_directory.mkdir(parents=True, exist_ok=True)
     temp_directory.mkdir(parents=True, exist_ok=True)
@@ -538,3 +565,7 @@ def perform_matching(directory: Path, **params: dict) -> dict:
     if mlflow_tracking:
         mlflow.log_metrics(metrics)
         mlflow.log_artifact(temp_directory / "document_level_metrics.csv")
+
+
+if __name__ == "__main__":
+    start_pipeline()
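The `[project.scripts]` entry above turns `start_pipeline` into a console command once the package is installed, so the pipeline no longer has to be launched via `python src/stratigraphy/main.py`. A typical invocation (paths illustrative; every option falls back to the `DATAPATH / "Benchmark"` defaults defined in the patch):

    boreholes-extract-materials --input_directory data/Benchmark --out_directory data/Benchmark/evaluation

The command can also be exercised without installing it, for example from a test, using click's test runner — a minimal sketch, assuming the package is importable:

    from click.testing import CliRunner

    from stratigraphy.main import start_pipeline

    runner = CliRunner()
    result = runner.invoke(start_pipeline, ["--input_directory", "data/Benchmark"])
    print(result.exit_code, result.output)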
+ """ + words = [] + words_by_line = {} + for x0, y0, x1, y1, word, block_no, line_no, _word_no in fitz.utils.get_text(page, "words"): + rect = fitz.Rect(x0, y0, x1, y1) * page.rotation_matrix + depth_interval = DepthInterval(rect, word) + words.append(TextLine([depth_interval])) + key = f"{block_no}_{line_no}" + if key not in words_by_line: + words_by_line[key] = [] + words_by_line[key].append(depth_interval) + + raw_lines = [TextLine(words_by_line[key]) for key in words_by_line] + + lines = [] + current_line_words = [] + for line_index, raw_line in enumerate(raw_lines): + for word_index, word in enumerate(raw_line.words): + remaining_line = TextLine(raw_line.words[word_index:]) + if len(current_line_words) > 0 and remaining_line.is_line_start(lines, raw_lines[line_index + 1 :]): + lines.append(TextLine(current_line_words)) + current_line_words = [] + current_line_words.append(word) + if len(current_line_words): + lines.append(TextLine(current_line_words)) + current_line_words = [] + + depth_column_entries = find_depth_columns.depth_column_entries(words, include_splits=True) + layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words) + + used_entry_rects = [] + for column in layer_depth_columns: + for entry in column.entries: + used_entry_rects.extend([entry.start.rect, entry.end.rect]) + + depth_column_entries = [ + entry + for entry in find_depth_columns.depth_column_entries(words, include_splits=False) + if entry.rect not in used_entry_rects + ] + + depth_columns: list[DepthColumn] = layer_depth_columns + depth_columns.extend(find_depth_columns.find_depth_columns(depth_column_entries, words)) + + pairs = [] + for depth_column in depth_columns: + material_description_rect = find_material_description_column(lines, depth_column) + if material_description_rect: + pairs.append((depth_column, material_description_rect)) + + # lowest score first + pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1], words)) + + to_delete = [] + for i, (_depth_column, material_description_rect) in enumerate(pairs): + for _depth_column_2, material_description_rect_2 in pairs[i + 1 :]: + if material_description_rect.intersects(material_description_rect_2): + to_delete.append(i) + continue + filtered_pairs = [item for index, item in enumerate(pairs) if index not in to_delete] + + geometric_lines = extract_lines(page, line_detection_params) + + groups = [] # list of matched depth intervals and text blocks + # groups is of the form: ["depth_interval": BoundaryInterval, "block": TextBlock] + if len(filtered_pairs): # match depth column items with material description + for depth_column, material_description_rect in filtered_pairs: + description_lines = get_description_lines(lines, material_description_rect) + if len(description_lines) > 1: + new_groups = match_columns( + depth_column, description_lines, geometric_lines, material_description_rect, **params + ) + groups.extend(new_groups) + json_filtered_pairs = [ + { + "depth_column": depth_column.to_json(), + "material_description_rect": [ + material_description_rect.x0, + material_description_rect.y0, + material_description_rect.x1, + material_description_rect.y1, + ], + } + for depth_column, material_description_rect in filtered_pairs + ] + + else: + json_filtered_pairs = [] + # Fallback when no depth column was found + material_description_rect = find_material_description_column(lines, depth_column=None) + if material_description_rect: + description_lines = get_description_lines(lines, material_description_rect) + 
description_blocks = get_description_blocks( + description_lines, + geometric_lines, + material_description_rect, + params["block_line_ratio"], + params["left_line_length_threshold"], + ) + groups.extend([{"block": block} for block in description_blocks]) + json_filtered_pairs.extend( + [ + { + "depth_column": None, + "material_description_rect": [ + material_description_rect.x0, + material_description_rect.y0, + material_description_rect.x1, + material_description_rect.y1, + ], + } + ] + ) + predictions = [ + {"material_description": group["block"].to_json(), "depth_interval": group["depth_interval"].to_json()} + if "depth_interval" in group + else {"material_description": group["block"].to_json()} + for group in groups + ] + predictions = parse_and_remove_empty_predictions(predictions) + return predictions, json_filtered_pairs + + +def score_column_match( + depth_column: DepthColumn, + material_description_rect: fitz.Rect, + all_words: list[TextLine] | None = None, + **params: dict, +) -> float: + """Scores the match between a depth column and a material description. + + Args: + depth_column (DepthColumn): The depth column. + material_description_rect (fitz.Rect): The material description rectangle. + all_words (list[TextLine] | None, optional): List of the available textlines. Defaults to None. + **params (dict): Additional parameters for the matching pipeline. Kept for compatibility with the pipeline. + + Returns: + float: The score of the match. + """ + rect = depth_column.rect() + top = rect.y0 + bottom = rect.y1 + right = rect.x1 + distance = ( + abs(top - material_description_rect.y0) + + abs(bottom - material_description_rect.y1) + + abs(right - material_description_rect.x0) + ) + + height = bottom - top + + noise_count = depth_column.noise_count(all_words) if all_words else 0 + + return (height - distance) * math.pow(0.8, noise_count) + + +def match_columns( + depth_column: DepthColumn, + description_lines: list[TextLine], + geometric_lines: list[Line], + material_description_rect: fitz.Rect, + **params: dict, +) -> list: + """Match the depth column entries with the description lines. + + This function identifies groups of depth intervals and text blocks that are likely to match. + In this process, the number of text blocks is adjusted to match the number of depth intervals. + + Args: + depth_column (DepthColumn): The depth column. + description_lines (list[TextLine]): The description lines. + geometric_lines (list[Line]): The geometric lines. + material_description_rect (fitz.Rect): The material description rectangle. + **params (dict): Additional parameters for the matching pipeline. + + Returns: + list: The matched depth intervals and text blocks. + """ + return [ + element + for group in depth_column.identify_groups( + description_lines, geometric_lines, material_description_rect, **params + ) + for element in transform_groups(group["depth_intervals"], group["blocks"], **params) + ] + + +def transform_groups( + depth_intervals: list[Interval], blocks: list[TextBlock], **params: dict +) -> list[dict[str, Interval | TextBlock]]: + """Transforms the text blocks such that their number equals the number of depth intervals. + + If there are more depth intervals than text blocks, text blocks are splitted. When there + are more text blocks than depth intervals, text blocks are merged. If the number of text blocks + and depth intervals equals, we proceed with the pairing. + + Args: + depth_intervals (List[Interval]): The depth intervals from the pdf. 
+ blocks (List[TextBlock]): Found textblocks from the pdf. + **params (dict): Additional parameters for the matching pipeline. + + Returns: + List[Dict[str, Union[Interval, TextBlock]]]: Pairing of text blocks and depth intervals. + """ + if len(depth_intervals) == 0: + return [] + elif len(depth_intervals) == 1: + concatenated_block = TextBlock( + [line for block in blocks for line in block.lines] + ) # concatenate all text lines within a block; line separation flag does not matter here. + return [{"depth_interval": depth_intervals[0], "block": concatenated_block}] + else: + if len(blocks) < len(depth_intervals): + blocks = split_blocks_by_textline_length(blocks, target_split_count=len(depth_intervals) - len(blocks)) + + if len(blocks) > len(depth_intervals): + # create additional depth intervals with end & start value None to match the number of blocks + depth_intervals.extend([BoundaryInterval(None, None) for _ in range(len(blocks) - len(depth_intervals))]) + + return [ + {"depth_interval": depth_interval, "block": block} + for depth_interval, block in zip(depth_intervals, blocks, strict=False) + ] + + +def merge_blocks_by_vertical_spacing(blocks: list[TextBlock], target_merge_count: int) -> list[TextBlock]: + """Merge textblocks without any geometric lines that separates them. + + Note: Deprecated. Currently not in use any more. Kept here until we are sure that it is not needed anymore. + + The logic looks at the distances between the textblocks and merges them if they are closer + than a certain cutoff. + + Args: + blocks (List[TextBlock]): Textblocks that are to be merged. + target_merge_count (int): the number of merges that we'd like to happen (i.e. we'd like the total number of + blocks to be reduced by this number) + + Returns: + List[TextBlock]: The merged textblocks. + """ + distances = [] + for block_index in range(len(blocks) - 1): + distances.append(block_distance(blocks[block_index], blocks[block_index + 1])) + cutoff = sorted(distances)[target_merge_count - 1] # merge all blocks that have a distance smaller than this + merged_count = 0 + merged_blocks = [] + current_merged_block = blocks[0] + for block_index in range(len(blocks) - 1): + new_block = blocks[block_index + 1] + if ( + merged_count < target_merge_count + and block_distance(blocks[block_index], blocks[block_index + 1]) <= cutoff + ): + current_merged_block = current_merged_block.concatenate(new_block) + merged_count += 1 + else: + merged_blocks.append(current_merged_block) + current_merged_block = new_block + + if len(current_merged_block.lines): + merged_blocks.append(current_merged_block) + return merged_blocks + + +def split_blocks_by_textline_length(blocks: list[TextBlock], target_split_count: int) -> list[TextBlock]: + """Split textblocks without any geometric lines that separates them. + + The logic looks at the lengths of the text lines and cuts them off + if there are textlines that are shorter than others. + # TODO: Extend documentation about logic. + + Args: + blocks (List[TextBlock]): Textblocks that are to be split. + target_split_count (int): the number of splits that we'd like to happen (i.e. we'd like the total number of + blocks to be increased by this number) + + Returns: + List[TextBlock]: The split textblocks. 
+ """ + line_lengths = sorted([line.rect.x1 for block in blocks for line in block.lines[:-1]]) + if len(line_lengths) <= target_split_count: # In that case each line is a block + return [TextBlock([line]) for block in blocks for line in block.lines] + else: + cutoff_values = line_lengths[:target_split_count] # all lines inside cutoff_values will be split line + split_blocks = [] + current_block_lines = [] + for block in blocks: + for line_index in range(block.line_count): + line = block.lines[line_index] + current_block_lines.append(line) + if line_index < block.line_count - 1 and line.rect.x1 in cutoff_values: + split_blocks.append(TextBlock(current_block_lines)) + cutoff_values.remove(line.rect.x1) + current_block_lines = [] + if len(current_block_lines): + split_blocks.append(TextBlock(current_block_lines)) + current_block_lines = [] + if ( + block.is_terminated_by_line + ): # If block was terminated by a line, populate the flag to the last element of split_blocks. + split_blocks[-1].is_terminated_by_line = True + return split_blocks + + +def find_material_description_column( + lines: list[TextLine], depth_column: DepthColumn, **params: dict +) -> fitz.Rect | None: + """Find the material description column given a depth column. + + Args: + lines (list[TextLine]): The text lines of the page. + depth_column (DepthColumn): The depth column. + **params (dict): Additional parameters for the matching pipeline. + + Returns: + fitz.Rect | None: The material description column. + """ + if depth_column: + above_depth_column = [ + line + for line in lines + if x_overlap(line.rect, depth_column.rect()) and line.rect.y0 < depth_column.rect().y0 + ] + + min_y0 = max(line.rect.y0 for line in above_depth_column) if len(above_depth_column) else -1 + + def check_y0_condition(y0): + return y0 > min_y0 and y0 < depth_column.rect().y1 + else: + + def check_y0_condition(y0): + return True + + candidate_description = [line for line in lines if check_y0_condition(line.rect.y0)] + is_description = [line for line in candidate_description if line.is_description] + + if len(candidate_description) == 0: + return + + description_clusters = [] + while len(is_description) > 0: + coverage_by_generating_line = [ + [other for other in is_description if x_overlap_significant_smallest(line.rect, other.rect, 0.5)] + for line in is_description + ] + + def filter_coverage(coverage): + if len(coverage): + min_x0 = min(line.rect.x0 for line in coverage) + max_x1 = max(line.rect.x1 for line in coverage) + x0_threshold = max_x1 - 0.4 * ( + max_x1 - min_x0 + ) # how did we determine the 0.4? Should it be a parameter? What would it do if we were to change it? + return [line for line in coverage if line.rect.x0 < x0_threshold] + else: + return [] + + coverage_by_generating_line = [filter_coverage(coverage) for coverage in coverage_by_generating_line] + max_coverage = max(coverage_by_generating_line, key=len) + description_clusters.append(max_coverage) + is_description = [line for line in is_description if line not in max_coverage] + + candidate_rects = [] + + for cluster in description_clusters: + best_y0 = min([line.rect.y0 for line in cluster]) + best_y1 = max([line.rect.y1 for line in cluster]) + + min_description_x0 = min( + [ + line.rect.x0 - 0.01 * line.rect.width for line in cluster + ] # How did we determine the 0.01? Should it be a parameter? What would it do if we were to change it? + ) + max_description_x0 = max( + [ + line.rect.x0 + 0.2 * line.rect.width for line in cluster + ] # How did we determine the 0.2? 
Should it be a parameter? What would it do if we were to change it? + ) + good_lines = [ + line + for line in candidate_description + if line.rect.y0 >= best_y0 and line.rect.y1 <= best_y1 + if min_description_x0 < line.rect.x0 < max_description_x0 + ] + best_x0 = min([line.rect.x0 for line in good_lines]) + best_x1 = max([line.rect.x1 for line in good_lines]) + + # expand to include entire last block + def is_below(best_x0, best_y1, line): + return ( + ( + line.rect.x0 > best_x0 - 5 + ) # How did we determine the 5? Should it be a parameter? What would it do if we were to change it? + and (line.rect.x0 < (best_x0 + best_x1) / 2) # noqa B023 + and ( + line.rect.y0 < best_y1 + 10 + ) # How did we determine the 10? Should it be a parameter? What would it do if we were to change it? + and (line.rect.y1 > best_y1) + ) + + continue_search = True + while continue_search: + line = next((line for line in lines if is_below(best_x0, best_y1, line)), None) + if line: + best_x0 = min(best_x0, line.rect.x0) + best_x1 = max(best_x1, line.rect.x1) + best_y1 = line.rect.y1 + else: + continue_search = False + + candidate_rects.append(fitz.Rect(best_x0, best_y0, best_x1, best_y1)) + + if len(candidate_rects) == 0: + return None + if depth_column: + return max(candidate_rects, key=lambda rect: score_column_match(depth_column, rect)) + else: + return candidate_rects[0] + + +def perform_matching(directory: Path, **params: dict) -> dict: + """Perform the matching of text blocks with depth intervals. + + Args: + directory (Path): Path to the directory that contains the pdfs. + **params (dict): Additional parameters for the matching pipeline. + + Returns: + dict: The predictions. + """ + for root, _dirs, files in os.walk(directory): + output = {} + for filename in files: + if filename.endswith(".pdf"): + in_path = os.path.join(root, filename) + logger.info("Processing file: %s", in_path) + output[filename] = {} + + with fitz.Document(in_path) as doc: + for page_index, page in enumerate(doc): + page_number = page_index + 1 + logger.info("Processing page %s", page_number) + + predictions, depths_materials_column_pairs = process_page(page, **params) + + output[filename][f"page_{page_number}"] = { + "layers": predictions, + "depths_materials_column_pairs": depths_materials_column_pairs, + } + return output diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index 1c437d83..029f71ea 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -2,31 +2,17 @@ import json import logging -import math import os from pathlib import Path import click -import fitz from dotenv import load_dotenv from stratigraphy import DATAPATH from stratigraphy.benchmark.score import evaluate_matching -from stratigraphy.line_detection import extract_lines, line_detection_params -from stratigraphy.util import find_depth_columns -from stratigraphy.util.dataclasses import Line -from stratigraphy.util.depthcolumn import DepthColumn -from stratigraphy.util.find_description import get_description_blocks, get_description_lines -from stratigraphy.util.interval import BoundaryInterval, Interval -from stratigraphy.util.line import DepthInterval, TextLine -from stratigraphy.util.textblock import TextBlock, block_distance -from stratigraphy.util.util import ( - flatten, - parse_and_remove_empty_predictions, - read_params, - x_overlap, - x_overlap_significant_smallest, -) +from stratigraphy.extract import perform_matching +from stratigraphy.line_detection import line_detection_params +from stratigraphy.util.util import flatten, 
read_params load_dotenv() @@ -38,471 +24,6 @@ matching_params = read_params("matching_params.yml") -def process_page(page: fitz.Page, **params: dict) -> list[dict]: - """Process a single page of a pdf. - - Finds all descriptions and depth intervals on the page and matches them. - - Args: - page (fitz.Page): The page to process. - **params (dict): Additional parameters for the matching pipeline. - - Returns: - list[dict]: All list of the text of all description blocks. - """ - words = [] - words_by_line = {} - for x0, y0, x1, y1, word, block_no, line_no, _word_no in fitz.utils.get_text(page, "words"): - rect = fitz.Rect(x0, y0, x1, y1) * page.rotation_matrix - depth_interval = DepthInterval(rect, word) - words.append(TextLine([depth_interval])) - key = f"{block_no}_{line_no}" - if key not in words_by_line: - words_by_line[key] = [] - words_by_line[key].append(depth_interval) - - raw_lines = [TextLine(words_by_line[key]) for key in words_by_line] - - lines = [] - current_line_words = [] - for line_index, raw_line in enumerate(raw_lines): - for word_index, word in enumerate(raw_line.words): - remaining_line = TextLine(raw_line.words[word_index:]) - if len(current_line_words) > 0 and remaining_line.is_line_start(lines, raw_lines[line_index + 1 :]): - lines.append(TextLine(current_line_words)) - current_line_words = [] - current_line_words.append(word) - if len(current_line_words): - lines.append(TextLine(current_line_words)) - current_line_words = [] - - depth_column_entries = find_depth_columns.depth_column_entries(words, include_splits=True) - layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words) - - used_entry_rects = [] - for column in layer_depth_columns: - for entry in column.entries: - used_entry_rects.extend([entry.start.rect, entry.end.rect]) - - depth_column_entries = [ - entry - for entry in find_depth_columns.depth_column_entries(words, include_splits=False) - if entry.rect not in used_entry_rects - ] - - depth_columns: list[DepthColumn] = layer_depth_columns - depth_columns.extend(find_depth_columns.find_depth_columns(depth_column_entries, words)) - - pairs = [] - for depth_column in depth_columns: - material_description_rect = find_material_description_column(lines, depth_column) - if material_description_rect: - pairs.append((depth_column, material_description_rect)) - - # lowest score first - pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1], words)) - - to_delete = [] - for i, (_depth_column, material_description_rect) in enumerate(pairs): - for _depth_column_2, material_description_rect_2 in pairs[i + 1 :]: - if material_description_rect.intersects(material_description_rect_2): - to_delete.append(i) - continue - filtered_pairs = [item for index, item in enumerate(pairs) if index not in to_delete] - - geometric_lines = extract_lines(page, line_detection_params) - - groups = [] # list of matched depth intervals and text blocks - # groups is of the form: ["depth_interval": BoundaryInterval, "block": TextBlock] - if len(filtered_pairs): # match depth column items with material description - for depth_column, material_description_rect in filtered_pairs: - description_lines = get_description_lines(lines, material_description_rect) - if len(description_lines) > 1: - new_groups = match_columns( - depth_column, description_lines, geometric_lines, material_description_rect, **params - ) - groups.extend(new_groups) - json_filtered_pairs = [ - { - "depth_column": depth_column.to_json(), - "material_description_rect": [ - 
material_description_rect.x0, - material_description_rect.y0, - material_description_rect.x1, - material_description_rect.y1, - ], - } - for depth_column, material_description_rect in filtered_pairs - ] - - else: - json_filtered_pairs = [] - # Fallback when no depth column was found - material_description_rect = find_material_description_column(lines, depth_column=None) - if material_description_rect: - description_lines = get_description_lines(lines, material_description_rect) - description_blocks = get_description_blocks( - description_lines, - geometric_lines, - material_description_rect, - params["block_line_ratio"], - params["left_line_length_threshold"], - ) - groups.extend([{"block": block} for block in description_blocks]) - json_filtered_pairs.extend( - [ - { - "depth_column": None, - "material_description_rect": [ - material_description_rect.x0, - material_description_rect.y0, - material_description_rect.x1, - material_description_rect.y1, - ], - } - ] - ) - predictions = [ - {"material_description": group["block"].to_json(), "depth_interval": group["depth_interval"].to_json()} - if "depth_interval" in group - else {"material_description": group["block"].to_json()} - for group in groups - ] - predictions = parse_and_remove_empty_predictions(predictions) - return predictions, json_filtered_pairs - - -def score_column_match( - depth_column: DepthColumn, - material_description_rect: fitz.Rect, - all_words: list[TextLine] | None = None, - **params: dict, -) -> float: - """Scores the match between a depth column and a material description. - - Args: - depth_column (DepthColumn): The depth column. - material_description_rect (fitz.Rect): The material description rectangle. - all_words (list[TextLine] | None, optional): List of the available textlines. Defaults to None. - **params (dict): Additional parameters for the matching pipeline. Kept for compatibility with the pipeline. - - Returns: - float: The score of the match. - """ - rect = depth_column.rect() - top = rect.y0 - bottom = rect.y1 - right = rect.x1 - distance = ( - abs(top - material_description_rect.y0) - + abs(bottom - material_description_rect.y1) - + abs(right - material_description_rect.x0) - ) - - height = bottom - top - - noise_count = depth_column.noise_count(all_words) if all_words else 0 - - return (height - distance) * math.pow(0.8, noise_count) - - -def match_columns( - depth_column: DepthColumn, - description_lines: list[TextLine], - geometric_lines: list[Line], - material_description_rect: fitz.Rect, - **params: dict, -) -> list: - """Match the depth column entries with the description lines. - - This function identifies groups of depth intervals and text blocks that are likely to match. - In this process, the number of text blocks is adjusted to match the number of depth intervals. - - Args: - depth_column (DepthColumn): The depth column. - description_lines (list[TextLine]): The description lines. - geometric_lines (list[Line]): The geometric lines. - material_description_rect (fitz.Rect): The material description rectangle. - **params (dict): Additional parameters for the matching pipeline. - - Returns: - list: The matched depth intervals and text blocks. 
- """ - return [ - element - for group in depth_column.identify_groups( - description_lines, geometric_lines, material_description_rect, **params - ) - for element in transform_groups(group["depth_intervals"], group["blocks"], **params) - ] - - -def transform_groups( - depth_intervals: list[Interval], blocks: list[TextBlock], **params: dict -) -> list[dict[str, Interval | TextBlock]]: - """Transforms the text blocks such that their number equals the number of depth intervals. - - If there are more depth intervals than text blocks, text blocks are splitted. When there - are more text blocks than depth intervals, text blocks are merged. If the number of text blocks - and depth intervals equals, we proceed with the pairing. - - Args: - depth_intervals (List[Interval]): The depth intervals from the pdf. - blocks (List[TextBlock]): Found textblocks from the pdf. - **params (dict): Additional parameters for the matching pipeline. - - Returns: - List[Dict[str, Union[Interval, TextBlock]]]: Pairing of text blocks and depth intervals. - """ - if len(depth_intervals) == 0: - return [] - elif len(depth_intervals) == 1: - concatenated_block = TextBlock( - [line for block in blocks for line in block.lines] - ) # concatenate all text lines within a block; line separation flag does not matter here. - return [{"depth_interval": depth_intervals[0], "block": concatenated_block}] - else: - if len(blocks) < len(depth_intervals): - blocks = split_blocks_by_textline_length(blocks, target_split_count=len(depth_intervals) - len(blocks)) - - if len(blocks) > len(depth_intervals): - # create additional depth intervals with end & start value None to match the number of blocks - depth_intervals.extend([BoundaryInterval(None, None) for _ in range(len(blocks) - len(depth_intervals))]) - - return [ - {"depth_interval": depth_interval, "block": block} - for depth_interval, block in zip(depth_intervals, blocks, strict=False) - ] - - -def merge_blocks_by_vertical_spacing(blocks: list[TextBlock], target_merge_count: int) -> list[TextBlock]: - """Merge textblocks without any geometric lines that separates them. - - Note: Deprecated. Currently not in use any more. Kept here until we are sure that it is not needed anymore. - - The logic looks at the distances between the textblocks and merges them if they are closer - than a certain cutoff. - - Args: - blocks (List[TextBlock]): Textblocks that are to be merged. - target_merge_count (int): the number of merges that we'd like to happen (i.e. we'd like the total number of - blocks to be reduced by this number) - - Returns: - List[TextBlock]: The merged textblocks. 
- """ - distances = [] - for block_index in range(len(blocks) - 1): - distances.append(block_distance(blocks[block_index], blocks[block_index + 1])) - cutoff = sorted(distances)[target_merge_count - 1] # merge all blocks that have a distance smaller than this - merged_count = 0 - merged_blocks = [] - current_merged_block = blocks[0] - for block_index in range(len(blocks) - 1): - new_block = blocks[block_index + 1] - if ( - merged_count < target_merge_count - and block_distance(blocks[block_index], blocks[block_index + 1]) <= cutoff - ): - current_merged_block = current_merged_block.concatenate(new_block) - merged_count += 1 - else: - merged_blocks.append(current_merged_block) - current_merged_block = new_block - - if len(current_merged_block.lines): - merged_blocks.append(current_merged_block) - return merged_blocks - - -def split_blocks_by_textline_length(blocks: list[TextBlock], target_split_count: int) -> list[TextBlock]: - """Split textblocks without any geometric lines that separates them. - - The logic looks at the lengths of the text lines and cuts them off - if there are textlines that are shorter than others. - # TODO: Extend documentation about logic. - - Args: - blocks (List[TextBlock]): Textblocks that are to be split. - target_split_count (int): the number of splits that we'd like to happen (i.e. we'd like the total number of - blocks to be increased by this number) - - Returns: - List[TextBlock]: The split textblocks. - """ - line_lengths = sorted([line.rect.x1 for block in blocks for line in block.lines[:-1]]) - if len(line_lengths) <= target_split_count: # In that case each line is a block - return [TextBlock([line]) for block in blocks for line in block.lines] - else: - cutoff_values = line_lengths[:target_split_count] # all lines inside cutoff_values will be split line - split_blocks = [] - current_block_lines = [] - for block in blocks: - for line_index in range(block.line_count): - line = block.lines[line_index] - current_block_lines.append(line) - if line_index < block.line_count - 1 and line.rect.x1 in cutoff_values: - split_blocks.append(TextBlock(current_block_lines)) - cutoff_values.remove(line.rect.x1) - current_block_lines = [] - if len(current_block_lines): - split_blocks.append(TextBlock(current_block_lines)) - current_block_lines = [] - if ( - block.is_terminated_by_line - ): # If block was terminated by a line, populate the flag to the last element of split_blocks. - split_blocks[-1].is_terminated_by_line = True - return split_blocks - - -def find_material_description_column( - lines: list[TextLine], depth_column: DepthColumn, **params: dict -) -> fitz.Rect | None: - """Find the material description column given a depth column. - - Args: - lines (list[TextLine]): The text lines of the page. - depth_column (DepthColumn): The depth column. - **params (dict): Additional parameters for the matching pipeline. - - Returns: - fitz.Rect | None: The material description column. 
- """ - if depth_column: - above_depth_column = [ - line - for line in lines - if x_overlap(line.rect, depth_column.rect()) and line.rect.y0 < depth_column.rect().y0 - ] - - min_y0 = max(line.rect.y0 for line in above_depth_column) if len(above_depth_column) else -1 - - def check_y0_condition(y0): - return y0 > min_y0 and y0 < depth_column.rect().y1 - else: - - def check_y0_condition(y0): - return True - - candidate_description = [line for line in lines if check_y0_condition(line.rect.y0)] - is_description = [line for line in candidate_description if line.is_description] - - if len(candidate_description) == 0: - return - - description_clusters = [] - while len(is_description) > 0: - coverage_by_generating_line = [ - [other for other in is_description if x_overlap_significant_smallest(line.rect, other.rect, 0.5)] - for line in is_description - ] - - def filter_coverage(coverage): - if len(coverage): - min_x0 = min(line.rect.x0 for line in coverage) - max_x1 = max(line.rect.x1 for line in coverage) - x0_threshold = max_x1 - 0.4 * ( - max_x1 - min_x0 - ) # how did we determine the 0.4? Should it be a parameter? What would it do if we were to change it? - return [line for line in coverage if line.rect.x0 < x0_threshold] - else: - return [] - - coverage_by_generating_line = [filter_coverage(coverage) for coverage in coverage_by_generating_line] - max_coverage = max(coverage_by_generating_line, key=len) - description_clusters.append(max_coverage) - is_description = [line for line in is_description if line not in max_coverage] - - candidate_rects = [] - - for cluster in description_clusters: - best_y0 = min([line.rect.y0 for line in cluster]) - best_y1 = max([line.rect.y1 for line in cluster]) - - min_description_x0 = min( - [ - line.rect.x0 - 0.01 * line.rect.width for line in cluster - ] # How did we determine the 0.01? Should it be a parameter? What would it do if we were to change it? - ) - max_description_x0 = max( - [ - line.rect.x0 + 0.2 * line.rect.width for line in cluster - ] # How did we determine the 0.2? Should it be a parameter? What would it do if we were to change it? - ) - good_lines = [ - line - for line in candidate_description - if line.rect.y0 >= best_y0 and line.rect.y1 <= best_y1 - if min_description_x0 < line.rect.x0 < max_description_x0 - ] - best_x0 = min([line.rect.x0 for line in good_lines]) - best_x1 = max([line.rect.x1 for line in good_lines]) - - # expand to include entire last block - def is_below(best_x0, best_y1, line): - return ( - ( - line.rect.x0 > best_x0 - 5 - ) # How did we determine the 5? Should it be a parameter? What would it do if we were to change it? - and (line.rect.x0 < (best_x0 + best_x1) / 2) # noqa B023 - and ( - line.rect.y0 < best_y1 + 10 - ) # How did we determine the 10? Should it be a parameter? What would it do if we were to change it? - and (line.rect.y1 > best_y1) - ) - - continue_search = True - while continue_search: - line = next((line for line in lines if is_below(best_x0, best_y1, line)), None) - if line: - best_x0 = min(best_x0, line.rect.x0) - best_x1 = max(best_x1, line.rect.x1) - best_y1 = line.rect.y1 - else: - continue_search = False - - candidate_rects.append(fitz.Rect(best_x0, best_y0, best_x1, best_y1)) - - if len(candidate_rects) == 0: - return None - if depth_column: - return max(candidate_rects, key=lambda rect: score_column_match(depth_column, rect)) - else: - return candidate_rects[0] - - -def perform_matching(directory: Path, **params: dict) -> dict: - """Perform the matching of text blocks with depth intervals. 
- - Args: - directory (Path): Path to the directory that contains the pdfs. - **params (dict): Additional parameters for the matching pipeline. - - Returns: - dict: The predictions. - """ - for root, _dirs, files in os.walk(directory): - output = {} - for filename in files: - if filename.endswith(".pdf"): - in_path = os.path.join(root, filename) - logger.info("Processing file: %s", in_path) - output[filename] = {} - - with fitz.Document(in_path) as doc: - for page_index, page in enumerate(doc): - page_number = page_index + 1 - logger.info("Processing page %s", page_number) - - predictions, depths_materials_column_pairs = process_page(page, **params) - - output[filename][f"page_{page_number}"] = { - "layers": predictions, - "depths_materials_column_pairs": depths_materials_column_pairs, - } - return output - - @click.command() @click.option( "--input_directory", From 1620bfe9df6467f94fc2c9e4785bbaf67eaa7d61 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Wed, 3 Apr 2024 10:07:23 +0200 Subject: [PATCH 03/19] Allow CLI to specify if bounding boxes and lines are drawn. Allow to run pipeline on individual files. --- src/stratigraphy/benchmark/score.py | 6 +++-- src/stratigraphy/line_detection.py | 25 +++++++++++++++++ src/stratigraphy/main.py | 42 +++++++++++++++++++++++------ 3 files changed, 63 insertions(+), 10 deletions(-) diff --git a/src/stratigraphy/benchmark/score.py b/src/stratigraphy/benchmark/score.py index ab254b7b..7a552091 100644 --- a/src/stratigraphy/benchmark/score.py +++ b/src/stratigraphy/benchmark/score.py @@ -57,7 +57,7 @@ def f1(precision: float, recall: float) -> float: def evaluate_matching( - predictions_path: Path, ground_truth_path: Path, directory: Path, out_directory: Path + predictions_path: Path, ground_truth_path: Path, directory: Path, out_directory: Path, skip_draw_predictions: bool ) -> tuple[dict, pd.DataFrame]: """Calculate F1, precision and recall for the predictions. @@ -69,6 +69,7 @@ def evaluate_matching( ground_truth_path (Path): Path to the ground truth annotated data. directory (Path): Path to the directory containing the pdf files. out_directory (Path): Path to the directory where the evaluation images should be saved. + skip_draw_predictions (bool): Whether to draw the predictions on the pdf pages. Returns: tuple[dict, pd.DataFrame]: A tuple containing the overall F1, precision and recall as a dictionary and the @@ -80,7 +81,8 @@ def evaluate_matching( predictions, number_of_truth_values = _add_ground_truth_to_predictions(predictions, ground_truth) - draw_predictions(predictions, directory, out_directory) + if not skip_draw_predictions: + draw_predictions(predictions, directory, out_directory) document_level_metrics = { "document_name": [], diff --git a/src/stratigraphy/line_detection.py b/src/stratigraphy/line_detection.py index d208712e..e99e309c 100644 --- a/src/stratigraphy/line_detection.py +++ b/src/stratigraphy/line_detection.py @@ -1,6 +1,7 @@ """Script for line detection in pdf pages.""" import os +from pathlib import Path import cv2 import fitz @@ -88,6 +89,30 @@ def extract_lines(page: fitz.Page, line_detection_params: dict) -> list[Line]: return lines +def draw_lines_on_pdfs(input_directory: Path, line_detection_params: dict): + """Draw lines on pdf pages and stores them as artifacts in mlflow. + + Args: + input_directory (Path): The directory containing the pdf files. + line_detection_params (dict): The parameters for the line detection algorithm. 
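After this refactor `main.py` only orchestrates I/O and evaluation, while the page-level matching logic lives in `stratigraphy.extract`. A minimal sketch of driving the extracted module directly (file name illustrative, taken from the benchmark set):

    import fitz

    from stratigraphy.extract import process_page
    from stratigraphy.util.util import read_params

    matching_params = read_params("matching_params.yml")
    with fitz.Document("data/Benchmark/270124083-bp.pdf") as doc:
        for page in doc:
            # layers: material descriptions, with a depth interval where one was matched
            # pairs: the depth column / material description rectangles used for matching
            layers, pairs = process_page(page, **matching_params)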
+ """ + for root, _dirs, files in os.walk(input_directory): + output = {} + for filename in files: + if filename.endswith(".pdf"): + in_path = os.path.join(root, filename) + output[filename] = {} + + with fitz.Document(in_path) as doc: + for page_index, page in enumerate(doc): + lines = extract_lines(page, line_detection_params) + img = plot_lines(page, lines, scale_factor=line_detection_params["pdf_scale_factor"]) + if mlflow_tracking: + import mlflow + + mlflow.log_image(img, f"pages/{filename}_page_{page_index}_lines.png") + + if __name__ == "__main__": # Some test pdfs selected_pdfs = [ diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index 029f71ea..ab638cba 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -3,6 +3,7 @@ import json import logging import os +import shutil from pathlib import Path import click @@ -11,7 +12,7 @@ from stratigraphy import DATAPATH from stratigraphy.benchmark.score import evaluate_matching from stratigraphy.extract import perform_matching -from stratigraphy.line_detection import line_detection_params +from stratigraphy.line_detection import draw_lines_on_pdfs, line_detection_params from stratigraphy.util.util import flatten, read_params load_dotenv() @@ -26,6 +27,7 @@ @click.command() @click.option( + "-i", "--input_directory", type=click.Path(exists=True, path_type=Path), default=DATAPATH / "Benchmark", @@ -49,14 +51,25 @@ default=DATAPATH / "Benchmark" / "extract" / "predictions.json", help="Path to the predictions file.", ) -def start_pipeline(input_directory: Path, ground_truth_path: Path, out_directory: Path, predictions_path: Path): +@click.option("-s", "--skip-draw-predictions", is_flag=True, default=False, help="Draw predictions on pdf pages.") +@click.option("-l", "--draw-lines", is_flag=True, default=False, help="Draw lines on pdf pages.") +def start_pipeline( + input_directory: Path, + ground_truth_path: Path, + out_directory: Path, + predictions_path: Path, + skip_draw_predictions: bool = False, + draw_lines: bool = False, +): """Description. Args: - input_directory (Path): _description_ - ground_truth_path (Path): _description_ - out_directory (Path): _description_ - predictions_path (Path): _description_ + input_directory (Path): The directory containing the pdf files. + ground_truth_path (Path): The path to the ground truth file. + out_directory (Path): The directory to store the evaluation results. + predictions_path (Path): The path to the predictions file. + skip_draw_predictions (bool, optional): Whether to skip drawing predictions on pdf pages. Defaults to False. + draw_lines (bool, optional): Whether to draw lines on pdf pages. Defaults to False. """ if mlflow_tracking: import mlflow @@ -67,11 +80,20 @@ def start_pipeline(input_directory: Path, ground_truth_path: Path, out_directory mlflow.log_params(flatten(matching_params)) temp_directory = DATAPATH / "_temp" # temporary directory to dump files for mlflow artifact logging - # check if directories exist and create them when neccessary + # check if directories exist and create them when neccessary out_directory.mkdir(parents=True, exist_ok=True) temp_directory.mkdir(parents=True, exist_ok=True) + # if a file is specified instead of an input directory, copy the file to a temporary directory and work with that. 
+ if input_directory.is_file(): + if (temp_directory / "single_file").is_dir(): + shutil.rmtree(temp_directory / "single_file") + + Path.mkdir(temp_directory / "single_file") + shutil.copy(input_directory, temp_directory / "single_file") + input_directory = temp_directory / "single_file" + # run the matching pipeline and save the result predictions = perform_matching(input_directory, **matching_params) with open(predictions_path, "w") as file: @@ -79,7 +101,7 @@ def start_pipeline(input_directory: Path, ground_truth_path: Path, out_directory # evaluate the predictions metrics, document_level_metrics = evaluate_matching( - predictions_path, ground_truth_path, input_directory, out_directory + predictions_path, ground_truth_path, input_directory, out_directory, skip_draw_predictions ) document_level_metrics.to_csv(temp_directory / "document_level_metrics.csv") # mlflow.log_artifact expects a file @@ -87,6 +109,10 @@ def start_pipeline(input_directory: Path, ground_truth_path: Path, out_directory mlflow.log_metrics(metrics) mlflow.log_artifact(temp_directory / "document_level_metrics.csv") + if draw_lines: + logger.info("Drawing lines on pdf pages.") + draw_lines_on_pdfs(input_directory, line_detection_params=line_detection_params) + if __name__ == "__main__": start_pipeline() From b0c49a6be43ef4f6a8df2caee438fe5faa0c9d33 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Wed, 3 Apr 2024 10:15:04 +0200 Subject: [PATCH 04/19] Update docstring and help for click commands. --- src/stratigraphy/main.py | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index ab638cba..1670a4ac 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -34,25 +34,36 @@ help="Path to the input directory.", ) @click.option( + "-g", "--ground_truth_path", type=click.Path(exists=True, path_type=Path), default=DATAPATH / "Benchmark" / "ground_truth.json", help="Path to the ground truth file.", ) @click.option( + "-o", "--out_directory", type=click.Path(path_type=Path), default=DATAPATH / "Benchmark" / "evaluation", help="Path to the output directory.", ) @click.option( + "-p", "--predictions_path", type=click.Path(path_type=Path), default=DATAPATH / "Benchmark" / "extract" / "predictions.json", help="Path to the predictions file.", ) -@click.option("-s", "--skip-draw-predictions", is_flag=True, default=False, help="Draw predictions on pdf pages.") -@click.option("-l", "--draw-lines", is_flag=True, default=False, help="Draw lines on pdf pages.") +@click.option( + "-s", + "--skip-draw-predictions", + is_flag=True, + default=False, + help="Whether to skip drawing the predictions on pdf pages. Defaults to False.", +) +@click.option( + "-l", "--draw-lines", is_flag=True, default=False, help="Whether to draw lines on pdf pages. Defaults to False." +) def start_pipeline( input_directory: Path, ground_truth_path: Path, @@ -61,16 +72,20 @@ def start_pipeline( skip_draw_predictions: bool = False, draw_lines: bool = False, ): - """Description. - - Args: - input_directory (Path): The directory containing the pdf files. - ground_truth_path (Path): The path to the ground truth file. - out_directory (Path): The directory to store the evaluation results. - predictions_path (Path): The path to the predictions file. - skip_draw_predictions (bool, optional): Whether to skip drawing predictions on pdf pages. Defaults to False. - draw_lines (bool, optional): Whether to draw lines on pdf pages. Defaults to False. 
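With these options the pipeline can now be pointed at a single profile and the drawing steps can be toggled, for example (file name illustrative):

    boreholes-extract-materials -i data/Benchmark/270124083-bp.pdf -l

Note the single-file handling: the file is copied into `DATAPATH / "_temp" / "single_file"` and the pipeline then runs on that directory, so the downstream code can keep assuming a directory input. `--draw-lines` is only useful with MLFlow tracking enabled, since the rendered pages are stored as MLFlow image artifacts.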
- """ + """Run the boreholes data extraction pipeline. + + The pipeline will extract material description of all found layers and assign them to the corresponding + depth intervals. The input directory should contain pdf files with boreholes data. The algorithm can deal + with borehole profiles of multiple pages. + + Args:\n + input_directory (Path): The directory containing the pdf files.\n + ground_truth_path (Path): The path to the ground truth file json file.\n + out_directory (Path): The directory to store the evaluation results.\n + predictions_path (Path): The path to the predictions file.\n + skip_draw_predictions (bool, optional): Whether to skip drawing predictions on pdf pages. Defaults to False.\n + draw_lines (bool, optional): Whether to draw lines on pdf pages. Defaults to False.\n + """ # noqa: D301 if mlflow_tracking: import mlflow From d85a936ebd9b895b4f4c76025fe60d2f6744de4b Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Wed, 3 Apr 2024 10:24:04 +0200 Subject: [PATCH 05/19] Update README. --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index fd2d5f81..50764cf6 100644 --- a/README.md +++ b/README.md @@ -40,11 +40,13 @@ To execute the data extraction pipeline, follow these steps: `conda activate boreholes-dev` -2. **Run the main script** +2. **Run the extraction script** - The main script for the extraction pipeline is located at `src/stratigraphy/main.py`. Run this script to start the extraction process. + The main script for the extraction pipeline is located at `src/stratigraphy/main.py`. A cli command is created to run this script. - This script will source all PDFs from the `data/Benchmark` directory and create PNG files in the `data/Benchmark/extract` directory. + Run `boreholes-extract-materials` to run the main extraction script. With the default options, the command will source all PDFs from the `data/Benchmark` directory and create PNG files in the `data/Benchmark/extract` directory. + + Use `boreholes-extract-materials --help` to see all options for the extraction script. 3. **Check the results** From d0612470bf892f721c04e95cdf4fd51701442954 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Wed, 3 Apr 2024 10:27:39 +0200 Subject: [PATCH 06/19] Remove line_detection script --- README.md | 5 +---- src/stratigraphy/line_detection.py | 36 +----------------------------- 2 files changed, 2 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 50764cf6..61155a08 100644 --- a/README.md +++ b/README.md @@ -52,8 +52,6 @@ To execute the data extraction pipeline, follow these steps: Once the script has finished running, you can check the results in the `data/Benchmark/extract` directory. The result is a `predictions.json` file as well as a png file for each page of each PDF in the `data/Benchmark` directory. -Please note that for now the pipeline assumes that all PDF files to be analyzed are placed in the `data/Benchmark` directory. If you want to analyze different files, please place them in this directory. - ### Output Structure The `predictions.json` file contains the results of a data extraction process from PDF files. Each key in the JSON object is the name of a PDF file, and the value is a list of extracted items in a dictionary like object. The extracted items for now are the material descriptions in their correct order (given by their depths). @@ -151,7 +149,7 @@ The project structure and the most important files are as follows: - `src/` : The source code of the project. 
From d0612470bf892f721c04e95cdf4fd51701442954 Mon Sep 17 00:00:00 2001
From: Renato Durrer
Date: Wed, 3 Apr 2024 10:27:39 +0200
Subject: [PATCH 06/19] Remove line_detection script.

---
 README.md                          |  5 +----
 src/stratigraphy/line_detection.py | 36 +-----------------------------
 2 files changed, 2 insertions(+), 39 deletions(-)

diff --git a/README.md b/README.md
index 50764cf6..61155a08 100644
--- a/README.md
+++ b/README.md
@@ -52,8 +52,6 @@ To execute the data extraction pipeline, follow these steps:
 
    Once the script has finished running, you can check the results in the `data/Benchmark/extract` directory. The result is a `predictions.json` file as well as a png file for each page of each PDF in the `data/Benchmark` directory.
 
-Please note that for now the pipeline assumes that all PDF files to be analyzed are placed in the `data/Benchmark` directory. If you want to analyze different files, please place them in this directory.
-
 ### Output Structure
 The `predictions.json` file contains the results of a data extraction process from PDF files. Each key in the JSON object is the name of a PDF file, and the value is a list of extracted items in a dictionary like object. The extracted items for now are the material descriptions in their correct order (given by their depths).
 
@@ -151,7 +149,7 @@ The project structure and the most important files are as follows:
 - `src/` : The source code of the project.
   - `stratigraphy/` : The main package of the project.
     - `main.py` : The main script of the project. This script runs the data extraction pipeline.
-    - `line_detection.py`: This script runs the line detection on provided sample pdfs. Will be deprecated in the future.
+    - `line_detection.py`: Contains functionalities for line detection on pdf pages.
     - `util/` : Utility scripts and modules.
     - `benchmark/` : Scripts to evaluate the data extraction.
 - `data/` : The data used by the project.
@@ -166,7 +164,6 @@ The project structure and the most important files are as follows:
 
 - `main.py` : This is the main script of the project. It runs the data extraction pipeline, which analyzes the PDF files in the `data/Benchmark` directory and saves the results in the `predictions.json` file.
 
-- `line_detection.py` : Runs the line detection algorithm on pdfs using `lsd` from opencv. It is meant to find all lines that potentially separate two material descriptions. It is incorporated in the script `main.py` and will be deprecated as a standalone script in the future.
 
 ## Experiment Tracking
 We perform experiment tracking using MLFlow. Each developer has his own local MLFlow instance.
diff --git a/src/stratigraphy/line_detection.py b/src/stratigraphy/line_detection.py
index e99e309c..6b95b6a3 100644
--- a/src/stratigraphy/line_detection.py
+++ b/src/stratigraphy/line_detection.py
@@ -9,7 +9,6 @@
 from dotenv import load_dotenv
 from numpy.typing import ArrayLike
 
-from stratigraphy import DATAPATH
 from stratigraphy.util.dataclasses import Line
 from stratigraphy.util.geometric_line_utilities import (
     drop_vertical_lines,
@@ -17,7 +16,7 @@
     merge_parallel_lines_efficiently,
 )
 from stratigraphy.util.plot_utils import plot_lines
-from stratigraphy.util.util import flatten, line_from_array, read_params
+from stratigraphy.util.util import line_from_array, read_params
 
 load_dotenv()
 
@@ -111,36 +110,3 @@ def draw_lines_on_pdfs(input_directory: Path, line_detection_params: dict):
                             mlflow.log_image(img, f"pages/{filename}_page_{page_index}_lines.png")
-
-
-if __name__ == "__main__":
-    # Some test pdfs
-    selected_pdfs = [
-        "270124083-bp.pdf",
-        "268124307-bp.pdf",
-        "268125268-bp.pdf",
-        "267125378-bp.pdf",
-        "268124435-bp.pdf",
-        "267123060-bp.pdf",
-        "268124635-bp.pdf",
-        "675230002-bp.pdf",
-        "268125592-bp.pdf",
-        "267124070-bp.pdf",
-        "699248001-bp.pdf",
-    ]
-
-    if mlflow_tracking:
-        import mlflow
-
-        mlflow.set_experiment("LineDetection")
-        mlflow.start_run()
-        mlflow.log_params(flatten(line_detection_params))
-    lines = {}
-    for pdf in selected_pdfs:
-        doc = fitz.open(DATAPATH / "Benchmark" / pdf)
-
-        for page in doc:
-            lines[pdf] = extract_lines(page, line_detection_params)
-            img = plot_lines(page, lines[pdf], scale_factor=line_detection_params["pdf_scale_factor"])
-            if mlflow_tracking:
-                mlflow.log_image(img, f"lines_{pdf}.png")
line_detection_params (dict): The parameters for the line detection algorithm. """ + if not mlflow_tracking: + raise Warning("MLFlow tracking is not enabled. MLFLow is required to store the images.") + import mlflow + for root, _dirs, files in os.walk(input_directory): output = {} for filename in files: @@ -106,7 +110,4 @@ def draw_lines_on_pdfs(input_directory: Path, line_detection_params: dict): for page_index, page in enumerate(doc): lines = extract_lines(page, line_detection_params) img = plot_lines(page, lines, scale_factor=line_detection_params["pdf_scale_factor"]) - if mlflow_tracking: - import mlflow - - mlflow.log_image(img, f"pages/{filename}_page_{page_index}_lines.png") + mlflow.log_image(img, f"pages/{filename}_page_{page_index}_lines.png") From b144d157466fe34b14495bac5bd61b37dcd70584 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Wed, 3 Apr 2024 13:36:02 +0200 Subject: [PATCH 08/19] Improve the way --help prints the docstring. --- src/stratigraphy/main.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index 1670a4ac..c0ff1876 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -78,13 +78,14 @@ def start_pipeline( depth intervals. The input directory should contain pdf files with boreholes data. The algorithm can deal with borehole profiles of multiple pages. - Args:\n - input_directory (Path): The directory containing the pdf files.\n - ground_truth_path (Path): The path to the ground truth file json file.\n - out_directory (Path): The directory to store the evaluation results.\n - predictions_path (Path): The path to the predictions file.\n - skip_draw_predictions (bool, optional): Whether to skip drawing predictions on pdf pages. Defaults to False.\n - draw_lines (bool, optional): Whether to draw lines on pdf pages. Defaults to False.\n + \f + Args: + input_directory (Path): The directory containing the pdf files. + ground_truth_path (Path): The path to the ground truth file json file. + out_directory (Path): The directory to store the evaluation results. + predictions_path (Path): The path to the predictions file. + skip_draw_predictions (bool, optional): Whether to skip drawing predictions on pdf pages. Defaults to False. + draw_lines (bool, optional): Whether to draw lines on pdf pages. Defaults to False. """ # noqa: D301 if mlflow_tracking: import mlflow From 672a8942de838616e0b54cdddcf9e7eae0539360 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Wed, 3 Apr 2024 13:38:20 +0200 Subject: [PATCH 09/19] Rename command boreholes-extract-materials to boreholes-extract layers. 
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index ca0fbde4..dd5f811a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ dependencies = [
 ]
 
 [project.scripts]
-boreholes-extract-materials = "stratigraphy.main:start_pipeline"
+boreholes-extract-layers = "stratigraphy.main:start_pipeline"
 
 [tool.ruff.lint]
 select = [

From 397cae80a08064393b896dd95330337d140b206b Mon Sep 17 00:00:00 2001
From: Renato Durrer
Date: Wed, 3 Apr 2024 14:18:21 +0200
Subject: [PATCH 10/19] Refactor extraction pipeline, part 2.

---
 src/stratigraphy/extract.py        |  6 ++---
 src/stratigraphy/line_detection.py | 24 ++++++------------
 src/stratigraphy/main.py           | 39 ++++++++++++++++++++++++------
 3 files changed, 41 insertions(+), 28 deletions(-)

diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py
index a3b911d1..9cd622ae 100644
--- a/src/stratigraphy/extract.py
+++ b/src/stratigraphy/extract.py
@@ -7,7 +7,6 @@
 
 import fitz
 
-from stratigraphy.line_detection import extract_lines, line_detection_params
 from stratigraphy.util import find_depth_columns
 from stratigraphy.util.dataclasses import Line
 from stratigraphy.util.depthcolumn import DepthColumn
@@ -25,13 +24,14 @@
 logger = logging.getLogger(__name__)
 
 
-def process_page(page: fitz.Page, **params: dict) -> list[dict]:
+def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict]:
     """Process a single page of a pdf.
 
     Finds all descriptions and depth intervals on the page and matches them.
 
     Args:
         page (fitz.Page): The page to process.
+        geometric_lines (list[Line]): The geometric lines of the page.
         **params (dict): Additional parameters for the matching pipeline.
 
     Returns:
@@ -97,8 +97,6 @@ def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict]
             continue
     filtered_pairs = [item for index, item in enumerate(pairs) if index not in to_delete]
 
-    geometric_lines = extract_lines(page, line_detection_params)
-
     groups = []  # list of matched depth intervals and text blocks
     # groups is of the form: ["depth_interval": BoundaryInterval, "block": TextBlock]
     if len(filtered_pairs):  # match depth column items with material description
diff --git a/src/stratigraphy/line_detection.py b/src/stratigraphy/line_detection.py
index 0bb8afc9..98e323f3 100644
--- a/src/stratigraphy/line_detection.py
+++ b/src/stratigraphy/line_detection.py
@@ -1,7 +1,6 @@
 """Script for line detection in pdf pages."""
 
 import os
-from pathlib import Path
 
 import cv2
 import fitz
@@ -88,26 +87,19 @@ def extract_lines(page: fitz.Page, line_detection_params: dict) -> list[Line]:
     return lines
 
 
-def draw_lines_on_pdfs(input_directory: Path, line_detection_params: dict):
+def draw_lines_on_pdfs(filename: str, page: fitz.Page, geometric_lines: list[Line]):
     """Draw lines on pdf pages and stores them as artifacts in mlflow.
 
+    Note: this function may no longer be needed.
+
     Args:
-        input_directory (Path): The directory containing the pdf files.
-        line_detection_params (dict): The parameters for the line detection algorithm.
+        filename (str): The filename of the pdf.
+        page (fitz.Page): The page to draw lines on.
+        geometric_lines (list[Line]): The lines to draw on the pdf page.
     """
     if not mlflow_tracking:
         raise Warning("MLFlow tracking is not enabled. MLFlow is required to store the images.")
     import mlflow
 
-    for root, _dirs, files in os.walk(input_directory):
-        output = {}
-        for filename in files:
-            if filename.endswith(".pdf"):
-                in_path = os.path.join(root, filename)
-                output[filename] = {}
-
-                with fitz.Document(in_path) as doc:
-                    for page_index, page in enumerate(doc):
-                        lines = extract_lines(page, line_detection_params)
-                        img = plot_lines(page, lines, scale_factor=line_detection_params["pdf_scale_factor"])
-                        mlflow.log_image(img, f"pages/{filename}_page_{page_index}_lines.png")
+    img = plot_lines(page, geometric_lines, scale_factor=line_detection_params["pdf_scale_factor"])
+    mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png")
diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py
index c0ff1876..a2d2eb7e 100644
--- a/src/stratigraphy/main.py
+++ b/src/stratigraphy/main.py
@@ -7,12 +7,13 @@
 from pathlib import Path
 
 import click
+import fitz
 from dotenv import load_dotenv
 
 from stratigraphy import DATAPATH
 from stratigraphy.benchmark.score import evaluate_matching
-from stratigraphy.extract import perform_matching
-from stratigraphy.line_detection import draw_lines_on_pdfs, line_detection_params
+from stratigraphy.extract import process_page
+from stratigraphy.line_detection import draw_lines_on_pdfs, extract_lines, line_detection_params
 from stratigraphy.util.util import flatten, read_params
 
 load_dotenv()
@@ -110,8 +111,34 @@ def start_pipeline(
         shutil.copy(input_directory, temp_directory / "single_file")
         input_directory = temp_directory / "single_file"
 
-    # run the matching pipeline and save the result
-    predictions = perform_matching(input_directory, **matching_params)
+    # process the individual pdf files
+    predictions = {}
+    for root, _dirs, files in os.walk(input_directory):
+        for filename in files:
+            if filename.endswith(".pdf"):
+                in_path = os.path.join(root, filename)
+                logger.info("Processing file: %s", in_path)
+                predictions[filename] = {}
+
+                with fitz.Document(in_path) as doc:
+                    for page_index, page in enumerate(doc):
+                        page_number = page_index + 1
+                        logger.info("Processing page %s", page_number)
+
+                        geometric_lines = extract_lines(page, line_detection_params)
+                        layer_predictions, depths_materials_column_pairs = process_page(
+                            page, geometric_lines, **matching_params
+                        )
+
+                        predictions[filename][f"page_{page_number}"] = {
+                            "layers": layer_predictions,
+                            "depths_materials_column_pairs": depths_materials_column_pairs,
+                        }
+
+                        if draw_lines:
+                            logger.info("Drawing lines on pdf pages.")
+                            draw_lines_on_pdfs(filename, page, geometric_lines)
+
 
     with open(predictions_path, "w") as file:
         file.write(json.dumps(predictions))
@@ -125,10 +152,6 @@ def start_pipeline(
         mlflow.log_metrics(metrics)
         mlflow.log_artifact(temp_directory / "document_level_metrics.csv")
 
-    if draw_lines:
-        logger.info("Drawing lines on pdf pages.")
-        draw_lines_on_pdfs(input_directory, line_detection_params=line_detection_params)
-
 
 if __name__ == "__main__":
     start_pipeline()

From 12cde5813a5366a54d9c3af2f64bd70d6a9b2e62 Mon Sep 17 00:00:00 2001
From: Renato Durrer
Date: Wed, 3 Apr 2024 14:39:37 +0200
Subject: [PATCH 11/19] Make single files work without a temporary directory.
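Instead of copying a single input file into a `single_file` subdirectory of
the temporary directory, the pipeline now wraps the file in an iterator that
mimics the `(root, dirs, files)` triples yielded by `os.walk`, so the same
processing loop serves both directories and single pdf files. A minimal,
self-contained sketch of the pattern (the helper name `iter_pdf_files` is
hypothetical and only used for illustration):

    import os
    from pathlib import Path

    def iter_pdf_files(input_path: Path):
        # Hypothetical helper, for illustration only: yield (root, filename)
        # pairs for a directory or for a single pdf file.
        if input_path.is_file():
            # Wrap the single file in the same (root, dirs, files) shape that os.walk() yields.
            file_iterator = [(str(input_path.parent), [], [input_path.name])]
        else:
            file_iterator = os.walk(input_path)
        for root, _dirs, files in file_iterator:
            for filename in files:
                if filename.endswith(".pdf"):
                    yield root, filename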
--- src/stratigraphy/main.py | 13 ++++--------- src/stratigraphy/util/draw.py | 2 ++ 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index a2d2eb7e..03005d17 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -3,7 +3,6 @@ import json import logging import os -import shutil from pathlib import Path import click @@ -104,16 +103,12 @@ def start_pipeline( # if a file is specified instead of an input directory, copy the file to a temporary directory and work with that. if input_directory.is_file(): - if (temp_directory / "single_file").is_dir(): - shutil.rmtree(temp_directory / "single_file") - - Path.mkdir(temp_directory / "single_file") - shutil.copy(input_directory, temp_directory / "single_file") - input_directory = temp_directory / "single_file" - + file_iterator = [(input_directory.parent, None, [input_directory.name])] + else: + file_iterator = os.walk(input_directory) # process the individual pdf files predictions = {} - for root, _dirs, files in os.walk(input_directory): + for root, _dirs, files in file_iterator: for filename in files: if filename.endswith(".pdf"): in_path = os.path.join(root, filename) diff --git a/src/stratigraphy/util/draw.py b/src/stratigraphy/util/draw.py index 388d60b6..948bd430 100644 --- a/src/stratigraphy/util/draw.py +++ b/src/stratigraphy/util/draw.py @@ -35,6 +35,8 @@ def draw_predictions(predictions: dict, directory: Path, out_directory: Path) -> directory (Path): Path to the directory containing the pdf files. out_directory (Path): Path to the output directory where the images are saved. """ + if directory.is_file(): # deal with the case when we pass a file instead of a directory + directory = directory.parent for file in predictions: logger.info(f"Evaluating {file}.") with fitz.Document(directory / file) as doc: From b549d0ff91e647194fb23afdb7c86a94d444b40e Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Thu, 4 Apr 2024 08:11:26 +0200 Subject: [PATCH 12/19] suggestion for rectangle correction with lines. 
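The heuristic: a long, solid, roughly horizontal line just above the detected
material description rectangle usually marks the true upper boundary of the
description column, so the rectangle is extended upwards to that line. A
self-contained toy illustration of the candidate test used below (`Point` and
`Line` mirror `stratigraphy.util.dataclasses`; `is_boundary_candidate` is a
hypothetical name, and this is a sketch rather than the shipped code):

    from dataclasses import dataclass

    @dataclass
    class Point:
        x: float
        y: float

    @dataclass
    class Line:
        start: Point
        end: Point

    def is_boundary_candidate(line: Line, rect_width: float, rect_top: float) -> bool:
        # Sketch: a line qualifies if it is long (> 70% of the rectangle width)
        # and its vertical midpoint lies at or just above the rectangle's
        # current top edge (5 pt tolerance).
        length = ((line.end.x - line.start.x) ** 2 + (line.end.y - line.start.y) ** 2) ** 0.5
        mid_y = (line.start.y + line.end.y) / 2
        return length > 0.7 * rect_width and mid_y < rect_top + 5

    print(is_boundary_candidate(Line(Point(0, 95), Point(80, 95)), rect_width=100, rect_top=100))  # True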
--- src/stratigraphy/extract.py | 42 ++++++++++++++++++++++++++++ src/stratigraphy/util/dataclasses.py | 3 ++ 2 files changed, 45 insertions(+) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 9cd622ae..5e312730 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -84,6 +84,9 @@ def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict] for depth_column in depth_columns: material_description_rect = find_material_description_column(lines, depth_column) if material_description_rect: + material_description_rect = adjust_material_description_rect( + material_description_rect, page.rect.width, geometric_lines + ) pairs.append((depth_column, material_description_rect)) # lowest score first @@ -125,6 +128,9 @@ def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict] # Fallback when no depth column was found material_description_rect = find_material_description_column(lines, depth_column=None) if material_description_rect: + material_description_rect = adjust_material_description_rect( + material_description_rect, page.rect.width, geometric_lines + ) description_lines = get_description_lines(lines, material_description_rect) description_blocks = get_description_blocks( description_lines, @@ -456,6 +462,42 @@ def is_below(best_x0, best_y1, line): return candidate_rects[0] +def adjust_material_description_rect( + material_description_rect: fitz.Rect, page_width: float, geometric_lines: list[Line] +) -> fitz.Rect: + """Adjust the material description rectangle based on the geometric lines. + + Tries to adjust the material description rectangle to the top until a long solid line is reached. + + Args: + material_description_rect (fitz.Rect): _description_ + page_width (float): _description_ + geometric_lines (list[Line]): _description_ + + Returns: + fitz.Rect: _description_ + """ + material_description_rect_top = material_description_rect.y0 + max_line_y = 0 + for line in geometric_lines: + if ( + line.length() > 0.7 * material_description_rect.width + and (line.start.y + line.end.y) / 2 < material_description_rect_top + 5 + ): + max_line_y = max(max_line_y, (line.start.y + line.end.y) / 2) - 5 + + if max_line_y > material_description_rect_top: + new_coordinates = [ + material_description_rect.x0, + max_line_y, + material_description_rect.x1, + material_description_rect.y1, + ] + return fitz.Rect(*new_coordinates) + else: + return material_description_rect + + def perform_matching(directory: Path, **params: dict) -> dict: """Perform the matching of text blocks with depth intervals. diff --git a/src/stratigraphy/util/dataclasses.py b/src/stratigraphy/util/dataclasses.py index 48436128..b787743b 100644 --- a/src/stratigraphy/util/dataclasses.py +++ b/src/stratigraphy/util/dataclasses.py @@ -47,6 +47,9 @@ def distance_to(self, point: Point) -> float: - (self.start.x - point.x) * (self.end.y - self.start.y) ) / np.sqrt((self.end.x - self.start.x) ** 2 + (self.end.y - self.start.y) ** 2) + def length(self) -> float: + return np.sqrt((self.end.x - self.start.x) ** 2 + (self.end.y - self.start.y) ** 2) + def slope(self) -> float: return (self.end.y - self.start.y) / (self.end.x - self.start.x) if self.end.x - self.start.x != 0 else np.inf From 1edbb7975b39a389eefefaad8676f149acdeb05b Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Thu, 4 Apr 2024 14:24:29 +0200 Subject: [PATCH 13/19] refactoring evaluate_matching. 
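Separates data preparation from scoring: `evaluate_matching` no longer reads
`predictions.json`, merges the ground truth, or draws the predictions on pdf
pages; it now takes the in-memory predictions together with the precomputed
number of ground-truth values per file and only computes the metrics. The
previously private `_add_ground_truth_to_predictions` becomes the public
`add_ground_truth_to_predictions`, `draw_lines_on_pdfs` is renamed to
`draw_lines_on_page` to match its per-page signature, and the caller in
`main.py` now composes these steps explicitly, as the diff below shows.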
--- src/stratigraphy/benchmark/score.py | 30 ++++++++--------------------- src/stratigraphy/line_detection.py | 2 +- src/stratigraphy/main.py | 16 +++++++++------ 3 files changed, 19 insertions(+), 29 deletions(-) diff --git a/src/stratigraphy/benchmark/score.py b/src/stratigraphy/benchmark/score.py index 7a552091..a655bc7e 100644 --- a/src/stratigraphy/benchmark/score.py +++ b/src/stratigraphy/benchmark/score.py @@ -1,6 +1,5 @@ """Evaluate the predictions against the ground truth.""" -import json import logging import os from pathlib import Path @@ -9,7 +8,6 @@ from dotenv import load_dotenv from stratigraphy import DATAPATH from stratigraphy.benchmark.ground_truth import GroundTruth -from stratigraphy.util.draw import draw_predictions from stratigraphy.util.util import parse_text load_dotenv() @@ -56,34 +54,20 @@ def f1(precision: float, recall: float) -> float: return 0 -def evaluate_matching( - predictions_path: Path, ground_truth_path: Path, directory: Path, out_directory: Path, skip_draw_predictions: bool -) -> tuple[dict, pd.DataFrame]: +def evaluate_matching(predictions: dict, number_of_truth_values: dict) -> tuple[dict, pd.DataFrame]: """Calculate F1, precision and recall for the predictions. Calculate F1, precision and recall for the individual documents as well as overall. The individual document metrics are returned as a DataFrame. Args: - predictions_path (Path): Path to the predictions.json file. - ground_truth_path (Path): Path to the ground truth annotated data. - directory (Path): Path to the directory containing the pdf files. - out_directory (Path): Path to the directory where the evaluation images should be saved. - skip_draw_predictions (bool): Whether to draw the predictions on the pdf pages. + predictions (dict): The predictions. + number_of_truth_values (dict): The number of ground truth values per file. Returns: tuple[dict, pd.DataFrame]: A tuple containing the overall F1, precision and recall as a dictionary and the individual document metrics as a DataFrame. """ - ground_truth = GroundTruth(ground_truth_path) - with open(predictions_path) as in_file: - predictions = json.load(in_file) - - predictions, number_of_truth_values = _add_ground_truth_to_predictions(predictions, ground_truth) - - if not skip_draw_predictions: - draw_predictions(predictions, directory, out_directory) - document_level_metrics = { "document_name": [], "F1": [], @@ -137,16 +121,18 @@ def evaluate_matching( }, pd.DataFrame(document_level_metrics) -def _add_ground_truth_to_predictions(predictions: dict, ground_truth: GroundTruth) -> (dict, dict): +def add_ground_truth_to_predictions(predictions: dict, ground_truth_path: Path) -> tuple[dict, dict]: """Add the ground truth to the predictions. Args: predictions (dict): The predictions. - ground_truth (GroundTruth): The ground truth. + ground_truth_path (Path): The path to the ground truth file. Returns: - (dict, dict): The predictions with the ground truth added, and the number of ground truth values per file. + tuple[dict, dict]: The predictions with the ground truth added, and the number of ground truth values per file. 
""" + ground_truth = GroundTruth(ground_truth_path) + number_of_truth_values = {} for file, file_predictions in predictions.items(): ground_truth_for_file = ground_truth.for_file(file) diff --git a/src/stratigraphy/line_detection.py b/src/stratigraphy/line_detection.py index 98e323f3..a5870cff 100644 --- a/src/stratigraphy/line_detection.py +++ b/src/stratigraphy/line_detection.py @@ -87,7 +87,7 @@ def extract_lines(page: fitz.Page, line_detection_params: dict) -> list[Line]: return lines -def draw_lines_on_pdfs(filename: str, page: fitz.Page, geometric_lines: list[Line]): +def draw_lines_on_page(filename: str, page: fitz.Page, geometric_lines: list[Line]): """Draw lines on pdf pages and stores them as artifacts in mlflow. Note: now the function draw_lines_on_pdfs may not even be needed any more. diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index 03005d17..e61c7222 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -10,9 +10,10 @@ from dotenv import load_dotenv from stratigraphy import DATAPATH -from stratigraphy.benchmark.score import evaluate_matching +from stratigraphy.benchmark.score import add_ground_truth_to_predictions, evaluate_matching from stratigraphy.extract import process_page -from stratigraphy.line_detection import draw_lines_on_pdfs, extract_lines, line_detection_params +from stratigraphy.line_detection import draw_lines_on_page, extract_lines, line_detection_params +from stratigraphy.util.draw import draw_predictions from stratigraphy.util.util import flatten, read_params load_dotenv() @@ -132,15 +133,18 @@ def start_pipeline( if draw_lines: logger.info("Drawing lines on pdf pages.") - draw_lines_on_pdfs(filename, page, geometric_lines) + draw_lines_on_page(filename, page, geometric_lines) with open(predictions_path, "w") as file: file.write(json.dumps(predictions)) # evaluate the predictions - metrics, document_level_metrics = evaluate_matching( - predictions_path, ground_truth_path, input_directory, out_directory, skip_draw_predictions - ) + predictions, number_of_truth_values = add_ground_truth_to_predictions(predictions, ground_truth_path) + + if not skip_draw_predictions: + draw_predictions(predictions, input_directory, out_directory) + + metrics, document_level_metrics = evaluate_matching(predictions, number_of_truth_values) document_level_metrics.to_csv(temp_directory / "document_level_metrics.csv") # mlflow.log_artifact expects a file if mlflow_tracking: From ebe0bac5ba58fdb4e59f41792eaaf5486a42f5bc Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Thu, 4 Apr 2024 14:27:10 +0200 Subject: [PATCH 14/19] Undo mistaken commit. 
--- src/stratigraphy/extract.py | 42 ---------------------------- src/stratigraphy/util/dataclasses.py | 3 -- 2 files changed, 45 deletions(-) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 5e312730..9cd622ae 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -84,9 +84,6 @@ def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict] for depth_column in depth_columns: material_description_rect = find_material_description_column(lines, depth_column) if material_description_rect: - material_description_rect = adjust_material_description_rect( - material_description_rect, page.rect.width, geometric_lines - ) pairs.append((depth_column, material_description_rect)) # lowest score first @@ -128,9 +125,6 @@ def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict] # Fallback when no depth column was found material_description_rect = find_material_description_column(lines, depth_column=None) if material_description_rect: - material_description_rect = adjust_material_description_rect( - material_description_rect, page.rect.width, geometric_lines - ) description_lines = get_description_lines(lines, material_description_rect) description_blocks = get_description_blocks( description_lines, @@ -462,42 +456,6 @@ def is_below(best_x0, best_y1, line): return candidate_rects[0] -def adjust_material_description_rect( - material_description_rect: fitz.Rect, page_width: float, geometric_lines: list[Line] -) -> fitz.Rect: - """Adjust the material description rectangle based on the geometric lines. - - Tries to adjust the material description rectangle to the top until a long solid line is reached. - - Args: - material_description_rect (fitz.Rect): _description_ - page_width (float): _description_ - geometric_lines (list[Line]): _description_ - - Returns: - fitz.Rect: _description_ - """ - material_description_rect_top = material_description_rect.y0 - max_line_y = 0 - for line in geometric_lines: - if ( - line.length() > 0.7 * material_description_rect.width - and (line.start.y + line.end.y) / 2 < material_description_rect_top + 5 - ): - max_line_y = max(max_line_y, (line.start.y + line.end.y) / 2) - 5 - - if max_line_y > material_description_rect_top: - new_coordinates = [ - material_description_rect.x0, - max_line_y, - material_description_rect.x1, - material_description_rect.y1, - ] - return fitz.Rect(*new_coordinates) - else: - return material_description_rect - - def perform_matching(directory: Path, **params: dict) -> dict: """Perform the matching of text blocks with depth intervals. 
diff --git a/src/stratigraphy/util/dataclasses.py b/src/stratigraphy/util/dataclasses.py index b787743b..48436128 100644 --- a/src/stratigraphy/util/dataclasses.py +++ b/src/stratigraphy/util/dataclasses.py @@ -47,9 +47,6 @@ def distance_to(self, point: Point) -> float: - (self.start.x - point.x) * (self.end.y - self.start.y) ) / np.sqrt((self.end.x - self.start.x) ** 2 + (self.end.y - self.start.y) ** 2) - def length(self) -> float: - return np.sqrt((self.end.x - self.start.x) ** 2 + (self.end.y - self.start.y) ** 2) - def slope(self) -> float: return (self.end.y - self.start.y) / (self.end.x - self.start.x) if self.end.x - self.start.x != 0 else np.inf From 2166ad13e1bb761e374af1d9d25878ff90ec6f19 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Thu, 4 Apr 2024 15:19:39 +0200 Subject: [PATCH 15/19] Minor refactoring --- src/stratigraphy/main.py | 1 - src/stratigraphy/util/draw.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index e61c7222..f7213a6f 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -130,7 +130,6 @@ def start_pipeline( "layers": layer_predictions, "depths_materials_column_pairs": depths_materials_column_pairs, } - if draw_lines: logger.info("Drawing lines on pdf pages.") draw_lines_on_page(filename, page, geometric_lines) diff --git a/src/stratigraphy/util/draw.py b/src/stratigraphy/util/draw.py index 948bd430..c5a814f4 100644 --- a/src/stratigraphy/util/draw.py +++ b/src/stratigraphy/util/draw.py @@ -31,14 +31,13 @@ def draw_predictions(predictions: dict, directory: Path, out_directory: Path) -> - Assignments of material description text blocks to depth intervals (if available) Args: - predictions (dict): Content of the predictions.json file.. + predictions (dict): Content of the predictions.json file. directory (Path): Path to the directory containing the pdf files. out_directory (Path): Path to the output directory where the images are saved. """ if directory.is_file(): # deal with the case when we pass a file instead of a directory directory = directory.parent for file in predictions: - logger.info(f"Evaluating {file}.") with fitz.Document(directory / file) as doc: for page_index, page in enumerate(doc): page_number = page_index + 1 From d32976b11860d0dcb12cbc3d5163f9dea9a5c81c Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Fri, 5 Apr 2024 10:33:09 +0200 Subject: [PATCH 16/19] remove unused function. Remove draw_lines and implement it in main.py directly. --- src/stratigraphy/extract.py | 34 ------------------------------ src/stratigraphy/line_detection.py | 19 ----------------- src/stratigraphy/main.py | 16 ++++++++++---- 3 files changed, 12 insertions(+), 57 deletions(-) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 9cd622ae..cfd44633 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -2,8 +2,6 @@ import logging import math -import os -from pathlib import Path import fitz @@ -454,35 +452,3 @@ def is_below(best_x0, best_y1, line): return max(candidate_rects, key=lambda rect: score_column_match(depth_column, rect)) else: return candidate_rects[0] - - -def perform_matching(directory: Path, **params: dict) -> dict: - """Perform the matching of text blocks with depth intervals. - - Args: - directory (Path): Path to the directory that contains the pdfs. - **params (dict): Additional parameters for the matching pipeline. - - Returns: - dict: The predictions. 
- """ - for root, _dirs, files in os.walk(directory): - output = {} - for filename in files: - if filename.endswith(".pdf"): - in_path = os.path.join(root, filename) - logger.info("Processing file: %s", in_path) - output[filename] = {} - - with fitz.Document(in_path) as doc: - for page_index, page in enumerate(doc): - page_number = page_index + 1 - logger.info("Processing page %s", page_number) - - predictions, depths_materials_column_pairs = process_page(page, **params) - - output[filename][f"page_{page_number}"] = { - "layers": predictions, - "depths_materials_column_pairs": depths_materials_column_pairs, - } - return output diff --git a/src/stratigraphy/line_detection.py b/src/stratigraphy/line_detection.py index a5870cff..6f9aa3b8 100644 --- a/src/stratigraphy/line_detection.py +++ b/src/stratigraphy/line_detection.py @@ -14,7 +14,6 @@ merge_parallel_lines_approximately, merge_parallel_lines_efficiently, ) -from stratigraphy.util.plot_utils import plot_lines from stratigraphy.util.util import line_from_array, read_params load_dotenv() @@ -85,21 +84,3 @@ def extract_lines(page: fitz.Page, line_detection_params: dict) -> list[Line]: lines, tol=merging_params["merging_tolerance"], angle_threshold=merging_params["angle_threshold"] ) return lines - - -def draw_lines_on_page(filename: str, page: fitz.Page, geometric_lines: list[Line]): - """Draw lines on pdf pages and stores them as artifacts in mlflow. - - Note: now the function draw_lines_on_pdfs may not even be needed any more. - - Args: - filename (str): The filename of the pdf. - page (fitz.Page): The page to draw lines on. - geometric_lines (list[Line]): The lines to draw on the pdf page. - """ - if not mlflow_tracking: - raise Warning("MLFlow tracking is not enabled. MLFLow is required to store the images.") - import mlflow - - img = plot_lines(page, geometric_lines, scale_factor=line_detection_params["pdf_scale_factor"]) - mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png") diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index f7213a6f..9856de9e 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -12,8 +12,9 @@ from stratigraphy import DATAPATH from stratigraphy.benchmark.score import add_ground_truth_to_predictions, evaluate_matching from stratigraphy.extract import process_page -from stratigraphy.line_detection import draw_lines_on_page, extract_lines, line_detection_params +from stratigraphy.line_detection import extract_lines, line_detection_params from stratigraphy.util.draw import draw_predictions +from stratigraphy.util.plot_utils import plot_lines from stratigraphy.util.util import flatten, read_params load_dotenv() @@ -130,9 +131,16 @@ def start_pipeline( "layers": layer_predictions, "depths_materials_column_pairs": depths_materials_column_pairs, } - if draw_lines: - logger.info("Drawing lines on pdf pages.") - draw_lines_on_page(filename, page, geometric_lines) + if draw_lines: # could be changed to if draw_lines and mflow_tracking: + if not mlflow_tracking: + logger.warning( + "MLFlow tracking is not enabled. MLFLow is required to store the images." 
+                            )
+                        else:
+                            img = plot_lines(
+                                page, geometric_lines, scale_factor=line_detection_params["pdf_scale_factor"]
+                            )
+                            mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png")
 
     with open(predictions_path, "w") as file:
         file.write(json.dumps(predictions))

From 3421484361f00e7b0ea1d478f527bf94ec190583 Mon Sep 17 00:00:00 2001
From: Renato Durrer
Date: Fri, 5 Apr 2024 11:35:17 +0200
Subject: [PATCH 17/19] Update comments.

---
 src/stratigraphy/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py
index 9856de9e..d9dc239a 100644
--- a/src/stratigraphy/main.py
+++ b/src/stratigraphy/main.py
@@ -145,7 +145,7 @@ def start_pipeline(
     with open(predictions_path, "w") as file:
         file.write(json.dumps(predictions))
 
-    # evaluate the predictions
+    # evaluate the predictions; if the ground truth file doesn't exist, the predictions are not changed.
     predictions, number_of_truth_values = add_ground_truth_to_predictions(predictions, ground_truth_path)

From a4141fc854600e0bf8f4a11eaa80ae09dc7697fe Mon Sep 17 00:00:00 2001
From: Renato Durrer
Date: Fri, 5 Apr 2024 11:40:11 +0200
Subject: [PATCH 18/19] Update readme; correct script name.

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 61155a08..08cb6cc9 100644
--- a/README.md
+++ b/README.md
@@ -44,9 +44,9 @@ To execute the data extraction pipeline, follow these steps:
 
    The main script for the extraction pipeline is located at `src/stratigraphy/main.py`. A cli command is created to run this script.
 
-   Run `boreholes-extract-materials` to run the main extraction script. With the default options, the command will source all PDFs from the `data/Benchmark` directory and create PNG files in the `data/Benchmark/extract` directory.
+   Run `boreholes-extract-layers` to run the main extraction script. With the default options, the command will source all PDFs from the `data/Benchmark` directory and create PNG files in the `data/Benchmark/extract` directory.
 
-   Use `boreholes-extract-materials --help` to see all options for the extraction script.
+   Use `boreholes-extract-layers --help` to see all options for the extraction script.
 
 3. **Check the results**
 

From 901b02dda15c717ae1b3c3feec06b195124e7e7c Mon Sep 17 00:00:00 2001
From: Renato Durrer
Date: Fri, 5 Apr 2024 11:41:38 +0200
Subject: [PATCH 19/19] Update documentation to clarify that input_directory can be either a directory or a pdf path.

---
 src/stratigraphy/main.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py
index d9dc239a..3baa75e2 100644
--- a/src/stratigraphy/main.py
+++ b/src/stratigraphy/main.py
@@ -33,7 +33,7 @@
     "--input_directory",
     type=click.Path(exists=True, path_type=Path),
     default=DATAPATH / "Benchmark",
-    help="Path to the input directory.",
+    help="Path to the input directory, or path to a single pdf file.",
 )
 @click.option(
     "-g",
@@ -82,7 +82,7 @@ def start_pipeline(
     \f
     Args:
-        input_directory (Path): The directory containing the pdf files.
+        input_directory (Path): The directory containing the pdf files. Can also be the path to a single pdf file.
         ground_truth_path (Path): The path to the ground truth json file.
         out_directory (Path): The directory to store the evaluation results.
         predictions_path (Path): The path to the predictions file.