From 573b9f85e455fa43727f3d2fd24d4cf67ff50cb6 Mon Sep 17 00:00:00 2001
From: Renato Durrer
Date: Tue, 2 Apr 2024 18:19:45 +0200
Subject: [PATCH 01/19] Create CLI interface.

---
 environment-dev.yml      |  3 ++-
 environment-prod.yml     |  1 +
 pyproject.toml           |  3 +++
 src/stratigraphy/main.py | 49 ++++++++++++++++++++++++++++++++--------
 4 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/environment-dev.yml b/environment-dev.yml
index d77a88ca..552f7558 100644
--- a/environment-dev.yml
+++ b/environment-dev.yml
@@ -10,7 +10,7 @@ dependencies:
   - pathlib==1.0.1
   - opencv==4.9.0
   - python-dotenv==1.0.1
-  - pytest==8.1.1
+  - click==8.1.7
   - pip
   # dev dependencies
   - matplotlib==3.8.0
@@ -18,6 +18,7 @@ dependencies:
   - jupyterlab==4.1.3
   - black==24.2.0
   - pre-commit==3.6.2
+  - pytest==8.1.1
   - pip:
       # prod pip dependencies; needs to be a strict copy of environment-prod.yml
       - amazon-textract-textractor
diff --git a/environment-prod.yml b/environment-prod.yml
index 02e3b443..da30c50e 100644
--- a/environment-prod.yml
+++ b/environment-prod.yml
@@ -9,6 +9,7 @@ dependencies:
   - pathlib==1.0.1
   - opencv==4.9.0
   - python-dotenv==1.0.1
+  - click==8.1.7
   - pip
   - pip:
       - amazon-textract-textractor
diff --git a/pyproject.toml b/pyproject.toml
index bb3efa8d..ca0fbde4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,6 +7,9 @@ requires-python = ">=3.10"
 dependencies = [
 ]
 
+[project.scripts]
+boreholes-extract-materials = "stratigraphy.main:start_pipeline"
+
 [tool.ruff.lint]
 select = [
     # pydocstyle
diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py
index f28463cd..1c437d83 100644
--- a/src/stratigraphy/main.py
+++ b/src/stratigraphy/main.py
@@ -6,6 +6,7 @@
 import os
 from pathlib import Path
 
+import click
 import fitz
 from dotenv import load_dotenv
 
@@ -502,9 +503,40 @@ def perform_matching(directory: Path, **params: dict) -> dict:
     return output
 
 
-if __name__ == "__main__":
-    # setup mlflow tracking; should be started before any other code
-    # such that tracking is enabled in other parts of the code.
+@click.command()
+@click.option(
+    "--input_directory",
+    type=click.Path(exists=True, path_type=Path),
+    default=DATAPATH / "Benchmark",
+    help="Path to the input directory.",
+)
+@click.option(
+    "--ground_truth_path",
+    type=click.Path(exists=True, path_type=Path),
+    default=DATAPATH / "Benchmark" / "ground_truth.json",
+    help="Path to the ground truth file.",
+)
+@click.option(
+    "--out_directory",
+    type=click.Path(path_type=Path),
+    default=DATAPATH / "Benchmark" / "evaluation",
+    help="Path to the output directory.",
+)
+@click.option(
+    "--predictions_path",
+    type=click.Path(path_type=Path),
+    default=DATAPATH / "Benchmark" / "extract" / "predictions.json",
+    help="Path to the predictions file.",
+)
+def start_pipeline(input_directory: Path, ground_truth_path: Path, out_directory: Path, predictions_path: Path):
+    """Description.
+
+    Args:
+        input_directory (Path): _description_
+        ground_truth_path (Path): _description_
+        out_directory (Path): _description_
+        predictions_path (Path): _description_
+    """
     if mlflow_tracking:
         import mlflow
 
@@ -513,13 +545,8 @@ def perform_matching(directory: Path, **params: dict) -> dict:
         mlflow.log_params(flatten(line_detection_params))
         mlflow.log_params(flatten(matching_params))
 
-    # instantiate all paths
-    input_directory = DATAPATH / "Benchmark"
-    ground_truth_path = input_directory / "ground_truth.json"
-    out_directory = input_directory / "evaluation"
-    predictions_path = input_directory / "extract" / "predictions.json"
     temp_directory = DATAPATH / "_temp"  # temporary directory to dump files for mlflow artifact logging
-
+    # check if directories exist and create them when necessary
     # check if directories exist and create them when necessary
     out_directory.mkdir(parents=True, exist_ok=True)
     temp_directory.mkdir(parents=True, exist_ok=True)
@@ -538,3 +565,7 @@ def perform_matching(directory: Path, **params: dict) -> dict:
     if mlflow_tracking:
         mlflow.log_metrics(metrics)
         mlflow.log_artifact(temp_directory / "document_level_metrics.csv")
+
+
+if __name__ == "__main__":
+    start_pipeline()
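The `[project.scripts]` entry above turns `start_pipeline` into a console command once the package is installed, so the pipeline no longer has to be launched via `python src/stratigraphy/main.py`. A typical invocation (paths illustrative; every option falls back to the `DATAPATH / "Benchmark"` defaults defined in the patch):

    boreholes-extract-materials --input_directory data/Benchmark --out_directory data/Benchmark/evaluation

The command can also be exercised without installing it, for example from a test, using click's test runner — a minimal sketch, assuming the package is importable:

    from click.testing import CliRunner

    from stratigraphy.main import start_pipeline

    runner = CliRunner()
    result = runner.invoke(start_pipeline, ["--input_directory", "data/Benchmark"])
    print(result.exit_code, result.output)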
+ """ + words = [] + words_by_line = {} + for x0, y0, x1, y1, word, block_no, line_no, _word_no in fitz.utils.get_text(page, "words"): + rect = fitz.Rect(x0, y0, x1, y1) * page.rotation_matrix + depth_interval = DepthInterval(rect, word) + words.append(TextLine([depth_interval])) + key = f"{block_no}_{line_no}" + if key not in words_by_line: + words_by_line[key] = [] + words_by_line[key].append(depth_interval) + + raw_lines = [TextLine(words_by_line[key]) for key in words_by_line] + + lines = [] + current_line_words = [] + for line_index, raw_line in enumerate(raw_lines): + for word_index, word in enumerate(raw_line.words): + remaining_line = TextLine(raw_line.words[word_index:]) + if len(current_line_words) > 0 and remaining_line.is_line_start(lines, raw_lines[line_index + 1 :]): + lines.append(TextLine(current_line_words)) + current_line_words = [] + current_line_words.append(word) + if len(current_line_words): + lines.append(TextLine(current_line_words)) + current_line_words = [] + + depth_column_entries = find_depth_columns.depth_column_entries(words, include_splits=True) + layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words) + + used_entry_rects = [] + for column in layer_depth_columns: + for entry in column.entries: + used_entry_rects.extend([entry.start.rect, entry.end.rect]) + + depth_column_entries = [ + entry + for entry in find_depth_columns.depth_column_entries(words, include_splits=False) + if entry.rect not in used_entry_rects + ] + + depth_columns: list[DepthColumn] = layer_depth_columns + depth_columns.extend(find_depth_columns.find_depth_columns(depth_column_entries, words)) + + pairs = [] + for depth_column in depth_columns: + material_description_rect = find_material_description_column(lines, depth_column) + if material_description_rect: + pairs.append((depth_column, material_description_rect)) + + # lowest score first + pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1], words)) + + to_delete = [] + for i, (_depth_column, material_description_rect) in enumerate(pairs): + for _depth_column_2, material_description_rect_2 in pairs[i + 1 :]: + if material_description_rect.intersects(material_description_rect_2): + to_delete.append(i) + continue + filtered_pairs = [item for index, item in enumerate(pairs) if index not in to_delete] + + geometric_lines = extract_lines(page, line_detection_params) + + groups = [] # list of matched depth intervals and text blocks + # groups is of the form: ["depth_interval": BoundaryInterval, "block": TextBlock] + if len(filtered_pairs): # match depth column items with material description + for depth_column, material_description_rect in filtered_pairs: + description_lines = get_description_lines(lines, material_description_rect) + if len(description_lines) > 1: + new_groups = match_columns( + depth_column, description_lines, geometric_lines, material_description_rect, **params + ) + groups.extend(new_groups) + json_filtered_pairs = [ + { + "depth_column": depth_column.to_json(), + "material_description_rect": [ + material_description_rect.x0, + material_description_rect.y0, + material_description_rect.x1, + material_description_rect.y1, + ], + } + for depth_column, material_description_rect in filtered_pairs + ] + + else: + json_filtered_pairs = [] + # Fallback when no depth column was found + material_description_rect = find_material_description_column(lines, depth_column=None) + if material_description_rect: + description_lines = get_description_lines(lines, material_description_rect) + 
description_blocks = get_description_blocks( + description_lines, + geometric_lines, + material_description_rect, + params["block_line_ratio"], + params["left_line_length_threshold"], + ) + groups.extend([{"block": block} for block in description_blocks]) + json_filtered_pairs.extend( + [ + { + "depth_column": None, + "material_description_rect": [ + material_description_rect.x0, + material_description_rect.y0, + material_description_rect.x1, + material_description_rect.y1, + ], + } + ] + ) + predictions = [ + {"material_description": group["block"].to_json(), "depth_interval": group["depth_interval"].to_json()} + if "depth_interval" in group + else {"material_description": group["block"].to_json()} + for group in groups + ] + predictions = parse_and_remove_empty_predictions(predictions) + return predictions, json_filtered_pairs + + +def score_column_match( + depth_column: DepthColumn, + material_description_rect: fitz.Rect, + all_words: list[TextLine] | None = None, + **params: dict, +) -> float: + """Scores the match between a depth column and a material description. + + Args: + depth_column (DepthColumn): The depth column. + material_description_rect (fitz.Rect): The material description rectangle. + all_words (list[TextLine] | None, optional): List of the available textlines. Defaults to None. + **params (dict): Additional parameters for the matching pipeline. Kept for compatibility with the pipeline. + + Returns: + float: The score of the match. + """ + rect = depth_column.rect() + top = rect.y0 + bottom = rect.y1 + right = rect.x1 + distance = ( + abs(top - material_description_rect.y0) + + abs(bottom - material_description_rect.y1) + + abs(right - material_description_rect.x0) + ) + + height = bottom - top + + noise_count = depth_column.noise_count(all_words) if all_words else 0 + + return (height - distance) * math.pow(0.8, noise_count) + + +def match_columns( + depth_column: DepthColumn, + description_lines: list[TextLine], + geometric_lines: list[Line], + material_description_rect: fitz.Rect, + **params: dict, +) -> list: + """Match the depth column entries with the description lines. + + This function identifies groups of depth intervals and text blocks that are likely to match. + In this process, the number of text blocks is adjusted to match the number of depth intervals. + + Args: + depth_column (DepthColumn): The depth column. + description_lines (list[TextLine]): The description lines. + geometric_lines (list[Line]): The geometric lines. + material_description_rect (fitz.Rect): The material description rectangle. + **params (dict): Additional parameters for the matching pipeline. + + Returns: + list: The matched depth intervals and text blocks. + """ + return [ + element + for group in depth_column.identify_groups( + description_lines, geometric_lines, material_description_rect, **params + ) + for element in transform_groups(group["depth_intervals"], group["blocks"], **params) + ] + + +def transform_groups( + depth_intervals: list[Interval], blocks: list[TextBlock], **params: dict +) -> list[dict[str, Interval | TextBlock]]: + """Transforms the text blocks such that their number equals the number of depth intervals. + + If there are more depth intervals than text blocks, text blocks are splitted. When there + are more text blocks than depth intervals, text blocks are merged. If the number of text blocks + and depth intervals equals, we proceed with the pairing. + + Args: + depth_intervals (List[Interval]): The depth intervals from the pdf. 
+ blocks (List[TextBlock]): Found textblocks from the pdf. + **params (dict): Additional parameters for the matching pipeline. + + Returns: + List[Dict[str, Union[Interval, TextBlock]]]: Pairing of text blocks and depth intervals. + """ + if len(depth_intervals) == 0: + return [] + elif len(depth_intervals) == 1: + concatenated_block = TextBlock( + [line for block in blocks for line in block.lines] + ) # concatenate all text lines within a block; line separation flag does not matter here. + return [{"depth_interval": depth_intervals[0], "block": concatenated_block}] + else: + if len(blocks) < len(depth_intervals): + blocks = split_blocks_by_textline_length(blocks, target_split_count=len(depth_intervals) - len(blocks)) + + if len(blocks) > len(depth_intervals): + # create additional depth intervals with end & start value None to match the number of blocks + depth_intervals.extend([BoundaryInterval(None, None) for _ in range(len(blocks) - len(depth_intervals))]) + + return [ + {"depth_interval": depth_interval, "block": block} + for depth_interval, block in zip(depth_intervals, blocks, strict=False) + ] + + +def merge_blocks_by_vertical_spacing(blocks: list[TextBlock], target_merge_count: int) -> list[TextBlock]: + """Merge textblocks without any geometric lines that separates them. + + Note: Deprecated. Currently not in use any more. Kept here until we are sure that it is not needed anymore. + + The logic looks at the distances between the textblocks and merges them if they are closer + than a certain cutoff. + + Args: + blocks (List[TextBlock]): Textblocks that are to be merged. + target_merge_count (int): the number of merges that we'd like to happen (i.e. we'd like the total number of + blocks to be reduced by this number) + + Returns: + List[TextBlock]: The merged textblocks. + """ + distances = [] + for block_index in range(len(blocks) - 1): + distances.append(block_distance(blocks[block_index], blocks[block_index + 1])) + cutoff = sorted(distances)[target_merge_count - 1] # merge all blocks that have a distance smaller than this + merged_count = 0 + merged_blocks = [] + current_merged_block = blocks[0] + for block_index in range(len(blocks) - 1): + new_block = blocks[block_index + 1] + if ( + merged_count < target_merge_count + and block_distance(blocks[block_index], blocks[block_index + 1]) <= cutoff + ): + current_merged_block = current_merged_block.concatenate(new_block) + merged_count += 1 + else: + merged_blocks.append(current_merged_block) + current_merged_block = new_block + + if len(current_merged_block.lines): + merged_blocks.append(current_merged_block) + return merged_blocks + + +def split_blocks_by_textline_length(blocks: list[TextBlock], target_split_count: int) -> list[TextBlock]: + """Split textblocks without any geometric lines that separates them. + + The logic looks at the lengths of the text lines and cuts them off + if there are textlines that are shorter than others. + # TODO: Extend documentation about logic. + + Args: + blocks (List[TextBlock]): Textblocks that are to be split. + target_split_count (int): the number of splits that we'd like to happen (i.e. we'd like the total number of + blocks to be increased by this number) + + Returns: + List[TextBlock]: The split textblocks. 
+ """ + line_lengths = sorted([line.rect.x1 for block in blocks for line in block.lines[:-1]]) + if len(line_lengths) <= target_split_count: # In that case each line is a block + return [TextBlock([line]) for block in blocks for line in block.lines] + else: + cutoff_values = line_lengths[:target_split_count] # all lines inside cutoff_values will be split line + split_blocks = [] + current_block_lines = [] + for block in blocks: + for line_index in range(block.line_count): + line = block.lines[line_index] + current_block_lines.append(line) + if line_index < block.line_count - 1 and line.rect.x1 in cutoff_values: + split_blocks.append(TextBlock(current_block_lines)) + cutoff_values.remove(line.rect.x1) + current_block_lines = [] + if len(current_block_lines): + split_blocks.append(TextBlock(current_block_lines)) + current_block_lines = [] + if ( + block.is_terminated_by_line + ): # If block was terminated by a line, populate the flag to the last element of split_blocks. + split_blocks[-1].is_terminated_by_line = True + return split_blocks + + +def find_material_description_column( + lines: list[TextLine], depth_column: DepthColumn, **params: dict +) -> fitz.Rect | None: + """Find the material description column given a depth column. + + Args: + lines (list[TextLine]): The text lines of the page. + depth_column (DepthColumn): The depth column. + **params (dict): Additional parameters for the matching pipeline. + + Returns: + fitz.Rect | None: The material description column. + """ + if depth_column: + above_depth_column = [ + line + for line in lines + if x_overlap(line.rect, depth_column.rect()) and line.rect.y0 < depth_column.rect().y0 + ] + + min_y0 = max(line.rect.y0 for line in above_depth_column) if len(above_depth_column) else -1 + + def check_y0_condition(y0): + return y0 > min_y0 and y0 < depth_column.rect().y1 + else: + + def check_y0_condition(y0): + return True + + candidate_description = [line for line in lines if check_y0_condition(line.rect.y0)] + is_description = [line for line in candidate_description if line.is_description] + + if len(candidate_description) == 0: + return + + description_clusters = [] + while len(is_description) > 0: + coverage_by_generating_line = [ + [other for other in is_description if x_overlap_significant_smallest(line.rect, other.rect, 0.5)] + for line in is_description + ] + + def filter_coverage(coverage): + if len(coverage): + min_x0 = min(line.rect.x0 for line in coverage) + max_x1 = max(line.rect.x1 for line in coverage) + x0_threshold = max_x1 - 0.4 * ( + max_x1 - min_x0 + ) # how did we determine the 0.4? Should it be a parameter? What would it do if we were to change it? + return [line for line in coverage if line.rect.x0 < x0_threshold] + else: + return [] + + coverage_by_generating_line = [filter_coverage(coverage) for coverage in coverage_by_generating_line] + max_coverage = max(coverage_by_generating_line, key=len) + description_clusters.append(max_coverage) + is_description = [line for line in is_description if line not in max_coverage] + + candidate_rects = [] + + for cluster in description_clusters: + best_y0 = min([line.rect.y0 for line in cluster]) + best_y1 = max([line.rect.y1 for line in cluster]) + + min_description_x0 = min( + [ + line.rect.x0 - 0.01 * line.rect.width for line in cluster + ] # How did we determine the 0.01? Should it be a parameter? What would it do if we were to change it? + ) + max_description_x0 = max( + [ + line.rect.x0 + 0.2 * line.rect.width for line in cluster + ] # How did we determine the 0.2? 
Should it be a parameter? What would it do if we were to change it? + ) + good_lines = [ + line + for line in candidate_description + if line.rect.y0 >= best_y0 and line.rect.y1 <= best_y1 + if min_description_x0 < line.rect.x0 < max_description_x0 + ] + best_x0 = min([line.rect.x0 for line in good_lines]) + best_x1 = max([line.rect.x1 for line in good_lines]) + + # expand to include entire last block + def is_below(best_x0, best_y1, line): + return ( + ( + line.rect.x0 > best_x0 - 5 + ) # How did we determine the 5? Should it be a parameter? What would it do if we were to change it? + and (line.rect.x0 < (best_x0 + best_x1) / 2) # noqa B023 + and ( + line.rect.y0 < best_y1 + 10 + ) # How did we determine the 10? Should it be a parameter? What would it do if we were to change it? + and (line.rect.y1 > best_y1) + ) + + continue_search = True + while continue_search: + line = next((line for line in lines if is_below(best_x0, best_y1, line)), None) + if line: + best_x0 = min(best_x0, line.rect.x0) + best_x1 = max(best_x1, line.rect.x1) + best_y1 = line.rect.y1 + else: + continue_search = False + + candidate_rects.append(fitz.Rect(best_x0, best_y0, best_x1, best_y1)) + + if len(candidate_rects) == 0: + return None + if depth_column: + return max(candidate_rects, key=lambda rect: score_column_match(depth_column, rect)) + else: + return candidate_rects[0] + + +def perform_matching(directory: Path, **params: dict) -> dict: + """Perform the matching of text blocks with depth intervals. + + Args: + directory (Path): Path to the directory that contains the pdfs. + **params (dict): Additional parameters for the matching pipeline. + + Returns: + dict: The predictions. + """ + for root, _dirs, files in os.walk(directory): + output = {} + for filename in files: + if filename.endswith(".pdf"): + in_path = os.path.join(root, filename) + logger.info("Processing file: %s", in_path) + output[filename] = {} + + with fitz.Document(in_path) as doc: + for page_index, page in enumerate(doc): + page_number = page_index + 1 + logger.info("Processing page %s", page_number) + + predictions, depths_materials_column_pairs = process_page(page, **params) + + output[filename][f"page_{page_number}"] = { + "layers": predictions, + "depths_materials_column_pairs": depths_materials_column_pairs, + } + return output diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index 1c437d83..029f71ea 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -2,31 +2,17 @@ import json import logging -import math import os from pathlib import Path import click -import fitz from dotenv import load_dotenv from stratigraphy import DATAPATH from stratigraphy.benchmark.score import evaluate_matching -from stratigraphy.line_detection import extract_lines, line_detection_params -from stratigraphy.util import find_depth_columns -from stratigraphy.util.dataclasses import Line -from stratigraphy.util.depthcolumn import DepthColumn -from stratigraphy.util.find_description import get_description_blocks, get_description_lines -from stratigraphy.util.interval import BoundaryInterval, Interval -from stratigraphy.util.line import DepthInterval, TextLine -from stratigraphy.util.textblock import TextBlock, block_distance -from stratigraphy.util.util import ( - flatten, - parse_and_remove_empty_predictions, - read_params, - x_overlap, - x_overlap_significant_smallest, -) +from stratigraphy.extract import perform_matching +from stratigraphy.line_detection import line_detection_params +from stratigraphy.util.util import flatten, 
read_params load_dotenv() @@ -38,471 +24,6 @@ matching_params = read_params("matching_params.yml") -def process_page(page: fitz.Page, **params: dict) -> list[dict]: - """Process a single page of a pdf. - - Finds all descriptions and depth intervals on the page and matches them. - - Args: - page (fitz.Page): The page to process. - **params (dict): Additional parameters for the matching pipeline. - - Returns: - list[dict]: All list of the text of all description blocks. - """ - words = [] - words_by_line = {} - for x0, y0, x1, y1, word, block_no, line_no, _word_no in fitz.utils.get_text(page, "words"): - rect = fitz.Rect(x0, y0, x1, y1) * page.rotation_matrix - depth_interval = DepthInterval(rect, word) - words.append(TextLine([depth_interval])) - key = f"{block_no}_{line_no}" - if key not in words_by_line: - words_by_line[key] = [] - words_by_line[key].append(depth_interval) - - raw_lines = [TextLine(words_by_line[key]) for key in words_by_line] - - lines = [] - current_line_words = [] - for line_index, raw_line in enumerate(raw_lines): - for word_index, word in enumerate(raw_line.words): - remaining_line = TextLine(raw_line.words[word_index:]) - if len(current_line_words) > 0 and remaining_line.is_line_start(lines, raw_lines[line_index + 1 :]): - lines.append(TextLine(current_line_words)) - current_line_words = [] - current_line_words.append(word) - if len(current_line_words): - lines.append(TextLine(current_line_words)) - current_line_words = [] - - depth_column_entries = find_depth_columns.depth_column_entries(words, include_splits=True) - layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words) - - used_entry_rects = [] - for column in layer_depth_columns: - for entry in column.entries: - used_entry_rects.extend([entry.start.rect, entry.end.rect]) - - depth_column_entries = [ - entry - for entry in find_depth_columns.depth_column_entries(words, include_splits=False) - if entry.rect not in used_entry_rects - ] - - depth_columns: list[DepthColumn] = layer_depth_columns - depth_columns.extend(find_depth_columns.find_depth_columns(depth_column_entries, words)) - - pairs = [] - for depth_column in depth_columns: - material_description_rect = find_material_description_column(lines, depth_column) - if material_description_rect: - pairs.append((depth_column, material_description_rect)) - - # lowest score first - pairs.sort(key=lambda pair: score_column_match(pair[0], pair[1], words)) - - to_delete = [] - for i, (_depth_column, material_description_rect) in enumerate(pairs): - for _depth_column_2, material_description_rect_2 in pairs[i + 1 :]: - if material_description_rect.intersects(material_description_rect_2): - to_delete.append(i) - continue - filtered_pairs = [item for index, item in enumerate(pairs) if index not in to_delete] - - geometric_lines = extract_lines(page, line_detection_params) - - groups = [] # list of matched depth intervals and text blocks - # groups is of the form: ["depth_interval": BoundaryInterval, "block": TextBlock] - if len(filtered_pairs): # match depth column items with material description - for depth_column, material_description_rect in filtered_pairs: - description_lines = get_description_lines(lines, material_description_rect) - if len(description_lines) > 1: - new_groups = match_columns( - depth_column, description_lines, geometric_lines, material_description_rect, **params - ) - groups.extend(new_groups) - json_filtered_pairs = [ - { - "depth_column": depth_column.to_json(), - "material_description_rect": [ - 
material_description_rect.x0, - material_description_rect.y0, - material_description_rect.x1, - material_description_rect.y1, - ], - } - for depth_column, material_description_rect in filtered_pairs - ] - - else: - json_filtered_pairs = [] - # Fallback when no depth column was found - material_description_rect = find_material_description_column(lines, depth_column=None) - if material_description_rect: - description_lines = get_description_lines(lines, material_description_rect) - description_blocks = get_description_blocks( - description_lines, - geometric_lines, - material_description_rect, - params["block_line_ratio"], - params["left_line_length_threshold"], - ) - groups.extend([{"block": block} for block in description_blocks]) - json_filtered_pairs.extend( - [ - { - "depth_column": None, - "material_description_rect": [ - material_description_rect.x0, - material_description_rect.y0, - material_description_rect.x1, - material_description_rect.y1, - ], - } - ] - ) - predictions = [ - {"material_description": group["block"].to_json(), "depth_interval": group["depth_interval"].to_json()} - if "depth_interval" in group - else {"material_description": group["block"].to_json()} - for group in groups - ] - predictions = parse_and_remove_empty_predictions(predictions) - return predictions, json_filtered_pairs - - -def score_column_match( - depth_column: DepthColumn, - material_description_rect: fitz.Rect, - all_words: list[TextLine] | None = None, - **params: dict, -) -> float: - """Scores the match between a depth column and a material description. - - Args: - depth_column (DepthColumn): The depth column. - material_description_rect (fitz.Rect): The material description rectangle. - all_words (list[TextLine] | None, optional): List of the available textlines. Defaults to None. - **params (dict): Additional parameters for the matching pipeline. Kept for compatibility with the pipeline. - - Returns: - float: The score of the match. - """ - rect = depth_column.rect() - top = rect.y0 - bottom = rect.y1 - right = rect.x1 - distance = ( - abs(top - material_description_rect.y0) - + abs(bottom - material_description_rect.y1) - + abs(right - material_description_rect.x0) - ) - - height = bottom - top - - noise_count = depth_column.noise_count(all_words) if all_words else 0 - - return (height - distance) * math.pow(0.8, noise_count) - - -def match_columns( - depth_column: DepthColumn, - description_lines: list[TextLine], - geometric_lines: list[Line], - material_description_rect: fitz.Rect, - **params: dict, -) -> list: - """Match the depth column entries with the description lines. - - This function identifies groups of depth intervals and text blocks that are likely to match. - In this process, the number of text blocks is adjusted to match the number of depth intervals. - - Args: - depth_column (DepthColumn): The depth column. - description_lines (list[TextLine]): The description lines. - geometric_lines (list[Line]): The geometric lines. - material_description_rect (fitz.Rect): The material description rectangle. - **params (dict): Additional parameters for the matching pipeline. - - Returns: - list: The matched depth intervals and text blocks. 
- """ - return [ - element - for group in depth_column.identify_groups( - description_lines, geometric_lines, material_description_rect, **params - ) - for element in transform_groups(group["depth_intervals"], group["blocks"], **params) - ] - - -def transform_groups( - depth_intervals: list[Interval], blocks: list[TextBlock], **params: dict -) -> list[dict[str, Interval | TextBlock]]: - """Transforms the text blocks such that their number equals the number of depth intervals. - - If there are more depth intervals than text blocks, text blocks are splitted. When there - are more text blocks than depth intervals, text blocks are merged. If the number of text blocks - and depth intervals equals, we proceed with the pairing. - - Args: - depth_intervals (List[Interval]): The depth intervals from the pdf. - blocks (List[TextBlock]): Found textblocks from the pdf. - **params (dict): Additional parameters for the matching pipeline. - - Returns: - List[Dict[str, Union[Interval, TextBlock]]]: Pairing of text blocks and depth intervals. - """ - if len(depth_intervals) == 0: - return [] - elif len(depth_intervals) == 1: - concatenated_block = TextBlock( - [line for block in blocks for line in block.lines] - ) # concatenate all text lines within a block; line separation flag does not matter here. - return [{"depth_interval": depth_intervals[0], "block": concatenated_block}] - else: - if len(blocks) < len(depth_intervals): - blocks = split_blocks_by_textline_length(blocks, target_split_count=len(depth_intervals) - len(blocks)) - - if len(blocks) > len(depth_intervals): - # create additional depth intervals with end & start value None to match the number of blocks - depth_intervals.extend([BoundaryInterval(None, None) for _ in range(len(blocks) - len(depth_intervals))]) - - return [ - {"depth_interval": depth_interval, "block": block} - for depth_interval, block in zip(depth_intervals, blocks, strict=False) - ] - - -def merge_blocks_by_vertical_spacing(blocks: list[TextBlock], target_merge_count: int) -> list[TextBlock]: - """Merge textblocks without any geometric lines that separates them. - - Note: Deprecated. Currently not in use any more. Kept here until we are sure that it is not needed anymore. - - The logic looks at the distances between the textblocks and merges them if they are closer - than a certain cutoff. - - Args: - blocks (List[TextBlock]): Textblocks that are to be merged. - target_merge_count (int): the number of merges that we'd like to happen (i.e. we'd like the total number of - blocks to be reduced by this number) - - Returns: - List[TextBlock]: The merged textblocks. 
- """ - distances = [] - for block_index in range(len(blocks) - 1): - distances.append(block_distance(blocks[block_index], blocks[block_index + 1])) - cutoff = sorted(distances)[target_merge_count - 1] # merge all blocks that have a distance smaller than this - merged_count = 0 - merged_blocks = [] - current_merged_block = blocks[0] - for block_index in range(len(blocks) - 1): - new_block = blocks[block_index + 1] - if ( - merged_count < target_merge_count - and block_distance(blocks[block_index], blocks[block_index + 1]) <= cutoff - ): - current_merged_block = current_merged_block.concatenate(new_block) - merged_count += 1 - else: - merged_blocks.append(current_merged_block) - current_merged_block = new_block - - if len(current_merged_block.lines): - merged_blocks.append(current_merged_block) - return merged_blocks - - -def split_blocks_by_textline_length(blocks: list[TextBlock], target_split_count: int) -> list[TextBlock]: - """Split textblocks without any geometric lines that separates them. - - The logic looks at the lengths of the text lines and cuts them off - if there are textlines that are shorter than others. - # TODO: Extend documentation about logic. - - Args: - blocks (List[TextBlock]): Textblocks that are to be split. - target_split_count (int): the number of splits that we'd like to happen (i.e. we'd like the total number of - blocks to be increased by this number) - - Returns: - List[TextBlock]: The split textblocks. - """ - line_lengths = sorted([line.rect.x1 for block in blocks for line in block.lines[:-1]]) - if len(line_lengths) <= target_split_count: # In that case each line is a block - return [TextBlock([line]) for block in blocks for line in block.lines] - else: - cutoff_values = line_lengths[:target_split_count] # all lines inside cutoff_values will be split line - split_blocks = [] - current_block_lines = [] - for block in blocks: - for line_index in range(block.line_count): - line = block.lines[line_index] - current_block_lines.append(line) - if line_index < block.line_count - 1 and line.rect.x1 in cutoff_values: - split_blocks.append(TextBlock(current_block_lines)) - cutoff_values.remove(line.rect.x1) - current_block_lines = [] - if len(current_block_lines): - split_blocks.append(TextBlock(current_block_lines)) - current_block_lines = [] - if ( - block.is_terminated_by_line - ): # If block was terminated by a line, populate the flag to the last element of split_blocks. - split_blocks[-1].is_terminated_by_line = True - return split_blocks - - -def find_material_description_column( - lines: list[TextLine], depth_column: DepthColumn, **params: dict -) -> fitz.Rect | None: - """Find the material description column given a depth column. - - Args: - lines (list[TextLine]): The text lines of the page. - depth_column (DepthColumn): The depth column. - **params (dict): Additional parameters for the matching pipeline. - - Returns: - fitz.Rect | None: The material description column. 
- """ - if depth_column: - above_depth_column = [ - line - for line in lines - if x_overlap(line.rect, depth_column.rect()) and line.rect.y0 < depth_column.rect().y0 - ] - - min_y0 = max(line.rect.y0 for line in above_depth_column) if len(above_depth_column) else -1 - - def check_y0_condition(y0): - return y0 > min_y0 and y0 < depth_column.rect().y1 - else: - - def check_y0_condition(y0): - return True - - candidate_description = [line for line in lines if check_y0_condition(line.rect.y0)] - is_description = [line for line in candidate_description if line.is_description] - - if len(candidate_description) == 0: - return - - description_clusters = [] - while len(is_description) > 0: - coverage_by_generating_line = [ - [other for other in is_description if x_overlap_significant_smallest(line.rect, other.rect, 0.5)] - for line in is_description - ] - - def filter_coverage(coverage): - if len(coverage): - min_x0 = min(line.rect.x0 for line in coverage) - max_x1 = max(line.rect.x1 for line in coverage) - x0_threshold = max_x1 - 0.4 * ( - max_x1 - min_x0 - ) # how did we determine the 0.4? Should it be a parameter? What would it do if we were to change it? - return [line for line in coverage if line.rect.x0 < x0_threshold] - else: - return [] - - coverage_by_generating_line = [filter_coverage(coverage) for coverage in coverage_by_generating_line] - max_coverage = max(coverage_by_generating_line, key=len) - description_clusters.append(max_coverage) - is_description = [line for line in is_description if line not in max_coverage] - - candidate_rects = [] - - for cluster in description_clusters: - best_y0 = min([line.rect.y0 for line in cluster]) - best_y1 = max([line.rect.y1 for line in cluster]) - - min_description_x0 = min( - [ - line.rect.x0 - 0.01 * line.rect.width for line in cluster - ] # How did we determine the 0.01? Should it be a parameter? What would it do if we were to change it? - ) - max_description_x0 = max( - [ - line.rect.x0 + 0.2 * line.rect.width for line in cluster - ] # How did we determine the 0.2? Should it be a parameter? What would it do if we were to change it? - ) - good_lines = [ - line - for line in candidate_description - if line.rect.y0 >= best_y0 and line.rect.y1 <= best_y1 - if min_description_x0 < line.rect.x0 < max_description_x0 - ] - best_x0 = min([line.rect.x0 for line in good_lines]) - best_x1 = max([line.rect.x1 for line in good_lines]) - - # expand to include entire last block - def is_below(best_x0, best_y1, line): - return ( - ( - line.rect.x0 > best_x0 - 5 - ) # How did we determine the 5? Should it be a parameter? What would it do if we were to change it? - and (line.rect.x0 < (best_x0 + best_x1) / 2) # noqa B023 - and ( - line.rect.y0 < best_y1 + 10 - ) # How did we determine the 10? Should it be a parameter? What would it do if we were to change it? - and (line.rect.y1 > best_y1) - ) - - continue_search = True - while continue_search: - line = next((line for line in lines if is_below(best_x0, best_y1, line)), None) - if line: - best_x0 = min(best_x0, line.rect.x0) - best_x1 = max(best_x1, line.rect.x1) - best_y1 = line.rect.y1 - else: - continue_search = False - - candidate_rects.append(fitz.Rect(best_x0, best_y0, best_x1, best_y1)) - - if len(candidate_rects) == 0: - return None - if depth_column: - return max(candidate_rects, key=lambda rect: score_column_match(depth_column, rect)) - else: - return candidate_rects[0] - - -def perform_matching(directory: Path, **params: dict) -> dict: - """Perform the matching of text blocks with depth intervals. 
- - Args: - directory (Path): Path to the directory that contains the pdfs. - **params (dict): Additional parameters for the matching pipeline. - - Returns: - dict: The predictions. - """ - for root, _dirs, files in os.walk(directory): - output = {} - for filename in files: - if filename.endswith(".pdf"): - in_path = os.path.join(root, filename) - logger.info("Processing file: %s", in_path) - output[filename] = {} - - with fitz.Document(in_path) as doc: - for page_index, page in enumerate(doc): - page_number = page_index + 1 - logger.info("Processing page %s", page_number) - - predictions, depths_materials_column_pairs = process_page(page, **params) - - output[filename][f"page_{page_number}"] = { - "layers": predictions, - "depths_materials_column_pairs": depths_materials_column_pairs, - } - return output - - @click.command() @click.option( "--input_directory", From 1620bfe9df6467f94fc2c9e4785bbaf67eaa7d61 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Wed, 3 Apr 2024 10:07:23 +0200 Subject: [PATCH 03/19] Allow CLI to specify if bounding boxes and lines are drawn. Allow to run pipeline on individual files. --- src/stratigraphy/benchmark/score.py | 6 +++-- src/stratigraphy/line_detection.py | 25 +++++++++++++++++ src/stratigraphy/main.py | 42 +++++++++++++++++++++++------ 3 files changed, 63 insertions(+), 10 deletions(-) diff --git a/src/stratigraphy/benchmark/score.py b/src/stratigraphy/benchmark/score.py index ab254b7b..7a552091 100644 --- a/src/stratigraphy/benchmark/score.py +++ b/src/stratigraphy/benchmark/score.py @@ -57,7 +57,7 @@ def f1(precision: float, recall: float) -> float: def evaluate_matching( - predictions_path: Path, ground_truth_path: Path, directory: Path, out_directory: Path + predictions_path: Path, ground_truth_path: Path, directory: Path, out_directory: Path, skip_draw_predictions: bool ) -> tuple[dict, pd.DataFrame]: """Calculate F1, precision and recall for the predictions. @@ -69,6 +69,7 @@ def evaluate_matching( ground_truth_path (Path): Path to the ground truth annotated data. directory (Path): Path to the directory containing the pdf files. out_directory (Path): Path to the directory where the evaluation images should be saved. + skip_draw_predictions (bool): Whether to draw the predictions on the pdf pages. Returns: tuple[dict, pd.DataFrame]: A tuple containing the overall F1, precision and recall as a dictionary and the @@ -80,7 +81,8 @@ def evaluate_matching( predictions, number_of_truth_values = _add_ground_truth_to_predictions(predictions, ground_truth) - draw_predictions(predictions, directory, out_directory) + if not skip_draw_predictions: + draw_predictions(predictions, directory, out_directory) document_level_metrics = { "document_name": [], diff --git a/src/stratigraphy/line_detection.py b/src/stratigraphy/line_detection.py index d208712e..e99e309c 100644 --- a/src/stratigraphy/line_detection.py +++ b/src/stratigraphy/line_detection.py @@ -1,6 +1,7 @@ """Script for line detection in pdf pages.""" import os +from pathlib import Path import cv2 import fitz @@ -88,6 +89,30 @@ def extract_lines(page: fitz.Page, line_detection_params: dict) -> list[Line]: return lines +def draw_lines_on_pdfs(input_directory: Path, line_detection_params: dict): + """Draw lines on pdf pages and stores them as artifacts in mlflow. + + Args: + input_directory (Path): The directory containing the pdf files. + line_detection_params (dict): The parameters for the line detection algorithm. 
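After this refactor `main.py` only orchestrates I/O and evaluation, while the page-level matching logic lives in `stratigraphy.extract`. A minimal sketch of driving the extracted module directly (file name illustrative, taken from the benchmark set):

    import fitz

    from stratigraphy.extract import process_page
    from stratigraphy.util.util import read_params

    matching_params = read_params("matching_params.yml")
    with fitz.Document("data/Benchmark/270124083-bp.pdf") as doc:
        for page in doc:
            # layers: material descriptions, with a depth interval where one was matched
            # pairs: the depth column / material description rectangles used for matching
            layers, pairs = process_page(page, **matching_params)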
+ """ + for root, _dirs, files in os.walk(input_directory): + output = {} + for filename in files: + if filename.endswith(".pdf"): + in_path = os.path.join(root, filename) + output[filename] = {} + + with fitz.Document(in_path) as doc: + for page_index, page in enumerate(doc): + lines = extract_lines(page, line_detection_params) + img = plot_lines(page, lines, scale_factor=line_detection_params["pdf_scale_factor"]) + if mlflow_tracking: + import mlflow + + mlflow.log_image(img, f"pages/{filename}_page_{page_index}_lines.png") + + if __name__ == "__main__": # Some test pdfs selected_pdfs = [ diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index 029f71ea..ab638cba 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -3,6 +3,7 @@ import json import logging import os +import shutil from pathlib import Path import click @@ -11,7 +12,7 @@ from stratigraphy import DATAPATH from stratigraphy.benchmark.score import evaluate_matching from stratigraphy.extract import perform_matching -from stratigraphy.line_detection import line_detection_params +from stratigraphy.line_detection import draw_lines_on_pdfs, line_detection_params from stratigraphy.util.util import flatten, read_params load_dotenv() @@ -26,6 +27,7 @@ @click.command() @click.option( + "-i", "--input_directory", type=click.Path(exists=True, path_type=Path), default=DATAPATH / "Benchmark", @@ -49,14 +51,25 @@ default=DATAPATH / "Benchmark" / "extract" / "predictions.json", help="Path to the predictions file.", ) -def start_pipeline(input_directory: Path, ground_truth_path: Path, out_directory: Path, predictions_path: Path): +@click.option("-s", "--skip-draw-predictions", is_flag=True, default=False, help="Draw predictions on pdf pages.") +@click.option("-l", "--draw-lines", is_flag=True, default=False, help="Draw lines on pdf pages.") +def start_pipeline( + input_directory: Path, + ground_truth_path: Path, + out_directory: Path, + predictions_path: Path, + skip_draw_predictions: bool = False, + draw_lines: bool = False, +): """Description. Args: - input_directory (Path): _description_ - ground_truth_path (Path): _description_ - out_directory (Path): _description_ - predictions_path (Path): _description_ + input_directory (Path): The directory containing the pdf files. + ground_truth_path (Path): The path to the ground truth file. + out_directory (Path): The directory to store the evaluation results. + predictions_path (Path): The path to the predictions file. + skip_draw_predictions (bool, optional): Whether to skip drawing predictions on pdf pages. Defaults to False. + draw_lines (bool, optional): Whether to draw lines on pdf pages. Defaults to False. """ if mlflow_tracking: import mlflow @@ -67,11 +80,20 @@ def start_pipeline(input_directory: Path, ground_truth_path: Path, out_directory mlflow.log_params(flatten(matching_params)) temp_directory = DATAPATH / "_temp" # temporary directory to dump files for mlflow artifact logging - # check if directories exist and create them when neccessary + # check if directories exist and create them when neccessary out_directory.mkdir(parents=True, exist_ok=True) temp_directory.mkdir(parents=True, exist_ok=True) + # if a file is specified instead of an input directory, copy the file to a temporary directory and work with that. 
+ if input_directory.is_file(): + if (temp_directory / "single_file").is_dir(): + shutil.rmtree(temp_directory / "single_file") + + Path.mkdir(temp_directory / "single_file") + shutil.copy(input_directory, temp_directory / "single_file") + input_directory = temp_directory / "single_file" + # run the matching pipeline and save the result predictions = perform_matching(input_directory, **matching_params) with open(predictions_path, "w") as file: @@ -79,7 +101,7 @@ def start_pipeline(input_directory: Path, ground_truth_path: Path, out_directory # evaluate the predictions metrics, document_level_metrics = evaluate_matching( - predictions_path, ground_truth_path, input_directory, out_directory + predictions_path, ground_truth_path, input_directory, out_directory, skip_draw_predictions ) document_level_metrics.to_csv(temp_directory / "document_level_metrics.csv") # mlflow.log_artifact expects a file @@ -87,6 +109,10 @@ def start_pipeline(input_directory: Path, ground_truth_path: Path, out_directory mlflow.log_metrics(metrics) mlflow.log_artifact(temp_directory / "document_level_metrics.csv") + if draw_lines: + logger.info("Drawing lines on pdf pages.") + draw_lines_on_pdfs(input_directory, line_detection_params=line_detection_params) + if __name__ == "__main__": start_pipeline() From b0c49a6be43ef4f6a8df2caee438fe5faa0c9d33 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Wed, 3 Apr 2024 10:15:04 +0200 Subject: [PATCH 04/19] Update docstring and help for click commands. --- src/stratigraphy/main.py | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index ab638cba..1670a4ac 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -34,25 +34,36 @@ help="Path to the input directory.", ) @click.option( + "-g", "--ground_truth_path", type=click.Path(exists=True, path_type=Path), default=DATAPATH / "Benchmark" / "ground_truth.json", help="Path to the ground truth file.", ) @click.option( + "-o", "--out_directory", type=click.Path(path_type=Path), default=DATAPATH / "Benchmark" / "evaluation", help="Path to the output directory.", ) @click.option( + "-p", "--predictions_path", type=click.Path(path_type=Path), default=DATAPATH / "Benchmark" / "extract" / "predictions.json", help="Path to the predictions file.", ) -@click.option("-s", "--skip-draw-predictions", is_flag=True, default=False, help="Draw predictions on pdf pages.") -@click.option("-l", "--draw-lines", is_flag=True, default=False, help="Draw lines on pdf pages.") +@click.option( + "-s", + "--skip-draw-predictions", + is_flag=True, + default=False, + help="Whether to skip drawing the predictions on pdf pages. Defaults to False.", +) +@click.option( + "-l", "--draw-lines", is_flag=True, default=False, help="Whether to draw lines on pdf pages. Defaults to False." +) def start_pipeline( input_directory: Path, ground_truth_path: Path, @@ -61,16 +72,20 @@ def start_pipeline( skip_draw_predictions: bool = False, draw_lines: bool = False, ): - """Description. - - Args: - input_directory (Path): The directory containing the pdf files. - ground_truth_path (Path): The path to the ground truth file. - out_directory (Path): The directory to store the evaluation results. - predictions_path (Path): The path to the predictions file. - skip_draw_predictions (bool, optional): Whether to skip drawing predictions on pdf pages. Defaults to False. - draw_lines (bool, optional): Whether to draw lines on pdf pages. Defaults to False. 
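With these options the pipeline can now be pointed at a single profile and the drawing steps can be toggled, for example (file name illustrative):

    boreholes-extract-materials -i data/Benchmark/270124083-bp.pdf -l

Note the single-file handling: the file is copied into `DATAPATH / "_temp" / "single_file"` and the pipeline then runs on that directory, so the downstream code can keep assuming a directory input. `--draw-lines` is only useful with MLFlow tracking enabled, since the rendered pages are stored as MLFlow image artifacts.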
- """ + """Run the boreholes data extraction pipeline. + + The pipeline will extract material description of all found layers and assign them to the corresponding + depth intervals. The input directory should contain pdf files with boreholes data. The algorithm can deal + with borehole profiles of multiple pages. + + Args:\n + input_directory (Path): The directory containing the pdf files.\n + ground_truth_path (Path): The path to the ground truth file json file.\n + out_directory (Path): The directory to store the evaluation results.\n + predictions_path (Path): The path to the predictions file.\n + skip_draw_predictions (bool, optional): Whether to skip drawing predictions on pdf pages. Defaults to False.\n + draw_lines (bool, optional): Whether to draw lines on pdf pages. Defaults to False.\n + """ # noqa: D301 if mlflow_tracking: import mlflow From d85a936ebd9b895b4f4c76025fe60d2f6744de4b Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Wed, 3 Apr 2024 10:24:04 +0200 Subject: [PATCH 05/19] Update README. --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index fd2d5f81..50764cf6 100644 --- a/README.md +++ b/README.md @@ -40,11 +40,13 @@ To execute the data extraction pipeline, follow these steps: `conda activate boreholes-dev` -2. **Run the main script** +2. **Run the extraction script** - The main script for the extraction pipeline is located at `src/stratigraphy/main.py`. Run this script to start the extraction process. + The main script for the extraction pipeline is located at `src/stratigraphy/main.py`. A cli command is created to run this script. - This script will source all PDFs from the `data/Benchmark` directory and create PNG files in the `data/Benchmark/extract` directory. + Run `boreholes-extract-materials` to run the main extraction script. With the default options, the command will source all PDFs from the `data/Benchmark` directory and create PNG files in the `data/Benchmark/extract` directory. + + Use `boreholes-extract-materials --help` to see all options for the extraction script. 3. **Check the results** From d0612470bf892f721c04e95cdf4fd51701442954 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Wed, 3 Apr 2024 10:27:39 +0200 Subject: [PATCH 06/19] Remove line_detection script --- README.md | 5 +---- src/stratigraphy/line_detection.py | 36 +----------------------------- 2 files changed, 2 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 50764cf6..61155a08 100644 --- a/README.md +++ b/README.md @@ -52,8 +52,6 @@ To execute the data extraction pipeline, follow these steps: Once the script has finished running, you can check the results in the `data/Benchmark/extract` directory. The result is a `predictions.json` file as well as a png file for each page of each PDF in the `data/Benchmark` directory. -Please note that for now the pipeline assumes that all PDF files to be analyzed are placed in the `data/Benchmark` directory. If you want to analyze different files, please place them in this directory. - ### Output Structure The `predictions.json` file contains the results of a data extraction process from PDF files. Each key in the JSON object is the name of a PDF file, and the value is a list of extracted items in a dictionary like object. The extracted items for now are the material descriptions in their correct order (given by their depths). @@ -151,7 +149,7 @@ The project structure and the most important files are as follows: - `src/` : The source code of the project. 
From d0612470bf892f721c04e95cdf4fd51701442954 Mon Sep 17 00:00:00 2001
From: Renato Durrer
Date: Wed, 3 Apr 2024 10:27:39 +0200
Subject: [PATCH 06/19] Remove line_detection script.

---
 README.md                          |  5 +----
 src/stratigraphy/line_detection.py | 36 +-----------------------------
 2 files changed, 2 insertions(+), 39 deletions(-)

diff --git a/README.md b/README.md
index 50764cf6..61155a08 100644
--- a/README.md
+++ b/README.md
@@ -52,8 +52,6 @@ To execute the data extraction pipeline, follow these steps:
 
    Once the script has finished running, you can check the results in the `data/Benchmark/extract` directory. The result is a `predictions.json` file as well as a png file for each page of each PDF in the `data/Benchmark` directory.
 
-Please note that for now the pipeline assumes that all PDF files to be analyzed are placed in the `data/Benchmark` directory. If you want to analyze different files, please place them in this directory.
-
 ### Output Structure
 The `predictions.json` file contains the results of a data extraction process from PDF files. Each key in the JSON object is the name of a PDF file, and the value is a list of extracted items in a dictionary like object. The extracted items for now are the material descriptions in their correct order (given by their depths).
 
@@ -151,7 +149,7 @@ The project structure and the most important files are as follows:
 - `src/` : The source code of the project.
   - `stratigraphy/` : The main package of the project.
     - `main.py` : The main script of the project. This script runs the data extraction pipeline.
-    - `line_detection.py`: This script runs the line detection on provided sample pdfs. Will be deprecated in the future.
+    - `line_detection.py`: Contains functionalities for line detection on pdf pages.
     - `util/` : Utility scripts and modules.
     - `benchmark/` : Scripts to evaluate the data extraction.
 - `data/` : The data used by the project.
@@ -166,7 +164,6 @@ The project structure and the most important files are as follows:
 
 - `main.py` : This is the main script of the project. It runs the data extraction pipeline, which analyzes the PDF files in the `data/Benchmark` directory and saves the results in the `predictions.json` file.
 
-- `line_detection.py` : Runs the line detection algorithm on pdfs using `lsd` from opencv. It is meant to find all lines that potentially separate two material descriptions. It is incorporated in the script `main.py` and will be deprecated as a standalone script in the future.
 
 ## Experiment Tracking
 We perform experiment tracking using MLFlow. Each developer has his own local MLFlow instance.
diff --git a/src/stratigraphy/line_detection.py b/src/stratigraphy/line_detection.py
index e99e309c..6b95b6a3 100644
--- a/src/stratigraphy/line_detection.py
+++ b/src/stratigraphy/line_detection.py
@@ -9,7 +9,6 @@
 from dotenv import load_dotenv
 from numpy.typing import ArrayLike
 
-from stratigraphy import DATAPATH
 from stratigraphy.util.dataclasses import Line
 from stratigraphy.util.geometric_line_utilities import (
     drop_vertical_lines,
@@ -17,7 +16,7 @@
     merge_parallel_lines_efficiently,
 )
 from stratigraphy.util.plot_utils import plot_lines
-from stratigraphy.util.util import flatten, line_from_array, read_params
+from stratigraphy.util.util import line_from_array, read_params
 
 load_dotenv()
 
@@ -111,36 +110,3 @@ def draw_lines_on_pdfs(input_directory: Path, line_detection_params: dict):
                             mlflow.log_image(img, f"pages/{filename}_page_{page_index}_lines.png")
-
-
-if __name__ == "__main__":
-    # Some test pdfs
-    selected_pdfs = [
-        "270124083-bp.pdf",
-        "268124307-bp.pdf",
-        "268125268-bp.pdf",
-        "267125378-bp.pdf",
-        "268124435-bp.pdf",
-        "267123060-bp.pdf",
-        "268124635-bp.pdf",
-        "675230002-bp.pdf",
-        "268125592-bp.pdf",
-        "267124070-bp.pdf",
-        "699248001-bp.pdf",
-    ]
-
-    if mlflow_tracking:
-        import mlflow
-
-        mlflow.set_experiment("LineDetection")
-        mlflow.start_run()
-        mlflow.log_params(flatten(line_detection_params))
-    lines = {}
-    for pdf in selected_pdfs:
-        doc = fitz.open(DATAPATH / "Benchmark" / pdf)
-
-        for page in doc:
-            lines[pdf] = extract_lines(page, line_detection_params)
-            img = plot_lines(page, lines[pdf], scale_factor=line_detection_params["pdf_scale_factor"])
-            if mlflow_tracking:
-                mlflow.log_image(img, f"lines_{pdf}.png")
line_detection_params (dict): The parameters for the line detection algorithm. """ + if not mlflow_tracking: + raise Warning("MLFlow tracking is not enabled. MLFLow is required to store the images.") + import mlflow + for root, _dirs, files in os.walk(input_directory): output = {} for filename in files: @@ -106,7 +110,4 @@ def draw_lines_on_pdfs(input_directory: Path, line_detection_params: dict): for page_index, page in enumerate(doc): lines = extract_lines(page, line_detection_params) img = plot_lines(page, lines, scale_factor=line_detection_params["pdf_scale_factor"]) - if mlflow_tracking: - import mlflow - - mlflow.log_image(img, f"pages/{filename}_page_{page_index}_lines.png") + mlflow.log_image(img, f"pages/{filename}_page_{page_index}_lines.png") From b144d157466fe34b14495bac5bd61b37dcd70584 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Wed, 3 Apr 2024 13:36:02 +0200 Subject: [PATCH 08/19] Improve the way --help prints the docstring. --- src/stratigraphy/main.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index 1670a4ac..c0ff1876 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -78,13 +78,14 @@ def start_pipeline( depth intervals. The input directory should contain pdf files with boreholes data. The algorithm can deal with borehole profiles of multiple pages. - Args:\n - input_directory (Path): The directory containing the pdf files.\n - ground_truth_path (Path): The path to the ground truth file json file.\n - out_directory (Path): The directory to store the evaluation results.\n - predictions_path (Path): The path to the predictions file.\n - skip_draw_predictions (bool, optional): Whether to skip drawing predictions on pdf pages. Defaults to False.\n - draw_lines (bool, optional): Whether to draw lines on pdf pages. Defaults to False.\n + \f + Args: + input_directory (Path): The directory containing the pdf files. + ground_truth_path (Path): The path to the ground truth file json file. + out_directory (Path): The directory to store the evaluation results. + predictions_path (Path): The path to the predictions file. + skip_draw_predictions (bool, optional): Whether to skip drawing predictions on pdf pages. Defaults to False. + draw_lines (bool, optional): Whether to draw lines on pdf pages. Defaults to False. """ # noqa: D301 if mlflow_tracking: import mlflow From 672a8942de838616e0b54cdddcf9e7eae0539360 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Wed, 3 Apr 2024 13:38:20 +0200 Subject: [PATCH 09/19] Rename command boreholes-extract-materials to boreholes-extract layers. 
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index ca0fbde4..dd5f811a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ dependencies = [
 ]
 
 [project.scripts]
-boreholes-extract-materials = "stratigraphy.main:start_pipeline"
+boreholes-extract-layers = "stratigraphy.main:start_pipeline"
 
 [tool.ruff.lint]
 select = [

From 397cae80a08064393b896dd95330337d140b206b Mon Sep 17 00:00:00 2001
From: Renato Durrer
Date: Wed, 3 Apr 2024 14:18:21 +0200
Subject: [PATCH 10/19] Refactor extraction pipeline, part 2.

---
 src/stratigraphy/extract.py        |  6 ++---
 src/stratigraphy/line_detection.py | 24 ++++++------------
 src/stratigraphy/main.py           | 39 ++++++++++++++++++++++++------
 3 files changed, 41 insertions(+), 28 deletions(-)

diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py
index a3b911d1..9cd622ae 100644
--- a/src/stratigraphy/extract.py
+++ b/src/stratigraphy/extract.py
@@ -7,7 +7,6 @@
 
 import fitz
 
-from stratigraphy.line_detection import extract_lines, line_detection_params
 from stratigraphy.util import find_depth_columns
 from stratigraphy.util.dataclasses import Line
 from stratigraphy.util.depthcolumn import DepthColumn
@@ -25,13 +24,14 @@
 logger = logging.getLogger(__name__)
 
 
-def process_page(page: fitz.Page, **params: dict) -> list[dict]:
+def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict]:
     """Process a single page of a pdf.
 
     Finds all descriptions and depth intervals on the page and matches them.
 
     Args:
         page (fitz.Page): The page to process.
+        geometric_lines (list[Line]): The geometric lines of the page.
         **params (dict): Additional parameters for the matching pipeline.
 
     Returns:
@@ -97,8 +97,6 @@ def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict]
             continue
     filtered_pairs = [item for index, item in enumerate(pairs) if index not in to_delete]
 
-    geometric_lines = extract_lines(page, line_detection_params)
-
     groups = []  # list of matched depth intervals and text blocks
     # groups is of the form: ["depth_interval": BoundaryInterval, "block": TextBlock]
     if len(filtered_pairs):  # match depth column items with material description
diff --git a/src/stratigraphy/line_detection.py b/src/stratigraphy/line_detection.py
index 0bb8afc9..98e323f3 100644
--- a/src/stratigraphy/line_detection.py
+++ b/src/stratigraphy/line_detection.py
@@ -1,7 +1,6 @@
 """Script for line detection in pdf pages."""
 
 import os
-from pathlib import Path
 
 import cv2
 import fitz
@@ -88,26 +87,19 @@ def extract_lines(page: fitz.Page, line_detection_params: dict) -> list[Line]:
     return lines
 
 
-def draw_lines_on_pdfs(input_directory: Path, line_detection_params: dict):
+def draw_lines_on_pdfs(filename: str, page: fitz.Page, geometric_lines: list[Line]):
     """Draw lines on pdf pages and stores them as artifacts in mlflow.
 
+    Note: this function may no longer be needed.
+
     Args:
-        input_directory (Path): The directory containing the pdf files.
-        line_detection_params (dict): The parameters for the line detection algorithm.
+        filename (str): The filename of the pdf.
+        page (fitz.Page): The page to draw lines on.
+        geometric_lines (list[Line]): The lines to draw on the pdf page.
     """
     if not mlflow_tracking:
         raise Warning("MLFlow tracking is not enabled. MLFlow is required to store the images.")
     import mlflow
 
-    for root, _dirs, files in os.walk(input_directory):
-        output = {}
-        for filename in files:
-            if filename.endswith(".pdf"):
-                in_path = os.path.join(root, filename)
-                output[filename] = {}
-
-                with fitz.Document(in_path) as doc:
-                    for page_index, page in enumerate(doc):
-                        lines = extract_lines(page, line_detection_params)
-                        img = plot_lines(page, lines, scale_factor=line_detection_params["pdf_scale_factor"])
-                        mlflow.log_image(img, f"pages/{filename}_page_{page_index}_lines.png")
+    img = plot_lines(page, geometric_lines, scale_factor=line_detection_params["pdf_scale_factor"])
+    mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png")
diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py
index c0ff1876..a2d2eb7e 100644
--- a/src/stratigraphy/main.py
+++ b/src/stratigraphy/main.py
@@ -7,12 +7,13 @@
 from pathlib import Path
 
 import click
+import fitz
 from dotenv import load_dotenv
 
 from stratigraphy import DATAPATH
 from stratigraphy.benchmark.score import evaluate_matching
-from stratigraphy.extract import perform_matching
-from stratigraphy.line_detection import draw_lines_on_pdfs, line_detection_params
+from stratigraphy.extract import process_page
+from stratigraphy.line_detection import draw_lines_on_pdfs, extract_lines, line_detection_params
 from stratigraphy.util.util import flatten, read_params
 
 load_dotenv()
@@ -110,8 +111,34 @@ def start_pipeline(
         shutil.copy(input_directory, temp_directory / "single_file")
         input_directory = temp_directory / "single_file"
 
-    # run the matching pipeline and save the result
-    predictions = perform_matching(input_directory, **matching_params)
+    # process the individual pdf files
+    predictions = {}
+    for root, _dirs, files in os.walk(input_directory):
+        for filename in files:
+            if filename.endswith(".pdf"):
+                in_path = os.path.join(root, filename)
+                logger.info("Processing file: %s", in_path)
+                predictions[filename] = {}
+
+                with fitz.Document(in_path) as doc:
+                    for page_index, page in enumerate(doc):
+                        page_number = page_index + 1
+                        logger.info("Processing page %s", page_number)
+
+                        geometric_lines = extract_lines(page, line_detection_params)
+                        layer_predictions, depths_materials_column_pairs = process_page(
+                            page, geometric_lines, **matching_params
+                        )
+
+                        predictions[filename][f"page_{page_number}"] = {
+                            "layers": layer_predictions,
+                            "depths_materials_column_pairs": depths_materials_column_pairs,
+                        }
+
+                        if draw_lines:
+                            logger.info("Drawing lines on pdf pages.")
+                            draw_lines_on_pdfs(filename, page, geometric_lines)
+
 
     with open(predictions_path, "w") as file:
         file.write(json.dumps(predictions))
@@ -125,10 +152,6 @@ def start_pipeline(
         mlflow.log_metrics(metrics)
         mlflow.log_artifact(temp_directory / "document_level_metrics.csv")
 
-    if draw_lines:
-        logger.info("Drawing lines on pdf pages.")
-        draw_lines_on_pdfs(input_directory, line_detection_params=line_detection_params)
-
 
 if __name__ == "__main__":
     start_pipeline()

From 12cde5813a5366a54d9c3af2f64bd70d6a9b2e62 Mon Sep 17 00:00:00 2001
From: Renato Durrer
Date: Wed, 3 Apr 2024 14:39:37 +0200
Subject: [PATCH 11/19] Make single files work without a temporary directory.
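Instead of copying a single input file into a `single_file` subdirectory of
the temporary directory, the pipeline now wraps the file in an iterator that
mimics the `(root, dirs, files)` triples yielded by `os.walk`, so the same
processing loop serves both directories and single pdf files. A minimal,
self-contained sketch of the pattern (the helper name `iter_pdf_files` is
hypothetical and only used for illustration):

    import os
    from pathlib import Path

    def iter_pdf_files(input_path: Path):
        # Hypothetical helper, for illustration only: yield (root, filename)
        # pairs for a directory or for a single pdf file.
        if input_path.is_file():
            # Wrap the single file in the same (root, dirs, files) shape that os.walk() yields.
            file_iterator = [(str(input_path.parent), [], [input_path.name])]
        else:
            file_iterator = os.walk(input_path)
        for root, _dirs, files in file_iterator:
            for filename in files:
                if filename.endswith(".pdf"):
                    yield root, filename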
--- src/stratigraphy/main.py | 13 ++++--------- src/stratigraphy/util/draw.py | 2 ++ 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index a2d2eb7e..03005d17 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -3,7 +3,6 @@ import json import logging import os -import shutil from pathlib import Path import click @@ -104,16 +103,12 @@ def start_pipeline( # if a file is specified instead of an input directory, copy the file to a temporary directory and work with that. if input_directory.is_file(): - if (temp_directory / "single_file").is_dir(): - shutil.rmtree(temp_directory / "single_file") - - Path.mkdir(temp_directory / "single_file") - shutil.copy(input_directory, temp_directory / "single_file") - input_directory = temp_directory / "single_file" - + file_iterator = [(input_directory.parent, None, [input_directory.name])] + else: + file_iterator = os.walk(input_directory) # process the individual pdf files predictions = {} - for root, _dirs, files in os.walk(input_directory): + for root, _dirs, files in file_iterator: for filename in files: if filename.endswith(".pdf"): in_path = os.path.join(root, filename) diff --git a/src/stratigraphy/util/draw.py b/src/stratigraphy/util/draw.py index 388d60b6..948bd430 100644 --- a/src/stratigraphy/util/draw.py +++ b/src/stratigraphy/util/draw.py @@ -35,6 +35,8 @@ def draw_predictions(predictions: dict, directory: Path, out_directory: Path) -> directory (Path): Path to the directory containing the pdf files. out_directory (Path): Path to the output directory where the images are saved. """ + if directory.is_file(): # deal with the case when we pass a file instead of a directory + directory = directory.parent for file in predictions: logger.info(f"Evaluating {file}.") with fitz.Document(directory / file) as doc: From b549d0ff91e647194fb23afdb7c86a94d444b40e Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Thu, 4 Apr 2024 08:11:26 +0200 Subject: [PATCH 12/19] suggestion for rectangle correction with lines. 
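The heuristic: a long, solid, roughly horizontal line just above the detected
material description rectangle usually marks the true upper boundary of the
description column, so the rectangle is extended upwards to that line. A
self-contained toy illustration of the candidate test used below (`Point` and
`Line` mirror `stratigraphy.util.dataclasses`; `is_boundary_candidate` is a
hypothetical name, and this is a sketch rather than the shipped code):

    from dataclasses import dataclass

    @dataclass
    class Point:
        x: float
        y: float

    @dataclass
    class Line:
        start: Point
        end: Point

    def is_boundary_candidate(line: Line, rect_width: float, rect_top: float) -> bool:
        # Sketch: a line qualifies if it is long (> 70% of the rectangle width)
        # and its vertical midpoint lies at or just above the rectangle's
        # current top edge (5 pt tolerance).
        length = ((line.end.x - line.start.x) ** 2 + (line.end.y - line.start.y) ** 2) ** 0.5
        mid_y = (line.start.y + line.end.y) / 2
        return length > 0.7 * rect_width and mid_y < rect_top + 5

    print(is_boundary_candidate(Line(Point(0, 95), Point(80, 95)), rect_width=100, rect_top=100))  # True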
--- src/stratigraphy/extract.py | 42 ++++++++++++++++++++++++++++ src/stratigraphy/util/dataclasses.py | 3 ++ 2 files changed, 45 insertions(+) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 9cd622ae..5e312730 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -84,6 +84,9 @@ def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict] for depth_column in depth_columns: material_description_rect = find_material_description_column(lines, depth_column) if material_description_rect: + material_description_rect = adjust_material_description_rect( + material_description_rect, page.rect.width, geometric_lines + ) pairs.append((depth_column, material_description_rect)) # lowest score first @@ -125,6 +128,9 @@ def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict] # Fallback when no depth column was found material_description_rect = find_material_description_column(lines, depth_column=None) if material_description_rect: + material_description_rect = adjust_material_description_rect( + material_description_rect, page.rect.width, geometric_lines + ) description_lines = get_description_lines(lines, material_description_rect) description_blocks = get_description_blocks( description_lines, @@ -456,6 +462,42 @@ def is_below(best_x0, best_y1, line): return candidate_rects[0] +def adjust_material_description_rect( + material_description_rect: fitz.Rect, page_width: float, geometric_lines: list[Line] +) -> fitz.Rect: + """Adjust the material description rectangle based on the geometric lines. + + Tries to adjust the material description rectangle to the top until a long solid line is reached. + + Args: + material_description_rect (fitz.Rect): _description_ + page_width (float): _description_ + geometric_lines (list[Line]): _description_ + + Returns: + fitz.Rect: _description_ + """ + material_description_rect_top = material_description_rect.y0 + max_line_y = 0 + for line in geometric_lines: + if ( + line.length() > 0.7 * material_description_rect.width + and (line.start.y + line.end.y) / 2 < material_description_rect_top + 5 + ): + max_line_y = max(max_line_y, (line.start.y + line.end.y) / 2) - 5 + + if max_line_y > material_description_rect_top: + new_coordinates = [ + material_description_rect.x0, + max_line_y, + material_description_rect.x1, + material_description_rect.y1, + ] + return fitz.Rect(*new_coordinates) + else: + return material_description_rect + + def perform_matching(directory: Path, **params: dict) -> dict: """Perform the matching of text blocks with depth intervals. diff --git a/src/stratigraphy/util/dataclasses.py b/src/stratigraphy/util/dataclasses.py index 48436128..b787743b 100644 --- a/src/stratigraphy/util/dataclasses.py +++ b/src/stratigraphy/util/dataclasses.py @@ -47,6 +47,9 @@ def distance_to(self, point: Point) -> float: - (self.start.x - point.x) * (self.end.y - self.start.y) ) / np.sqrt((self.end.x - self.start.x) ** 2 + (self.end.y - self.start.y) ** 2) + def length(self) -> float: + return np.sqrt((self.end.x - self.start.x) ** 2 + (self.end.y - self.start.y) ** 2) + def slope(self) -> float: return (self.end.y - self.start.y) / (self.end.x - self.start.x) if self.end.x - self.start.x != 0 else np.inf From 1edbb7975b39a389eefefaad8676f149acdeb05b Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Thu, 4 Apr 2024 14:24:29 +0200 Subject: [PATCH 13/19] refactoring evaluate_matching. 
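Separates data preparation from scoring: `evaluate_matching` no longer reads
`predictions.json`, merges the ground truth, or draws the predictions on pdf
pages; it now takes the in-memory predictions together with the precomputed
number of ground-truth values per file and only computes the metrics. The
previously private `_add_ground_truth_to_predictions` becomes the public
`add_ground_truth_to_predictions`, `draw_lines_on_pdfs` is renamed to
`draw_lines_on_page` to match its per-page signature, and the caller in
`main.py` now composes these steps explicitly, as the diff below shows.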
--- src/stratigraphy/benchmark/score.py | 30 ++++++++--------------------- src/stratigraphy/line_detection.py | 2 +- src/stratigraphy/main.py | 16 +++++++++------ 3 files changed, 19 insertions(+), 29 deletions(-) diff --git a/src/stratigraphy/benchmark/score.py b/src/stratigraphy/benchmark/score.py index 7a552091..a655bc7e 100644 --- a/src/stratigraphy/benchmark/score.py +++ b/src/stratigraphy/benchmark/score.py @@ -1,6 +1,5 @@ """Evaluate the predictions against the ground truth.""" -import json import logging import os from pathlib import Path @@ -9,7 +8,6 @@ from dotenv import load_dotenv from stratigraphy import DATAPATH from stratigraphy.benchmark.ground_truth import GroundTruth -from stratigraphy.util.draw import draw_predictions from stratigraphy.util.util import parse_text load_dotenv() @@ -56,34 +54,20 @@ def f1(precision: float, recall: float) -> float: return 0 -def evaluate_matching( - predictions_path: Path, ground_truth_path: Path, directory: Path, out_directory: Path, skip_draw_predictions: bool -) -> tuple[dict, pd.DataFrame]: +def evaluate_matching(predictions: dict, number_of_truth_values: dict) -> tuple[dict, pd.DataFrame]: """Calculate F1, precision and recall for the predictions. Calculate F1, precision and recall for the individual documents as well as overall. The individual document metrics are returned as a DataFrame. Args: - predictions_path (Path): Path to the predictions.json file. - ground_truth_path (Path): Path to the ground truth annotated data. - directory (Path): Path to the directory containing the pdf files. - out_directory (Path): Path to the directory where the evaluation images should be saved. - skip_draw_predictions (bool): Whether to draw the predictions on the pdf pages. + predictions (dict): The predictions. + number_of_truth_values (dict): The number of ground truth values per file. Returns: tuple[dict, pd.DataFrame]: A tuple containing the overall F1, precision and recall as a dictionary and the individual document metrics as a DataFrame. """ - ground_truth = GroundTruth(ground_truth_path) - with open(predictions_path) as in_file: - predictions = json.load(in_file) - - predictions, number_of_truth_values = _add_ground_truth_to_predictions(predictions, ground_truth) - - if not skip_draw_predictions: - draw_predictions(predictions, directory, out_directory) - document_level_metrics = { "document_name": [], "F1": [], @@ -137,16 +121,18 @@ def evaluate_matching( }, pd.DataFrame(document_level_metrics) -def _add_ground_truth_to_predictions(predictions: dict, ground_truth: GroundTruth) -> (dict, dict): +def add_ground_truth_to_predictions(predictions: dict, ground_truth_path: Path) -> tuple[dict, dict]: """Add the ground truth to the predictions. Args: predictions (dict): The predictions. - ground_truth (GroundTruth): The ground truth. + ground_truth_path (Path): The path to the ground truth file. Returns: - (dict, dict): The predictions with the ground truth added, and the number of ground truth values per file. + tuple[dict, dict]: The predictions with the ground truth added, and the number of ground truth values per file. 
""" + ground_truth = GroundTruth(ground_truth_path) + number_of_truth_values = {} for file, file_predictions in predictions.items(): ground_truth_for_file = ground_truth.for_file(file) diff --git a/src/stratigraphy/line_detection.py b/src/stratigraphy/line_detection.py index 98e323f3..a5870cff 100644 --- a/src/stratigraphy/line_detection.py +++ b/src/stratigraphy/line_detection.py @@ -87,7 +87,7 @@ def extract_lines(page: fitz.Page, line_detection_params: dict) -> list[Line]: return lines -def draw_lines_on_pdfs(filename: str, page: fitz.Page, geometric_lines: list[Line]): +def draw_lines_on_page(filename: str, page: fitz.Page, geometric_lines: list[Line]): """Draw lines on pdf pages and stores them as artifacts in mlflow. Note: now the function draw_lines_on_pdfs may not even be needed any more. diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index 03005d17..e61c7222 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -10,9 +10,10 @@ from dotenv import load_dotenv from stratigraphy import DATAPATH -from stratigraphy.benchmark.score import evaluate_matching +from stratigraphy.benchmark.score import add_ground_truth_to_predictions, evaluate_matching from stratigraphy.extract import process_page -from stratigraphy.line_detection import draw_lines_on_pdfs, extract_lines, line_detection_params +from stratigraphy.line_detection import draw_lines_on_page, extract_lines, line_detection_params +from stratigraphy.util.draw import draw_predictions from stratigraphy.util.util import flatten, read_params load_dotenv() @@ -132,15 +133,18 @@ def start_pipeline( if draw_lines: logger.info("Drawing lines on pdf pages.") - draw_lines_on_pdfs(filename, page, geometric_lines) + draw_lines_on_page(filename, page, geometric_lines) with open(predictions_path, "w") as file: file.write(json.dumps(predictions)) # evaluate the predictions - metrics, document_level_metrics = evaluate_matching( - predictions_path, ground_truth_path, input_directory, out_directory, skip_draw_predictions - ) + predictions, number_of_truth_values = add_ground_truth_to_predictions(predictions, ground_truth_path) + + if not skip_draw_predictions: + draw_predictions(predictions, input_directory, out_directory) + + metrics, document_level_metrics = evaluate_matching(predictions, number_of_truth_values) document_level_metrics.to_csv(temp_directory / "document_level_metrics.csv") # mlflow.log_artifact expects a file if mlflow_tracking: From ebe0bac5ba58fdb4e59f41792eaaf5486a42f5bc Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Thu, 4 Apr 2024 14:27:10 +0200 Subject: [PATCH 14/19] Undo mistaken commit. 
--- src/stratigraphy/extract.py | 42 ---------------------------- src/stratigraphy/util/dataclasses.py | 3 -- 2 files changed, 45 deletions(-) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 5e312730..9cd622ae 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -84,9 +84,6 @@ def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict] for depth_column in depth_columns: material_description_rect = find_material_description_column(lines, depth_column) if material_description_rect: - material_description_rect = adjust_material_description_rect( - material_description_rect, page.rect.width, geometric_lines - ) pairs.append((depth_column, material_description_rect)) # lowest score first @@ -128,9 +125,6 @@ def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict] # Fallback when no depth column was found material_description_rect = find_material_description_column(lines, depth_column=None) if material_description_rect: - material_description_rect = adjust_material_description_rect( - material_description_rect, page.rect.width, geometric_lines - ) description_lines = get_description_lines(lines, material_description_rect) description_blocks = get_description_blocks( description_lines, @@ -462,42 +456,6 @@ def is_below(best_x0, best_y1, line): return candidate_rects[0] -def adjust_material_description_rect( - material_description_rect: fitz.Rect, page_width: float, geometric_lines: list[Line] -) -> fitz.Rect: - """Adjust the material description rectangle based on the geometric lines. - - Tries to adjust the material description rectangle to the top until a long solid line is reached. - - Args: - material_description_rect (fitz.Rect): _description_ - page_width (float): _description_ - geometric_lines (list[Line]): _description_ - - Returns: - fitz.Rect: _description_ - """ - material_description_rect_top = material_description_rect.y0 - max_line_y = 0 - for line in geometric_lines: - if ( - line.length() > 0.7 * material_description_rect.width - and (line.start.y + line.end.y) / 2 < material_description_rect_top + 5 - ): - max_line_y = max(max_line_y, (line.start.y + line.end.y) / 2) - 5 - - if max_line_y > material_description_rect_top: - new_coordinates = [ - material_description_rect.x0, - max_line_y, - material_description_rect.x1, - material_description_rect.y1, - ] - return fitz.Rect(*new_coordinates) - else: - return material_description_rect - - def perform_matching(directory: Path, **params: dict) -> dict: """Perform the matching of text blocks with depth intervals. 
diff --git a/src/stratigraphy/util/dataclasses.py b/src/stratigraphy/util/dataclasses.py index b787743b..48436128 100644 --- a/src/stratigraphy/util/dataclasses.py +++ b/src/stratigraphy/util/dataclasses.py @@ -47,9 +47,6 @@ def distance_to(self, point: Point) -> float: - (self.start.x - point.x) * (self.end.y - self.start.y) ) / np.sqrt((self.end.x - self.start.x) ** 2 + (self.end.y - self.start.y) ** 2) - def length(self) -> float: - return np.sqrt((self.end.x - self.start.x) ** 2 + (self.end.y - self.start.y) ** 2) - def slope(self) -> float: return (self.end.y - self.start.y) / (self.end.x - self.start.x) if self.end.x - self.start.x != 0 else np.inf From 2166ad13e1bb761e374af1d9d25878ff90ec6f19 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Thu, 4 Apr 2024 15:19:39 +0200 Subject: [PATCH 15/19] Minor refactoring --- src/stratigraphy/main.py | 1 - src/stratigraphy/util/draw.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index e61c7222..f7213a6f 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -130,7 +130,6 @@ def start_pipeline( "layers": layer_predictions, "depths_materials_column_pairs": depths_materials_column_pairs, } - if draw_lines: logger.info("Drawing lines on pdf pages.") draw_lines_on_page(filename, page, geometric_lines) diff --git a/src/stratigraphy/util/draw.py b/src/stratigraphy/util/draw.py index 948bd430..c5a814f4 100644 --- a/src/stratigraphy/util/draw.py +++ b/src/stratigraphy/util/draw.py @@ -31,14 +31,13 @@ def draw_predictions(predictions: dict, directory: Path, out_directory: Path) -> - Assignments of material description text blocks to depth intervals (if available) Args: - predictions (dict): Content of the predictions.json file.. + predictions (dict): Content of the predictions.json file. directory (Path): Path to the directory containing the pdf files. out_directory (Path): Path to the output directory where the images are saved. """ if directory.is_file(): # deal with the case when we pass a file instead of a directory directory = directory.parent for file in predictions: - logger.info(f"Evaluating {file}.") with fitz.Document(directory / file) as doc: for page_index, page in enumerate(doc): page_number = page_index + 1 From d32976b11860d0dcb12cbc3d5163f9dea9a5c81c Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Fri, 5 Apr 2024 10:33:09 +0200 Subject: [PATCH 16/19] remove unused function. Remove draw_lines and implement it in main.py directly. --- src/stratigraphy/extract.py | 34 ------------------------------ src/stratigraphy/line_detection.py | 19 ----------------- src/stratigraphy/main.py | 16 ++++++++++---- 3 files changed, 12 insertions(+), 57 deletions(-) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 9cd622ae..cfd44633 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -2,8 +2,6 @@ import logging import math -import os -from pathlib import Path import fitz @@ -454,35 +452,3 @@ def is_below(best_x0, best_y1, line): return max(candidate_rects, key=lambda rect: score_column_match(depth_column, rect)) else: return candidate_rects[0] - - -def perform_matching(directory: Path, **params: dict) -> dict: - """Perform the matching of text blocks with depth intervals. - - Args: - directory (Path): Path to the directory that contains the pdfs. - **params (dict): Additional parameters for the matching pipeline. - - Returns: - dict: The predictions. 
- """ - for root, _dirs, files in os.walk(directory): - output = {} - for filename in files: - if filename.endswith(".pdf"): - in_path = os.path.join(root, filename) - logger.info("Processing file: %s", in_path) - output[filename] = {} - - with fitz.Document(in_path) as doc: - for page_index, page in enumerate(doc): - page_number = page_index + 1 - logger.info("Processing page %s", page_number) - - predictions, depths_materials_column_pairs = process_page(page, **params) - - output[filename][f"page_{page_number}"] = { - "layers": predictions, - "depths_materials_column_pairs": depths_materials_column_pairs, - } - return output diff --git a/src/stratigraphy/line_detection.py b/src/stratigraphy/line_detection.py index a5870cff..6f9aa3b8 100644 --- a/src/stratigraphy/line_detection.py +++ b/src/stratigraphy/line_detection.py @@ -14,7 +14,6 @@ merge_parallel_lines_approximately, merge_parallel_lines_efficiently, ) -from stratigraphy.util.plot_utils import plot_lines from stratigraphy.util.util import line_from_array, read_params load_dotenv() @@ -85,21 +84,3 @@ def extract_lines(page: fitz.Page, line_detection_params: dict) -> list[Line]: lines, tol=merging_params["merging_tolerance"], angle_threshold=merging_params["angle_threshold"] ) return lines - - -def draw_lines_on_page(filename: str, page: fitz.Page, geometric_lines: list[Line]): - """Draw lines on pdf pages and stores them as artifacts in mlflow. - - Note: now the function draw_lines_on_pdfs may not even be needed any more. - - Args: - filename (str): The filename of the pdf. - page (fitz.Page): The page to draw lines on. - geometric_lines (list[Line]): The lines to draw on the pdf page. - """ - if not mlflow_tracking: - raise Warning("MLFlow tracking is not enabled. MLFLow is required to store the images.") - import mlflow - - img = plot_lines(page, geometric_lines, scale_factor=line_detection_params["pdf_scale_factor"]) - mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png") diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index f7213a6f..9856de9e 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -12,8 +12,9 @@ from stratigraphy import DATAPATH from stratigraphy.benchmark.score import add_ground_truth_to_predictions, evaluate_matching from stratigraphy.extract import process_page -from stratigraphy.line_detection import draw_lines_on_page, extract_lines, line_detection_params +from stratigraphy.line_detection import extract_lines, line_detection_params from stratigraphy.util.draw import draw_predictions +from stratigraphy.util.plot_utils import plot_lines from stratigraphy.util.util import flatten, read_params load_dotenv() @@ -130,9 +131,16 @@ def start_pipeline( "layers": layer_predictions, "depths_materials_column_pairs": depths_materials_column_pairs, } - if draw_lines: - logger.info("Drawing lines on pdf pages.") - draw_lines_on_page(filename, page, geometric_lines) + if draw_lines: # could be changed to if draw_lines and mflow_tracking: + if not mlflow_tracking: + logger.warning( + "MLFlow tracking is not enabled. MLFLow is required to store the images." 
+                            )
+                        else:
+                            img = plot_lines(
+                                page, geometric_lines, scale_factor=line_detection_params["pdf_scale_factor"]
+                            )
+                            mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png")
 
     with open(predictions_path, "w") as file:
         file.write(json.dumps(predictions))

From 3421484361f00e7b0ea1d478f527bf94ec190583 Mon Sep 17 00:00:00 2001
From: Renato Durrer
Date: Fri, 5 Apr 2024 11:35:17 +0200
Subject: [PATCH 17/19] Update comments.

---
 src/stratigraphy/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py
index 9856de9e..d9dc239a 100644
--- a/src/stratigraphy/main.py
+++ b/src/stratigraphy/main.py
@@ -145,7 +145,7 @@ def start_pipeline(
     with open(predictions_path, "w") as file:
         file.write(json.dumps(predictions))
 
-    # evaluate the predictions
+    # evaluate the predictions; if the ground truth file doesn't exist, the predictions are not changed.
     predictions, number_of_truth_values = add_ground_truth_to_predictions(predictions, ground_truth_path)

From a4141fc854600e0bf8f4a11eaa80ae09dc7697fe Mon Sep 17 00:00:00 2001
From: Renato Durrer
Date: Fri, 5 Apr 2024 11:40:11 +0200
Subject: [PATCH 18/19] Update readme; correct script name.

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 61155a08..08cb6cc9 100644
--- a/README.md
+++ b/README.md
@@ -44,9 +44,9 @@ To execute the data extraction pipeline, follow these steps:
 
    The main script for the extraction pipeline is located at `src/stratigraphy/main.py`. A cli command is created to run this script.
 
-   Run `boreholes-extract-materials` to run the main extraction script. With the default options, the command will source all PDFs from the `data/Benchmark` directory and create PNG files in the `data/Benchmark/extract` directory.
+   Run `boreholes-extract-layers` to run the main extraction script. With the default options, the command will source all PDFs from the `data/Benchmark` directory and create PNG files in the `data/Benchmark/extract` directory.
 
-   Use `boreholes-extract-materials --help` to see all options for the extraction script.
+   Use `boreholes-extract-layers --help` to see all options for the extraction script.
 
 3. **Check the results**
 

From 901b02dda15c717ae1b3c3feec06b195124e7e7c Mon Sep 17 00:00:00 2001
From: Renato Durrer
Date: Fri, 5 Apr 2024 11:41:38 +0200
Subject: [PATCH 19/19] Update documentation to clarify that input_directory can be either a directory or a pdf path.

---
 src/stratigraphy/main.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py
index d9dc239a..3baa75e2 100644
--- a/src/stratigraphy/main.py
+++ b/src/stratigraphy/main.py
@@ -33,7 +33,7 @@
     "--input_directory",
     type=click.Path(exists=True, path_type=Path),
     default=DATAPATH / "Benchmark",
-    help="Path to the input directory.",
+    help="Path to the input directory, or path to a single pdf file.",
 )
 @click.option(
     "-g",
@@ -82,7 +82,7 @@ def start_pipeline(
     \f
     Args:
-        input_directory (Path): The directory containing the pdf files.
+        input_directory (Path): The directory containing the pdf files. Can also be the path to a single pdf file.
         ground_truth_path (Path): The path to the ground truth json file.
         out_directory (Path): The directory to store the evaluation results.
         predictions_path (Path): The path to the predictions file.