From 397cae80a08064393b896dd95330337d140b206b Mon Sep 17 00:00:00 2001
From: Renato Durrer <renato.durrer@visium.ch>
Date: Wed, 3 Apr 2024 14:18:21 +0200
Subject: [PATCH 1/8] Refactor extraction pipeline.

---
 src/stratigraphy/extract.py        |  6 ++---
 src/stratigraphy/line_detection.py | 24 ++++++------------
 src/stratigraphy/main.py           | 39 ++++++++++++++++++++++++------
 3 files changed, 41 insertions(+), 28 deletions(-)

diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py
index a3b911d1..9cd622ae 100644
--- a/src/stratigraphy/extract.py
+++ b/src/stratigraphy/extract.py
@@ -7,7 +7,6 @@
 
 import fitz
 
-from stratigraphy.line_detection import extract_lines, line_detection_params
 from stratigraphy.util import find_depth_columns
 from stratigraphy.util.dataclasses import Line
 from stratigraphy.util.depthcolumn import DepthColumn
@@ -25,13 +24,14 @@
 logger = logging.getLogger(__name__)
 
 
-def process_page(page: fitz.Page, **params: dict) -> list[dict]:
+def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict]:
     """Process a single page of a pdf.
 
     Finds all descriptions and depth intervals on the page and matches them.
 
     Args:
         page (fitz.Page): The page to process.
+        geometric_lines (list[Line]): The geometric lines of the page.
         **params (dict): Additional parameters for the matching pipeline.
 
     Returns:
@@ -97,8 +97,6 @@ def process_page(page: fitz.Page, **params: dict) -> list[dict]:
                 continue
     filtered_pairs = [item for index, item in enumerate(pairs) if index not in to_delete]
 
-    geometric_lines = extract_lines(page, line_detection_params)
-
     groups = []  # list of matched depth intervals and text blocks
     # groups is of the form: ["depth_interval": BoundaryInterval, "block": TextBlock]
     if len(filtered_pairs):  # match depth column items with material description
diff --git a/src/stratigraphy/line_detection.py b/src/stratigraphy/line_detection.py
index 0bb8afc9..98e323f3 100644
--- a/src/stratigraphy/line_detection.py
+++ b/src/stratigraphy/line_detection.py
@@ -1,7 +1,6 @@
 """Script for line detection in pdf pages."""
 
 import os
-from pathlib import Path
 
 import cv2
 import fitz
@@ -88,26 +87,19 @@ def extract_lines(page: fitz.Page, line_detection_params: dict) -> list[Line]:
     return lines
 
 
-def draw_lines_on_pdfs(input_directory: Path, line_detection_params: dict):
+def draw_lines_on_pdfs(filename: str, page: fitz.Page, geometric_lines: list[Line]):
     """Draw lines on pdf pages and stores them as artifacts in mlflow.
 
+    Note: now the function draw_lines_on_pdfs may not even be needed any more.
+
     Args:
-        input_directory (Path): The directory containing the pdf files.
-        line_detection_params (dict): The parameters for the line detection algorithm.
+        filename (str): The filename of the pdf.
+        page (fitz.Page): The page to draw lines on.
+        geometric_lines (list[Line]): The lines to draw on the pdf page.
     """
     if not mlflow_tracking:
         raise Warning("MLFlow tracking is not enabled. MLFLow is required to store the images.")
     import mlflow
 
-    for root, _dirs, files in os.walk(input_directory):
-        output = {}
-        for filename in files:
-            if filename.endswith(".pdf"):
-                in_path = os.path.join(root, filename)
-                output[filename] = {}
-
-                with fitz.Document(in_path) as doc:
-                    for page_index, page in enumerate(doc):
-                        lines = extract_lines(page, line_detection_params)
-                        img = plot_lines(page, lines, scale_factor=line_detection_params["pdf_scale_factor"])
-                        mlflow.log_image(img, f"pages/{filename}_page_{page_index}_lines.png")
+    img = plot_lines(page, geometric_lines, scale_factor=line_detection_params["pdf_scale_factor"])
+    mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png")
diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py
index c0ff1876..a2d2eb7e 100644
--- a/src/stratigraphy/main.py
+++ b/src/stratigraphy/main.py
@@ -7,12 +7,13 @@
 from pathlib import Path
 
 import click
+import fitz
 from dotenv import load_dotenv
 
 from stratigraphy import DATAPATH
 from stratigraphy.benchmark.score import evaluate_matching
-from stratigraphy.extract import perform_matching
-from stratigraphy.line_detection import draw_lines_on_pdfs, line_detection_params
+from stratigraphy.extract import process_page
+from stratigraphy.line_detection import draw_lines_on_pdfs, extract_lines, line_detection_params
 from stratigraphy.util.util import flatten, read_params
 
 load_dotenv()
@@ -110,8 +111,34 @@ def start_pipeline(
         shutil.copy(input_directory, temp_directory / "single_file")
         input_directory = temp_directory / "single_file"
 
-    # run the matching pipeline and save the result
-    predictions = perform_matching(input_directory, **matching_params)
+    # process the individual pdf files
+    predictions = {}
+    for root, _dirs, files in os.walk(input_directory):
+        for filename in files:
+            if filename.endswith(".pdf"):
+                in_path = os.path.join(root, filename)
+                logger.info("Processing file: %s", in_path)
+                predictions[filename] = {}
+
+                with fitz.Document(in_path) as doc:
+                    for page_index, page in enumerate(doc):
+                        page_number = page_index + 1
+                        logger.info("Processing page %s", page_number)
+
+                        geometric_lines = extract_lines(page, line_detection_params)
+                        layer_predictions, depths_materials_column_pairs = process_page(
+                            page, geometric_lines, **matching_params
+                        )
+
+                        predictions[filename][f"page_{page_number}"] = {
+                            "layers": layer_predictions,
+                            "depths_materials_column_pairs": depths_materials_column_pairs,
+                        }
+
+                        if draw_lines:
+                            logger.info("Drawing lines on pdf pages.")
+                            draw_lines_on_pdfs(filename, page, geometric_lines)
+
     with open(predictions_path, "w") as file:
         file.write(json.dumps(predictions))
 
@@ -125,10 +152,6 @@ def start_pipeline(
         mlflow.log_metrics(metrics)
         mlflow.log_artifact(temp_directory / "document_level_metrics.csv")
 
-    if draw_lines:
-        logger.info("Drawing lines on pdf pages.")
-        draw_lines_on_pdfs(input_directory, line_detection_params=line_detection_params)
-
 
 if __name__ == "__main__":
     start_pipeline()

From 12cde5813a5366a54d9c3af2f64bd70d6a9b2e62 Mon Sep 17 00:00:00 2001
From: Renato Durrer <renato.durrer@visium.ch>
Date: Wed, 3 Apr 2024 14:39:37 +0200
Subject: [PATCH 2/8] make single files work without temporary directory.

---
 src/stratigraphy/main.py      | 13 ++++---------
 src/stratigraphy/util/draw.py |  2 ++
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py
index a2d2eb7e..03005d17 100644
--- a/src/stratigraphy/main.py
+++ b/src/stratigraphy/main.py
@@ -3,7 +3,6 @@
 import json
 import logging
 import os
-import shutil
 from pathlib import Path
 
 import click
@@ -104,16 +103,12 @@ def start_pipeline(
 
     # if a file is specified instead of an input directory, copy the file to a temporary directory and work with that.
     if input_directory.is_file():
-        if (temp_directory / "single_file").is_dir():
-            shutil.rmtree(temp_directory / "single_file")
-
-        Path.mkdir(temp_directory / "single_file")
-        shutil.copy(input_directory, temp_directory / "single_file")
-        input_directory = temp_directory / "single_file"
-
+        file_iterator = [(input_directory.parent, None, [input_directory.name])]
+    else:
+        file_iterator = os.walk(input_directory)
     # process the individual pdf files
     predictions = {}
-    for root, _dirs, files in os.walk(input_directory):
+    for root, _dirs, files in file_iterator:
         for filename in files:
             if filename.endswith(".pdf"):
                 in_path = os.path.join(root, filename)
diff --git a/src/stratigraphy/util/draw.py b/src/stratigraphy/util/draw.py
index 388d60b6..948bd430 100644
--- a/src/stratigraphy/util/draw.py
+++ b/src/stratigraphy/util/draw.py
@@ -35,6 +35,8 @@ def draw_predictions(predictions: dict, directory: Path, out_directory: Path) ->
         directory (Path): Path to the directory containing the pdf files.
         out_directory (Path): Path to the output directory where the images are saved.
     """
+    if directory.is_file():  # deal with the case when we pass a file instead of a directory
+        directory = directory.parent
     for file in predictions:
         logger.info(f"Evaluating {file}.")
         with fitz.Document(directory / file) as doc:

From b549d0ff91e647194fb23afdb7c86a94d444b40e Mon Sep 17 00:00:00 2001
From: Renato Durrer <renato.durrer@visium.ch>
Date: Thu, 4 Apr 2024 08:11:26 +0200
Subject: [PATCH 3/8] suggestion for rectangle correction with lines.

---
 src/stratigraphy/extract.py          | 42 ++++++++++++++++++++++++++++
 src/stratigraphy/util/dataclasses.py |  3 ++
 2 files changed, 45 insertions(+)

diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py
index 9cd622ae..5e312730 100644
--- a/src/stratigraphy/extract.py
+++ b/src/stratigraphy/extract.py
@@ -84,6 +84,9 @@ def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict]
     for depth_column in depth_columns:
         material_description_rect = find_material_description_column(lines, depth_column)
         if material_description_rect:
+            material_description_rect = adjust_material_description_rect(
+                material_description_rect, page.rect.width, geometric_lines
+            )
             pairs.append((depth_column, material_description_rect))
 
     # lowest score first
@@ -125,6 +128,9 @@ def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict]
         # Fallback when no depth column was found
         material_description_rect = find_material_description_column(lines, depth_column=None)
         if material_description_rect:
+            material_description_rect = adjust_material_description_rect(
+                material_description_rect, page.rect.width, geometric_lines
+            )
             description_lines = get_description_lines(lines, material_description_rect)
             description_blocks = get_description_blocks(
                 description_lines,
@@ -456,6 +462,42 @@ def is_below(best_x0, best_y1, line):
         return candidate_rects[0]
 
 
+def adjust_material_description_rect(
+    material_description_rect: fitz.Rect, page_width: float, geometric_lines: list[Line]
+) -> fitz.Rect:
+    """Adjust the material description rectangle based on the geometric lines.
+
+    Tries to adjust the material description rectangle to the top until a long solid line is reached.
+
+    Args:
+        material_description_rect (fitz.Rect): _description_
+        page_width (float): _description_
+        geometric_lines (list[Line]): _description_
+
+    Returns:
+        fitz.Rect: _description_
+    """
+    material_description_rect_top = material_description_rect.y0
+    max_line_y = 0
+    for line in geometric_lines:
+        if (
+            line.length() > 0.7 * material_description_rect.width
+            and (line.start.y + line.end.y) / 2 < material_description_rect_top + 5
+        ):
+            max_line_y = max(max_line_y, (line.start.y + line.end.y) / 2) - 5
+
+    if max_line_y > material_description_rect_top:
+        new_coordinates = [
+            material_description_rect.x0,
+            max_line_y,
+            material_description_rect.x1,
+            material_description_rect.y1,
+        ]
+        return fitz.Rect(*new_coordinates)
+    else:
+        return material_description_rect
+
+
 def perform_matching(directory: Path, **params: dict) -> dict:
     """Perform the matching of text blocks with depth intervals.
 
diff --git a/src/stratigraphy/util/dataclasses.py b/src/stratigraphy/util/dataclasses.py
index 48436128..b787743b 100644
--- a/src/stratigraphy/util/dataclasses.py
+++ b/src/stratigraphy/util/dataclasses.py
@@ -47,6 +47,9 @@ def distance_to(self, point: Point) -> float:
             - (self.start.x - point.x) * (self.end.y - self.start.y)
         ) / np.sqrt((self.end.x - self.start.x) ** 2 + (self.end.y - self.start.y) ** 2)
 
+    def length(self) -> float:
+        return np.sqrt((self.end.x - self.start.x) ** 2 + (self.end.y - self.start.y) ** 2)
+
     def slope(self) -> float:
         return (self.end.y - self.start.y) / (self.end.x - self.start.x) if self.end.x - self.start.x != 0 else np.inf
 

From 1edbb7975b39a389eefefaad8676f149acdeb05b Mon Sep 17 00:00:00 2001
From: Renato Durrer <renato.durrer@visium.ch>
Date: Thu, 4 Apr 2024 14:24:29 +0200
Subject: [PATCH 4/8] refactoring evaluate_matching.

---
 src/stratigraphy/benchmark/score.py | 30 ++++++++---------------------
 src/stratigraphy/line_detection.py  |  2 +-
 src/stratigraphy/main.py            | 16 +++++++++------
 3 files changed, 19 insertions(+), 29 deletions(-)

diff --git a/src/stratigraphy/benchmark/score.py b/src/stratigraphy/benchmark/score.py
index 7a552091..a655bc7e 100644
--- a/src/stratigraphy/benchmark/score.py
+++ b/src/stratigraphy/benchmark/score.py
@@ -1,6 +1,5 @@
 """Evaluate the predictions against the ground truth."""
 
-import json
 import logging
 import os
 from pathlib import Path
@@ -9,7 +8,6 @@
 from dotenv import load_dotenv
 from stratigraphy import DATAPATH
 from stratigraphy.benchmark.ground_truth import GroundTruth
-from stratigraphy.util.draw import draw_predictions
 from stratigraphy.util.util import parse_text
 
 load_dotenv()
@@ -56,34 +54,20 @@ def f1(precision: float, recall: float) -> float:
         return 0
 
 
-def evaluate_matching(
-    predictions_path: Path, ground_truth_path: Path, directory: Path, out_directory: Path, skip_draw_predictions: bool
-) -> tuple[dict, pd.DataFrame]:
+def evaluate_matching(predictions: dict, number_of_truth_values: dict) -> tuple[dict, pd.DataFrame]:
     """Calculate F1, precision and recall for the predictions.
 
     Calculate F1, precision and recall for the individual documents as well as overall.
     The individual document metrics are returned as a DataFrame.
 
     Args:
-        predictions_path (Path): Path to the predictions.json file.
-        ground_truth_path (Path): Path to the ground truth annotated data.
-        directory (Path): Path to the directory containing the pdf files.
-        out_directory (Path): Path to the directory where the evaluation images should be saved.
-        skip_draw_predictions (bool): Whether to draw the predictions on the pdf pages.
+        predictions (dict): The predictions.
+        number_of_truth_values (dict): The number of ground truth values per file.
 
     Returns:
         tuple[dict, pd.DataFrame]: A tuple containing the overall F1, precision and recall as a dictionary and the
         individual document metrics as a DataFrame.
     """
-    ground_truth = GroundTruth(ground_truth_path)
-    with open(predictions_path) as in_file:
-        predictions = json.load(in_file)
-
-    predictions, number_of_truth_values = _add_ground_truth_to_predictions(predictions, ground_truth)
-
-    if not skip_draw_predictions:
-        draw_predictions(predictions, directory, out_directory)
-
     document_level_metrics = {
         "document_name": [],
         "F1": [],
@@ -137,16 +121,18 @@ def evaluate_matching(
     }, pd.DataFrame(document_level_metrics)
 
 
-def _add_ground_truth_to_predictions(predictions: dict, ground_truth: GroundTruth) -> (dict, dict):
+def add_ground_truth_to_predictions(predictions: dict, ground_truth_path: Path) -> tuple[dict, dict]:
     """Add the ground truth to the predictions.
 
     Args:
         predictions (dict): The predictions.
-        ground_truth (GroundTruth): The ground truth.
+        ground_truth_path (Path): The path to the ground truth file.
 
     Returns:
-        (dict, dict): The predictions with the ground truth added, and the number of ground truth values per file.
+        tuple[dict, dict]: The predictions with the ground truth added, and the number of ground truth values per file.
     """
+    ground_truth = GroundTruth(ground_truth_path)
+
     number_of_truth_values = {}
     for file, file_predictions in predictions.items():
         ground_truth_for_file = ground_truth.for_file(file)
diff --git a/src/stratigraphy/line_detection.py b/src/stratigraphy/line_detection.py
index 98e323f3..a5870cff 100644
--- a/src/stratigraphy/line_detection.py
+++ b/src/stratigraphy/line_detection.py
@@ -87,7 +87,7 @@ def extract_lines(page: fitz.Page, line_detection_params: dict) -> list[Line]:
     return lines
 
 
-def draw_lines_on_pdfs(filename: str, page: fitz.Page, geometric_lines: list[Line]):
+def draw_lines_on_page(filename: str, page: fitz.Page, geometric_lines: list[Line]):
     """Draw lines on pdf pages and stores them as artifacts in mlflow.
 
     Note: now the function draw_lines_on_pdfs may not even be needed any more.
diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py
index 03005d17..e61c7222 100644
--- a/src/stratigraphy/main.py
+++ b/src/stratigraphy/main.py
@@ -10,9 +10,10 @@
 from dotenv import load_dotenv
 
 from stratigraphy import DATAPATH
-from stratigraphy.benchmark.score import evaluate_matching
+from stratigraphy.benchmark.score import add_ground_truth_to_predictions, evaluate_matching
 from stratigraphy.extract import process_page
-from stratigraphy.line_detection import draw_lines_on_pdfs, extract_lines, line_detection_params
+from stratigraphy.line_detection import draw_lines_on_page, extract_lines, line_detection_params
+from stratigraphy.util.draw import draw_predictions
 from stratigraphy.util.util import flatten, read_params
 
 load_dotenv()
@@ -132,15 +133,18 @@ def start_pipeline(
 
                         if draw_lines:
                             logger.info("Drawing lines on pdf pages.")
-                            draw_lines_on_pdfs(filename, page, geometric_lines)
+                            draw_lines_on_page(filename, page, geometric_lines)
 
     with open(predictions_path, "w") as file:
         file.write(json.dumps(predictions))
 
     # evaluate the predictions
-    metrics, document_level_metrics = evaluate_matching(
-        predictions_path, ground_truth_path, input_directory, out_directory, skip_draw_predictions
-    )
+    predictions, number_of_truth_values = add_ground_truth_to_predictions(predictions, ground_truth_path)
+
+    if not skip_draw_predictions:
+        draw_predictions(predictions, input_directory, out_directory)
+
+    metrics, document_level_metrics = evaluate_matching(predictions, number_of_truth_values)
     document_level_metrics.to_csv(temp_directory / "document_level_metrics.csv")  # mlflow.log_artifact expects a file
 
     if mlflow_tracking:

From ebe0bac5ba58fdb4e59f41792eaaf5486a42f5bc Mon Sep 17 00:00:00 2001
From: Renato Durrer <renato.durrer@visium.ch>
Date: Thu, 4 Apr 2024 14:27:10 +0200
Subject: [PATCH 5/8] Undo mistaken commit.

---
 src/stratigraphy/extract.py          | 42 ----------------------------
 src/stratigraphy/util/dataclasses.py |  3 --
 2 files changed, 45 deletions(-)

diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py
index 5e312730..9cd622ae 100644
--- a/src/stratigraphy/extract.py
+++ b/src/stratigraphy/extract.py
@@ -84,9 +84,6 @@ def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict]
     for depth_column in depth_columns:
         material_description_rect = find_material_description_column(lines, depth_column)
         if material_description_rect:
-            material_description_rect = adjust_material_description_rect(
-                material_description_rect, page.rect.width, geometric_lines
-            )
             pairs.append((depth_column, material_description_rect))
 
     # lowest score first
@@ -128,9 +125,6 @@ def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict]
         # Fallback when no depth column was found
         material_description_rect = find_material_description_column(lines, depth_column=None)
         if material_description_rect:
-            material_description_rect = adjust_material_description_rect(
-                material_description_rect, page.rect.width, geometric_lines
-            )
             description_lines = get_description_lines(lines, material_description_rect)
             description_blocks = get_description_blocks(
                 description_lines,
@@ -462,42 +456,6 @@ def is_below(best_x0, best_y1, line):
         return candidate_rects[0]
 
 
-def adjust_material_description_rect(
-    material_description_rect: fitz.Rect, page_width: float, geometric_lines: list[Line]
-) -> fitz.Rect:
-    """Adjust the material description rectangle based on the geometric lines.
-
-    Tries to adjust the material description rectangle to the top until a long solid line is reached.
-
-    Args:
-        material_description_rect (fitz.Rect): _description_
-        page_width (float): _description_
-        geometric_lines (list[Line]): _description_
-
-    Returns:
-        fitz.Rect: _description_
-    """
-    material_description_rect_top = material_description_rect.y0
-    max_line_y = 0
-    for line in geometric_lines:
-        if (
-            line.length() > 0.7 * material_description_rect.width
-            and (line.start.y + line.end.y) / 2 < material_description_rect_top + 5
-        ):
-            max_line_y = max(max_line_y, (line.start.y + line.end.y) / 2) - 5
-
-    if max_line_y > material_description_rect_top:
-        new_coordinates = [
-            material_description_rect.x0,
-            max_line_y,
-            material_description_rect.x1,
-            material_description_rect.y1,
-        ]
-        return fitz.Rect(*new_coordinates)
-    else:
-        return material_description_rect
-
-
 def perform_matching(directory: Path, **params: dict) -> dict:
     """Perform the matching of text blocks with depth intervals.
 
diff --git a/src/stratigraphy/util/dataclasses.py b/src/stratigraphy/util/dataclasses.py
index b787743b..48436128 100644
--- a/src/stratigraphy/util/dataclasses.py
+++ b/src/stratigraphy/util/dataclasses.py
@@ -47,9 +47,6 @@ def distance_to(self, point: Point) -> float:
             - (self.start.x - point.x) * (self.end.y - self.start.y)
         ) / np.sqrt((self.end.x - self.start.x) ** 2 + (self.end.y - self.start.y) ** 2)
 
-    def length(self) -> float:
-        return np.sqrt((self.end.x - self.start.x) ** 2 + (self.end.y - self.start.y) ** 2)
-
     def slope(self) -> float:
         return (self.end.y - self.start.y) / (self.end.x - self.start.x) if self.end.x - self.start.x != 0 else np.inf
 

From 2166ad13e1bb761e374af1d9d25878ff90ec6f19 Mon Sep 17 00:00:00 2001
From: Renato Durrer <renato.durrer@visium.ch>
Date: Thu, 4 Apr 2024 15:19:39 +0200
Subject: [PATCH 6/8] Minor refactoring

---
 src/stratigraphy/main.py      | 1 -
 src/stratigraphy/util/draw.py | 3 +--
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py
index e61c7222..f7213a6f 100644
--- a/src/stratigraphy/main.py
+++ b/src/stratigraphy/main.py
@@ -130,7 +130,6 @@ def start_pipeline(
                             "layers": layer_predictions,
                             "depths_materials_column_pairs": depths_materials_column_pairs,
                         }
-
                         if draw_lines:
                             logger.info("Drawing lines on pdf pages.")
                             draw_lines_on_page(filename, page, geometric_lines)
diff --git a/src/stratigraphy/util/draw.py b/src/stratigraphy/util/draw.py
index 948bd430..c5a814f4 100644
--- a/src/stratigraphy/util/draw.py
+++ b/src/stratigraphy/util/draw.py
@@ -31,14 +31,13 @@ def draw_predictions(predictions: dict, directory: Path, out_directory: Path) ->
         - Assignments of material description text blocks to depth intervals (if available)
 
     Args:
-        predictions (dict): Content of the predictions.json file..
+        predictions (dict): Content of the predictions.json file.
         directory (Path): Path to the directory containing the pdf files.
         out_directory (Path): Path to the output directory where the images are saved.
     """
     if directory.is_file():  # deal with the case when we pass a file instead of a directory
         directory = directory.parent
     for file in predictions:
-        logger.info(f"Evaluating {file}.")
         with fitz.Document(directory / file) as doc:
             for page_index, page in enumerate(doc):
                 page_number = page_index + 1

From d32976b11860d0dcb12cbc3d5163f9dea9a5c81c Mon Sep 17 00:00:00 2001
From: Renato Durrer <renato.durrer@visium.ch>
Date: Fri, 5 Apr 2024 10:33:09 +0200
Subject: [PATCH 7/8] Remove unused function. Remove draw_lines_on_page and
 implement it in main.py directly.

---
 src/stratigraphy/extract.py        | 34 ------------------------------
 src/stratigraphy/line_detection.py | 19 -----------------
 src/stratigraphy/main.py           | 16 ++++++++++----
 3 files changed, 12 insertions(+), 57 deletions(-)

diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py
index 9cd622ae..cfd44633 100644
--- a/src/stratigraphy/extract.py
+++ b/src/stratigraphy/extract.py
@@ -2,8 +2,6 @@
 
 import logging
 import math
-import os
-from pathlib import Path
 
 import fitz
 
@@ -454,35 +452,3 @@ def is_below(best_x0, best_y1, line):
         return max(candidate_rects, key=lambda rect: score_column_match(depth_column, rect))
     else:
         return candidate_rects[0]
-
-
-def perform_matching(directory: Path, **params: dict) -> dict:
-    """Perform the matching of text blocks with depth intervals.
-
-    Args:
-        directory (Path): Path to the directory that contains the pdfs.
-        **params (dict): Additional parameters for the matching pipeline.
-
-    Returns:
-        dict: The predictions.
-    """
-    for root, _dirs, files in os.walk(directory):
-        output = {}
-        for filename in files:
-            if filename.endswith(".pdf"):
-                in_path = os.path.join(root, filename)
-                logger.info("Processing file: %s", in_path)
-                output[filename] = {}
-
-                with fitz.Document(in_path) as doc:
-                    for page_index, page in enumerate(doc):
-                        page_number = page_index + 1
-                        logger.info("Processing page %s", page_number)
-
-                        predictions, depths_materials_column_pairs = process_page(page, **params)
-
-                        output[filename][f"page_{page_number}"] = {
-                            "layers": predictions,
-                            "depths_materials_column_pairs": depths_materials_column_pairs,
-                        }
-        return output
diff --git a/src/stratigraphy/line_detection.py b/src/stratigraphy/line_detection.py
index a5870cff..6f9aa3b8 100644
--- a/src/stratigraphy/line_detection.py
+++ b/src/stratigraphy/line_detection.py
@@ -14,7 +14,6 @@
     merge_parallel_lines_approximately,
     merge_parallel_lines_efficiently,
 )
-from stratigraphy.util.plot_utils import plot_lines
 from stratigraphy.util.util import line_from_array, read_params
 
 load_dotenv()
@@ -85,21 +84,3 @@ def extract_lines(page: fitz.Page, line_detection_params: dict) -> list[Line]:
             lines, tol=merging_params["merging_tolerance"], angle_threshold=merging_params["angle_threshold"]
         )
     return lines
-
-
-def draw_lines_on_page(filename: str, page: fitz.Page, geometric_lines: list[Line]):
-    """Draw lines on pdf pages and stores them as artifacts in mlflow.
-
-    Note: now the function draw_lines_on_pdfs may not even be needed any more.
-
-    Args:
-        filename (str): The filename of the pdf.
-        page (fitz.Page): The page to draw lines on.
-        geometric_lines (list[Line]): The lines to draw on the pdf page.
-    """
-    if not mlflow_tracking:
-        raise Warning("MLFlow tracking is not enabled. MLFLow is required to store the images.")
-    import mlflow
-
-    img = plot_lines(page, geometric_lines, scale_factor=line_detection_params["pdf_scale_factor"])
-    mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png")
diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py
index f7213a6f..9856de9e 100644
--- a/src/stratigraphy/main.py
+++ b/src/stratigraphy/main.py
@@ -12,8 +12,9 @@
 from stratigraphy import DATAPATH
 from stratigraphy.benchmark.score import add_ground_truth_to_predictions, evaluate_matching
 from stratigraphy.extract import process_page
-from stratigraphy.line_detection import draw_lines_on_page, extract_lines, line_detection_params
+from stratigraphy.line_detection import extract_lines, line_detection_params
 from stratigraphy.util.draw import draw_predictions
+from stratigraphy.util.plot_utils import plot_lines
 from stratigraphy.util.util import flatten, read_params
 
 load_dotenv()
@@ -130,9 +131,16 @@ def start_pipeline(
                             "layers": layer_predictions,
                             "depths_materials_column_pairs": depths_materials_column_pairs,
                         }
-                        if draw_lines:
-                            logger.info("Drawing lines on pdf pages.")
-                            draw_lines_on_page(filename, page, geometric_lines)
+                        if draw_lines:  # could be changed to if draw_lines and mlflow_tracking:
+                            if not mlflow_tracking:
+                                logger.warning(
+                                    "MLFlow tracking is not enabled. MLFLow is required to store the images."
+                                )
+                            else:
+                                img = plot_lines(
+                                    page, geometric_lines, scale_factor=line_detection_params["pdf_scale_factor"]
+                                )
+                                mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png")
 
     with open(predictions_path, "w") as file:
         file.write(json.dumps(predictions))

From 3421484361f00e7b0ea1d478f527bf94ec190583 Mon Sep 17 00:00:00 2001
From: Renato Durrer <renato.durrer@visium.ch>
Date: Fri, 5 Apr 2024 11:35:17 +0200
Subject: [PATCH 8/8] Update comments.

---
 src/stratigraphy/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py
index 9856de9e..d9dc239a 100644
--- a/src/stratigraphy/main.py
+++ b/src/stratigraphy/main.py
@@ -145,7 +145,7 @@ def start_pipeline(
     with open(predictions_path, "w") as file:
         file.write(json.dumps(predictions))
 
-    # evaluate the predictions
+    # evaluate the predictions; if the ground truth file does not exist, the predictions are not changed.
     predictions, number_of_truth_values = add_ground_truth_to_predictions(predictions, ground_truth_path)
 
     if not skip_draw_predictions: