From 397cae80a08064393b896dd95330337d140b206b Mon Sep 17 00:00:00 2001 From: Renato Durrer <renato.durrer@visium.ch> Date: Wed, 3 Apr 2024 14:18:21 +0200 Subject: [PATCH 1/8] Refactor extraction pipeline.2 --- src/stratigraphy/extract.py | 6 ++--- src/stratigraphy/line_detection.py | 24 ++++++------------ src/stratigraphy/main.py | 39 ++++++++++++++++++++++++------ 3 files changed, 41 insertions(+), 28 deletions(-) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index a3b911d1..9cd622ae 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -7,7 +7,6 @@ import fitz -from stratigraphy.line_detection import extract_lines, line_detection_params from stratigraphy.util import find_depth_columns from stratigraphy.util.dataclasses import Line from stratigraphy.util.depthcolumn import DepthColumn @@ -25,13 +24,14 @@ logger = logging.getLogger(__name__) -def process_page(page: fitz.Page, **params: dict) -> list[dict]: +def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict]: """Process a single page of a pdf. Finds all descriptions and depth intervals on the page and matches them. Args: page (fitz.Page): The page to process. + geometric_lines (list[Line]): The geometric lines of the page. **params (dict): Additional parameters for the matching pipeline. 
Returns: @@ -97,8 +97,6 @@ def process_page(page: fitz.Page, **params: dict) -> list[dict]: continue filtered_pairs = [item for index, item in enumerate(pairs) if index not in to_delete] - geometric_lines = extract_lines(page, line_detection_params) - groups = [] # list of matched depth intervals and text blocks # groups is of the form: ["depth_interval": BoundaryInterval, "block": TextBlock] if len(filtered_pairs): # match depth column items with material description diff --git a/src/stratigraphy/line_detection.py b/src/stratigraphy/line_detection.py index 0bb8afc9..98e323f3 100644 --- a/src/stratigraphy/line_detection.py +++ b/src/stratigraphy/line_detection.py @@ -1,7 +1,6 @@ """Script for line detection in pdf pages.""" import os -from pathlib import Path import cv2 import fitz @@ -88,26 +87,19 @@ def extract_lines(page: fitz.Page, line_detection_params: dict) -> list[Line]: return lines -def draw_lines_on_pdfs(input_directory: Path, line_detection_params: dict): +def draw_lines_on_pdfs(filename: str, page: fitz.Page, geometric_lines: list[Line]): """Draw lines on pdf pages and stores them as artifacts in mlflow. + Note: now the function draw_lines_on_pdfs may not even be needed any more. + Args: - input_directory (Path): The directory containing the pdf files. - line_detection_params (dict): The parameters for the line detection algorithm. + filename (str): The filename of the pdf. + page (fitz.Page): The page to draw lines on. + geometric_lines (list[Line]): The lines to draw on the pdf page. """ if not mlflow_tracking: raise Warning("MLFlow tracking is not enabled. 
MLFLow is required to store the images.") import mlflow - for root, _dirs, files in os.walk(input_directory): - output = {} - for filename in files: - if filename.endswith(".pdf"): - in_path = os.path.join(root, filename) - output[filename] = {} - - with fitz.Document(in_path) as doc: - for page_index, page in enumerate(doc): - lines = extract_lines(page, line_detection_params) - img = plot_lines(page, lines, scale_factor=line_detection_params["pdf_scale_factor"]) - mlflow.log_image(img, f"pages/{filename}_page_{page_index}_lines.png") + img = plot_lines(page, geometric_lines, scale_factor=line_detection_params["pdf_scale_factor"]) + mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png") diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index c0ff1876..a2d2eb7e 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -7,12 +7,13 @@ from pathlib import Path import click +import fitz from dotenv import load_dotenv from stratigraphy import DATAPATH from stratigraphy.benchmark.score import evaluate_matching -from stratigraphy.extract import perform_matching -from stratigraphy.line_detection import draw_lines_on_pdfs, line_detection_params +from stratigraphy.extract import process_page +from stratigraphy.line_detection import draw_lines_on_pdfs, extract_lines, line_detection_params from stratigraphy.util.util import flatten, read_params load_dotenv() @@ -110,8 +111,34 @@ def start_pipeline( shutil.copy(input_directory, temp_directory / "single_file") input_directory = temp_directory / "single_file" - # run the matching pipeline and save the result - predictions = perform_matching(input_directory, **matching_params) + # process the individual pdf files + predictions = {} + for root, _dirs, files in os.walk(input_directory): + for filename in files: + if filename.endswith(".pdf"): + in_path = os.path.join(root, filename) + logger.info("Processing file: %s", in_path) + predictions[filename] = {} + + with 
fitz.Document(in_path) as doc: + for page_index, page in enumerate(doc): + page_number = page_index + 1 + logger.info("Processing page %s", page_number) + + geometric_lines = extract_lines(page, line_detection_params) + layer_predictions, depths_materials_column_pairs = process_page( + page, geometric_lines, **matching_params + ) + + predictions[filename][f"page_{page_number}"] = { + "layers": layer_predictions, + "depths_materials_column_pairs": depths_materials_column_pairs, + } + + if draw_lines: + logger.info("Drawing lines on pdf pages.") + draw_lines_on_pdfs(filename, page, geometric_lines) + with open(predictions_path, "w") as file: file.write(json.dumps(predictions)) @@ -125,10 +152,6 @@ def start_pipeline( mlflow.log_metrics(metrics) mlflow.log_artifact(temp_directory / "document_level_metrics.csv") - if draw_lines: - logger.info("Drawing lines on pdf pages.") - draw_lines_on_pdfs(input_directory, line_detection_params=line_detection_params) - if __name__ == "__main__": start_pipeline() From 12cde5813a5366a54d9c3af2f64bd70d6a9b2e62 Mon Sep 17 00:00:00 2001 From: Renato Durrer <renato.durrer@visium.ch> Date: Wed, 3 Apr 2024 14:39:37 +0200 Subject: [PATCH 2/8] make single files work without temporary directory. --- src/stratigraphy/main.py | 13 ++++--------- src/stratigraphy/util/draw.py | 2 ++ 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index a2d2eb7e..03005d17 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -3,7 +3,6 @@ import json import logging import os -import shutil from pathlib import Path import click @@ -104,16 +103,12 @@ def start_pipeline( # if a file is specified instead of an input directory, copy the file to a temporary directory and work with that. 
if input_directory.is_file(): - if (temp_directory / "single_file").is_dir(): - shutil.rmtree(temp_directory / "single_file") - - Path.mkdir(temp_directory / "single_file") - shutil.copy(input_directory, temp_directory / "single_file") - input_directory = temp_directory / "single_file" - + file_iterator = [(input_directory.parent, None, [input_directory.name])] + else: + file_iterator = os.walk(input_directory) # process the individual pdf files predictions = {} - for root, _dirs, files in os.walk(input_directory): + for root, _dirs, files in file_iterator: for filename in files: if filename.endswith(".pdf"): in_path = os.path.join(root, filename) diff --git a/src/stratigraphy/util/draw.py b/src/stratigraphy/util/draw.py index 388d60b6..948bd430 100644 --- a/src/stratigraphy/util/draw.py +++ b/src/stratigraphy/util/draw.py @@ -35,6 +35,8 @@ def draw_predictions(predictions: dict, directory: Path, out_directory: Path) -> directory (Path): Path to the directory containing the pdf files. out_directory (Path): Path to the output directory where the images are saved. """ + if directory.is_file(): # deal with the case when we pass a file instead of a directory + directory = directory.parent for file in predictions: logger.info(f"Evaluating {file}.") with fitz.Document(directory / file) as doc: From b549d0ff91e647194fb23afdb7c86a94d444b40e Mon Sep 17 00:00:00 2001 From: Renato Durrer <renato.durrer@visium.ch> Date: Thu, 4 Apr 2024 08:11:26 +0200 Subject: [PATCH 3/8] suggestion for rectangle correction with lines. 
--- src/stratigraphy/extract.py | 42 ++++++++++++++++++++++++++++ src/stratigraphy/util/dataclasses.py | 3 ++ 2 files changed, 45 insertions(+) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 9cd622ae..5e312730 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -84,6 +84,9 @@ def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict] for depth_column in depth_columns: material_description_rect = find_material_description_column(lines, depth_column) if material_description_rect: + material_description_rect = adjust_material_description_rect( + material_description_rect, page.rect.width, geometric_lines + ) pairs.append((depth_column, material_description_rect)) # lowest score first @@ -125,6 +128,9 @@ def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict] # Fallback when no depth column was found material_description_rect = find_material_description_column(lines, depth_column=None) if material_description_rect: + material_description_rect = adjust_material_description_rect( + material_description_rect, page.rect.width, geometric_lines + ) description_lines = get_description_lines(lines, material_description_rect) description_blocks = get_description_blocks( description_lines, @@ -456,6 +462,42 @@ def is_below(best_x0, best_y1, line): return candidate_rects[0] +def adjust_material_description_rect( + material_description_rect: fitz.Rect, page_width: float, geometric_lines: list[Line] +) -> fitz.Rect: + """Adjust the material description rectangle based on the geometric lines. + + Tries to adjust the material description rectangle to the top until a long solid line is reached. 
+ + Args: + material_description_rect (fitz.Rect): _description_ + page_width (float): _description_ + geometric_lines (list[Line]): _description_ + + Returns: + fitz.Rect: _description_ + """ + material_description_rect_top = material_description_rect.y0 + max_line_y = 0 + for line in geometric_lines: + if ( + line.length() > 0.7 * material_description_rect.width + and (line.start.y + line.end.y) / 2 < material_description_rect_top + 5 + ): + max_line_y = max(max_line_y, (line.start.y + line.end.y) / 2) - 5 + + if max_line_y > material_description_rect_top: + new_coordinates = [ + material_description_rect.x0, + max_line_y, + material_description_rect.x1, + material_description_rect.y1, + ] + return fitz.Rect(*new_coordinates) + else: + return material_description_rect + + def perform_matching(directory: Path, **params: dict) -> dict: """Perform the matching of text blocks with depth intervals. diff --git a/src/stratigraphy/util/dataclasses.py b/src/stratigraphy/util/dataclasses.py index 48436128..b787743b 100644 --- a/src/stratigraphy/util/dataclasses.py +++ b/src/stratigraphy/util/dataclasses.py @@ -47,6 +47,9 @@ def distance_to(self, point: Point) -> float: - (self.start.x - point.x) * (self.end.y - self.start.y) ) / np.sqrt((self.end.x - self.start.x) ** 2 + (self.end.y - self.start.y) ** 2) + def length(self) -> float: + return np.sqrt((self.end.x - self.start.x) ** 2 + (self.end.y - self.start.y) ** 2) + def slope(self) -> float: return (self.end.y - self.start.y) / (self.end.x - self.start.x) if self.end.x - self.start.x != 0 else np.inf From 1edbb7975b39a389eefefaad8676f149acdeb05b Mon Sep 17 00:00:00 2001 From: Renato Durrer <renato.durrer@visium.ch> Date: Thu, 4 Apr 2024 14:24:29 +0200 Subject: [PATCH 4/8] refactoring evaluate_matching. 
--- src/stratigraphy/benchmark/score.py | 30 ++++++++--------------------- src/stratigraphy/line_detection.py | 2 +- src/stratigraphy/main.py | 16 +++++++++------ 3 files changed, 19 insertions(+), 29 deletions(-) diff --git a/src/stratigraphy/benchmark/score.py b/src/stratigraphy/benchmark/score.py index 7a552091..a655bc7e 100644 --- a/src/stratigraphy/benchmark/score.py +++ b/src/stratigraphy/benchmark/score.py @@ -1,6 +1,5 @@ """Evaluate the predictions against the ground truth.""" -import json import logging import os from pathlib import Path @@ -9,7 +8,6 @@ from dotenv import load_dotenv from stratigraphy import DATAPATH from stratigraphy.benchmark.ground_truth import GroundTruth -from stratigraphy.util.draw import draw_predictions from stratigraphy.util.util import parse_text load_dotenv() @@ -56,34 +54,20 @@ def f1(precision: float, recall: float) -> float: return 0 -def evaluate_matching( - predictions_path: Path, ground_truth_path: Path, directory: Path, out_directory: Path, skip_draw_predictions: bool -) -> tuple[dict, pd.DataFrame]: +def evaluate_matching(predictions: dict, number_of_truth_values: dict) -> tuple[dict, pd.DataFrame]: """Calculate F1, precision and recall for the predictions. Calculate F1, precision and recall for the individual documents as well as overall. The individual document metrics are returned as a DataFrame. Args: - predictions_path (Path): Path to the predictions.json file. - ground_truth_path (Path): Path to the ground truth annotated data. - directory (Path): Path to the directory containing the pdf files. - out_directory (Path): Path to the directory where the evaluation images should be saved. - skip_draw_predictions (bool): Whether to draw the predictions on the pdf pages. + predictions (dict): The predictions. + number_of_truth_values (dict): The number of ground truth values per file. 
Returns: tuple[dict, pd.DataFrame]: A tuple containing the overall F1, precision and recall as a dictionary and the individual document metrics as a DataFrame. """ - ground_truth = GroundTruth(ground_truth_path) - with open(predictions_path) as in_file: - predictions = json.load(in_file) - - predictions, number_of_truth_values = _add_ground_truth_to_predictions(predictions, ground_truth) - - if not skip_draw_predictions: - draw_predictions(predictions, directory, out_directory) - document_level_metrics = { "document_name": [], "F1": [], @@ -137,16 +121,18 @@ def evaluate_matching( }, pd.DataFrame(document_level_metrics) -def _add_ground_truth_to_predictions(predictions: dict, ground_truth: GroundTruth) -> (dict, dict): +def add_ground_truth_to_predictions(predictions: dict, ground_truth_path: Path) -> tuple[dict, dict]: """Add the ground truth to the predictions. Args: predictions (dict): The predictions. - ground_truth (GroundTruth): The ground truth. + ground_truth_path (Path): The path to the ground truth file. Returns: - (dict, dict): The predictions with the ground truth added, and the number of ground truth values per file. + tuple[dict, dict]: The predictions with the ground truth added, and the number of ground truth values per file. """ + ground_truth = GroundTruth(ground_truth_path) + number_of_truth_values = {} for file, file_predictions in predictions.items(): ground_truth_for_file = ground_truth.for_file(file) diff --git a/src/stratigraphy/line_detection.py b/src/stratigraphy/line_detection.py index 98e323f3..a5870cff 100644 --- a/src/stratigraphy/line_detection.py +++ b/src/stratigraphy/line_detection.py @@ -87,7 +87,7 @@ def extract_lines(page: fitz.Page, line_detection_params: dict) -> list[Line]: return lines -def draw_lines_on_pdfs(filename: str, page: fitz.Page, geometric_lines: list[Line]): +def draw_lines_on_page(filename: str, page: fitz.Page, geometric_lines: list[Line]): """Draw lines on pdf pages and stores them as artifacts in mlflow. 
Note: now the function draw_lines_on_pdfs may not even be needed any more. diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index 03005d17..e61c7222 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -10,9 +10,10 @@ from dotenv import load_dotenv from stratigraphy import DATAPATH -from stratigraphy.benchmark.score import evaluate_matching +from stratigraphy.benchmark.score import add_ground_truth_to_predictions, evaluate_matching from stratigraphy.extract import process_page -from stratigraphy.line_detection import draw_lines_on_pdfs, extract_lines, line_detection_params +from stratigraphy.line_detection import draw_lines_on_page, extract_lines, line_detection_params +from stratigraphy.util.draw import draw_predictions from stratigraphy.util.util import flatten, read_params load_dotenv() @@ -132,15 +133,18 @@ def start_pipeline( if draw_lines: logger.info("Drawing lines on pdf pages.") - draw_lines_on_pdfs(filename, page, geometric_lines) + draw_lines_on_page(filename, page, geometric_lines) with open(predictions_path, "w") as file: file.write(json.dumps(predictions)) # evaluate the predictions - metrics, document_level_metrics = evaluate_matching( - predictions_path, ground_truth_path, input_directory, out_directory, skip_draw_predictions - ) + predictions, number_of_truth_values = add_ground_truth_to_predictions(predictions, ground_truth_path) + + if not skip_draw_predictions: + draw_predictions(predictions, input_directory, out_directory) + + metrics, document_level_metrics = evaluate_matching(predictions, number_of_truth_values) document_level_metrics.to_csv(temp_directory / "document_level_metrics.csv") # mlflow.log_artifact expects a file if mlflow_tracking: From ebe0bac5ba58fdb4e59f41792eaaf5486a42f5bc Mon Sep 17 00:00:00 2001 From: Renato Durrer <renato.durrer@visium.ch> Date: Thu, 4 Apr 2024 14:27:10 +0200 Subject: [PATCH 5/8] Undo mistaken commit. 
--- src/stratigraphy/extract.py | 42 ---------------------------- src/stratigraphy/util/dataclasses.py | 3 -- 2 files changed, 45 deletions(-) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 5e312730..9cd622ae 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -84,9 +84,6 @@ def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict] for depth_column in depth_columns: material_description_rect = find_material_description_column(lines, depth_column) if material_description_rect: - material_description_rect = adjust_material_description_rect( - material_description_rect, page.rect.width, geometric_lines - ) pairs.append((depth_column, material_description_rect)) # lowest score first @@ -128,9 +125,6 @@ def process_page(page: fitz.Page, geometric_lines, **params: dict) -> list[dict] # Fallback when no depth column was found material_description_rect = find_material_description_column(lines, depth_column=None) if material_description_rect: - material_description_rect = adjust_material_description_rect( - material_description_rect, page.rect.width, geometric_lines - ) description_lines = get_description_lines(lines, material_description_rect) description_blocks = get_description_blocks( description_lines, @@ -462,42 +456,6 @@ def is_below(best_x0, best_y1, line): return candidate_rects[0] -def adjust_material_description_rect( - material_description_rect: fitz.Rect, page_width: float, geometric_lines: list[Line] -) -> fitz.Rect: - """Adjust the material description rectangle based on the geometric lines. - - Tries to adjust the material description rectangle to the top until a long solid line is reached. 
- - Args: - material_description_rect (fitz.Rect): _description_ - page_width (float): _description_ - geometric_lines (list[Line]): _description_ - - Returns: - fitz.Rect: _description_ - """ - material_description_rect_top = material_description_rect.y0 - max_line_y = 0 - for line in geometric_lines: - if ( - line.length() > 0.7 * material_description_rect.width - and (line.start.y + line.end.y) / 2 < material_description_rect_top + 5 - ): - max_line_y = max(max_line_y, (line.start.y + line.end.y) / 2) - 5 - - if max_line_y > material_description_rect_top: - new_coordinates = [ - material_description_rect.x0, - max_line_y, - material_description_rect.x1, - material_description_rect.y1, - ] - return fitz.Rect(*new_coordinates) - else: - return material_description_rect - - def perform_matching(directory: Path, **params: dict) -> dict: """Perform the matching of text blocks with depth intervals. diff --git a/src/stratigraphy/util/dataclasses.py b/src/stratigraphy/util/dataclasses.py index b787743b..48436128 100644 --- a/src/stratigraphy/util/dataclasses.py +++ b/src/stratigraphy/util/dataclasses.py @@ -47,9 +47,6 @@ def distance_to(self, point: Point) -> float: - (self.start.x - point.x) * (self.end.y - self.start.y) ) / np.sqrt((self.end.x - self.start.x) ** 2 + (self.end.y - self.start.y) ** 2) - def length(self) -> float: - return np.sqrt((self.end.x - self.start.x) ** 2 + (self.end.y - self.start.y) ** 2) - def slope(self) -> float: return (self.end.y - self.start.y) / (self.end.x - self.start.x) if self.end.x - self.start.x != 0 else np.inf From 2166ad13e1bb761e374af1d9d25878ff90ec6f19 Mon Sep 17 00:00:00 2001 From: Renato Durrer <renato.durrer@visium.ch> Date: Thu, 4 Apr 2024 15:19:39 +0200 Subject: [PATCH 6/8] Minor refactoring --- src/stratigraphy/main.py | 1 - src/stratigraphy/util/draw.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index e61c7222..f7213a6f 100644 --- 
a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -130,7 +130,6 @@ def start_pipeline( "layers": layer_predictions, "depths_materials_column_pairs": depths_materials_column_pairs, } - if draw_lines: logger.info("Drawing lines on pdf pages.") draw_lines_on_page(filename, page, geometric_lines) diff --git a/src/stratigraphy/util/draw.py b/src/stratigraphy/util/draw.py index 948bd430..c5a814f4 100644 --- a/src/stratigraphy/util/draw.py +++ b/src/stratigraphy/util/draw.py @@ -31,14 +31,13 @@ def draw_predictions(predictions: dict, directory: Path, out_directory: Path) -> - Assignments of material description text blocks to depth intervals (if available) Args: - predictions (dict): Content of the predictions.json file.. + predictions (dict): Content of the predictions.json file. directory (Path): Path to the directory containing the pdf files. out_directory (Path): Path to the output directory where the images are saved. """ if directory.is_file(): # deal with the case when we pass a file instead of a directory directory = directory.parent for file in predictions: - logger.info(f"Evaluating {file}.") with fitz.Document(directory / file) as doc: for page_index, page in enumerate(doc): page_number = page_index + 1 From d32976b11860d0dcb12cbc3d5163f9dea9a5c81c Mon Sep 17 00:00:00 2001 From: Renato Durrer <renato.durrer@visium.ch> Date: Fri, 5 Apr 2024 10:33:09 +0200 Subject: [PATCH 7/8] remove unused function. Remove draw_lines and implement it in main.py directly. 
--- src/stratigraphy/extract.py | 34 ------------------------------ src/stratigraphy/line_detection.py | 19 ----------------- src/stratigraphy/main.py | 16 ++++++++++---- 3 files changed, 12 insertions(+), 57 deletions(-) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 9cd622ae..cfd44633 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -2,8 +2,6 @@ import logging import math -import os -from pathlib import Path import fitz @@ -454,35 +452,3 @@ def is_below(best_x0, best_y1, line): return max(candidate_rects, key=lambda rect: score_column_match(depth_column, rect)) else: return candidate_rects[0] - - -def perform_matching(directory: Path, **params: dict) -> dict: - """Perform the matching of text blocks with depth intervals. - - Args: - directory (Path): Path to the directory that contains the pdfs. - **params (dict): Additional parameters for the matching pipeline. - - Returns: - dict: The predictions. - """ - for root, _dirs, files in os.walk(directory): - output = {} - for filename in files: - if filename.endswith(".pdf"): - in_path = os.path.join(root, filename) - logger.info("Processing file: %s", in_path) - output[filename] = {} - - with fitz.Document(in_path) as doc: - for page_index, page in enumerate(doc): - page_number = page_index + 1 - logger.info("Processing page %s", page_number) - - predictions, depths_materials_column_pairs = process_page(page, **params) - - output[filename][f"page_{page_number}"] = { - "layers": predictions, - "depths_materials_column_pairs": depths_materials_column_pairs, - } - return output diff --git a/src/stratigraphy/line_detection.py b/src/stratigraphy/line_detection.py index a5870cff..6f9aa3b8 100644 --- a/src/stratigraphy/line_detection.py +++ b/src/stratigraphy/line_detection.py @@ -14,7 +14,6 @@ merge_parallel_lines_approximately, merge_parallel_lines_efficiently, ) -from stratigraphy.util.plot_utils import plot_lines from stratigraphy.util.util import 
line_from_array, read_params load_dotenv() @@ -85,21 +84,3 @@ def extract_lines(page: fitz.Page, line_detection_params: dict) -> list[Line]: lines, tol=merging_params["merging_tolerance"], angle_threshold=merging_params["angle_threshold"] ) return lines - - -def draw_lines_on_page(filename: str, page: fitz.Page, geometric_lines: list[Line]): - """Draw lines on pdf pages and stores them as artifacts in mlflow. - - Note: now the function draw_lines_on_pdfs may not even be needed any more. - - Args: - filename (str): The filename of the pdf. - page (fitz.Page): The page to draw lines on. - geometric_lines (list[Line]): The lines to draw on the pdf page. - """ - if not mlflow_tracking: - raise Warning("MLFlow tracking is not enabled. MLFLow is required to store the images.") - import mlflow - - img = plot_lines(page, geometric_lines, scale_factor=line_detection_params["pdf_scale_factor"]) - mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png") diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index f7213a6f..9856de9e 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -12,8 +12,9 @@ from stratigraphy import DATAPATH from stratigraphy.benchmark.score import add_ground_truth_to_predictions, evaluate_matching from stratigraphy.extract import process_page -from stratigraphy.line_detection import draw_lines_on_page, extract_lines, line_detection_params +from stratigraphy.line_detection import extract_lines, line_detection_params from stratigraphy.util.draw import draw_predictions +from stratigraphy.util.plot_utils import plot_lines from stratigraphy.util.util import flatten, read_params load_dotenv() @@ -130,9 +131,16 @@ def start_pipeline( "layers": layer_predictions, "depths_materials_column_pairs": depths_materials_column_pairs, } - if draw_lines: - logger.info("Drawing lines on pdf pages.") - draw_lines_on_page(filename, page, geometric_lines) + if draw_lines: # could be changed to if draw_lines and 
mlflow_tracking: + if not mlflow_tracking: + logger.warning( + "MLFlow tracking is not enabled. MLFLow is required to store the images." + ) + else: + img = plot_lines( + page, geometric_lines, scale_factor=line_detection_params["pdf_scale_factor"] + ) + mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png") with open(predictions_path, "w") as file: file.write(json.dumps(predictions)) From 3421484361f00e7b0ea1d478f527bf94ec190583 Mon Sep 17 00:00:00 2001 From: Renato Durrer <renato.durrer@visium.ch> Date: Fri, 5 Apr 2024 11:35:17 +0200 Subject: [PATCH 8/8] Update comments. --- src/stratigraphy/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index 9856de9e..d9dc239a 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -145,7 +145,7 @@ def start_pipeline( with open(predictions_path, "w") as file: file.write(json.dumps(predictions)) - # evaluate the predictions + # evaluate the predictions; if the file doesn't exist, the predictions are not changed. predictions, number_of_truth_values = add_ground_truth_to_predictions(predictions, ground_truth_path) if not skip_draw_predictions: