From f8d6db88eb79d6eb04975aa42169297f2bcc16a2 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Wed, 3 Apr 2024 10:27:39 +0200 Subject: [PATCH] Remove line_detection script --- README.md | 5 +---- src/stratigraphy/line_detection.py | 36 +----------------------------- 2 files changed, 2 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 50764cf6..61155a08 100644 --- a/README.md +++ b/README.md @@ -52,8 +52,6 @@ To execute the data extraction pipeline, follow these steps: Once the script has finished running, you can check the results in the `data/Benchmark/extract` directory. The result is a `predictions.json` file as well as a png file for each page of each PDF in the `data/Benchmark` directory. -Please note that for now the pipeline assumes that all PDF files to be analyzed are placed in the `data/Benchmark` directory. If you want to analyze different files, please place them in this directory. - ### Output Structure The `predictions.json` file contains the results of a data extraction process from PDF files. Each key in the JSON object is the name of a PDF file, and the value is a list of extracted items in a dictionary like object. The extracted items for now are the material descriptions in their correct order (given by their depths). @@ -151,7 +149,7 @@ The project structure and the most important files are as follows: - `src/` : The source code of the project. - `stratigraphy/` : The main package of the project. - `main.py` : The main script of the project. This script runs the data extraction pipeline. - - `line_detection.py`: This script runs the line detection on provided sample pdfs. Will be deprecated in the future. + - `line_detection.py`: Contains functionality for line detection on PDF pages. - `util/` : Utility scripts and modules. - `benchmark/` : Scripts to evaluate the data extraction. - `data/` : The data used by the project. 
@@ -166,7 +164,6 @@ The project structure and the most important files are as follows: - `main.py` : This is the main script of the project. It runs the data extraction pipeline, which analyzes the PDF files in the `data/Benchmark` directory and saves the results in the `predictions.json` file. -- `line_detection.py` : Runs the line detection algorithm on pdfs using `lsd` from opencv. It is meant to find all lines that potentially separate two material descriptions. It is incorporated in the script `main.py` and will be deprecated as a standalone script in the future. ## Experiment Tracking We perform experiment tracking using MLFlow. Each developer has his own local MLFlow instance. diff --git a/src/stratigraphy/line_detection.py b/src/stratigraphy/line_detection.py index e99e309c..6b95b6a3 100644 --- a/src/stratigraphy/line_detection.py +++ b/src/stratigraphy/line_detection.py @@ -9,7 +9,6 @@ from dotenv import load_dotenv from numpy.typing import ArrayLike -from stratigraphy import DATAPATH from stratigraphy.util.dataclasses import Line from stratigraphy.util.geometric_line_utilities import ( drop_vertical_lines, @@ -17,7 +16,7 @@ merge_parallel_lines_efficiently, ) from stratigraphy.util.plot_utils import plot_lines -from stratigraphy.util.util import flatten, line_from_array, read_params +from stratigraphy.util.util import line_from_array, read_params load_dotenv() @@ -111,36 +110,3 @@ def draw_lines_on_pdfs(input_directory: Path, line_detection_params: dict): import mlflow mlflow.log_image(img, f"pages/{filename}_page_{page_index}_lines.png") - - -if __name__ == "__main__": - # Some test pdfs - selected_pdfs = [ - "270124083-bp.pdf", - "268124307-bp.pdf", - "268125268-bp.pdf", - "267125378-bp.pdf", - "268124435-bp.pdf", - "267123060-bp.pdf", - "268124635-bp.pdf", - "675230002-bp.pdf", - "268125592-bp.pdf", - "267124070-bp.pdf", - "699248001-bp.pdf", - ] - - if mlflow_tracking: - import mlflow - - mlflow.set_experiment("LineDetection") - mlflow.start_run() - 
mlflow.log_params(flatten(line_detection_params)) - lines = {} - for pdf in selected_pdfs: - doc = fitz.open(DATAPATH / "Benchmark" / pdf) - - for page in doc: - lines[pdf] = extract_lines(page, line_detection_params) - img = plot_lines(page, lines[pdf], scale_factor=line_detection_params["pdf_scale_factor"]) - if mlflow_tracking: - mlflow.log_image(img, f"lines_{pdf}.png")