From f8d6db88eb79d6eb04975aa42169297f2bcc16a2 Mon Sep 17 00:00:00 2001
From: Renato Durrer <renato.durrer@visium.ch>
Date: Wed, 3 Apr 2024 10:27:39 +0200
Subject: [PATCH] Remove standalone script entry point from line_detection

---
 README.md                          |  5 +----
 src/stratigraphy/line_detection.py | 36 +-----------------------------
 2 files changed, 2 insertions(+), 39 deletions(-)

diff --git a/README.md b/README.md
index 50764cf6..61155a08 100644
--- a/README.md
+++ b/README.md
@@ -52,8 +52,6 @@ To execute the data extraction pipeline, follow these steps:
 
     Once the script has finished running, you can check the results in the `data/Benchmark/extract` directory. The result is a `predictions.json` file as well as a png file for each page of each PDF in the `data/Benchmark` directory.
 
-Please note that for now the pipeline assumes that all PDF files to be analyzed are placed in the `data/Benchmark` directory. If you want to analyze different files, please place them in this directory.
-
 ### Output Structure
 The `predictions.json` file contains the results of a data extraction process from PDF files. Each key in the JSON object is the name of a PDF file, and the value is a list of extracted items in a dictionary like object. The extracted items for now are the material descriptions in their correct order (given by their depths).
 
@@ -151,7 +149,7 @@ The project structure and the most important files are as follows:
   - `src/` : The source code of the project.
     - `stratigraphy/` : The main package of the project.
       - `main.py` : The main script of the project. This script runs the data extraction pipeline.
-      - `line_detection.py`: This script runs the line detection on provided sample pdfs. Will be deprecated in the future.
+      - `line_detection.py`: Contains functionality for line detection on PDF pages.
       - `util/` : Utility scripts and modules.
       - `benchmark/` : Scripts to evaluate the data extraction.
   - `data/` : The data used by the project.
@@ -166,7 +164,6 @@ The project structure and the most important files are as follows:
 
 - `main.py` : This is the main script of the project. It runs the data extraction pipeline, which analyzes the PDF files in the `data/Benchmark` directory and saves the results in the `predictions.json` file.
 
-- `line_detection.py` : Runs the line detection algorithm on pdfs using `lsd` from opencv. It is meant to find all lines that potentially separate two material descriptions. It is incorporated in the script `main.py` and will be deprecated as a standalone script in the future.
 
 ## Experiment Tracking
 We perform experiment tracking using MLFlow. Each developer has his own local MLFlow instance. 
diff --git a/src/stratigraphy/line_detection.py b/src/stratigraphy/line_detection.py
index e99e309c..6b95b6a3 100644
--- a/src/stratigraphy/line_detection.py
+++ b/src/stratigraphy/line_detection.py
@@ -9,7 +9,6 @@
 from dotenv import load_dotenv
 from numpy.typing import ArrayLike
 
-from stratigraphy import DATAPATH
 from stratigraphy.util.dataclasses import Line
 from stratigraphy.util.geometric_line_utilities import (
     drop_vertical_lines,
@@ -17,7 +16,7 @@
     merge_parallel_lines_efficiently,
 )
 from stratigraphy.util.plot_utils import plot_lines
-from stratigraphy.util.util import flatten, line_from_array, read_params
+from stratigraphy.util.util import line_from_array, read_params
 
 load_dotenv()
 
@@ -111,36 +110,3 @@ def draw_lines_on_pdfs(input_directory: Path, line_detection_params: dict):
                             import mlflow
 
                             mlflow.log_image(img, f"pages/{filename}_page_{page_index}_lines.png")
-
-
-if __name__ == "__main__":
-    # Some test pdfs
-    selected_pdfs = [
-        "270124083-bp.pdf",
-        "268124307-bp.pdf",
-        "268125268-bp.pdf",
-        "267125378-bp.pdf",
-        "268124435-bp.pdf",
-        "267123060-bp.pdf",
-        "268124635-bp.pdf",
-        "675230002-bp.pdf",
-        "268125592-bp.pdf",
-        "267124070-bp.pdf",
-        "699248001-bp.pdf",
-    ]
-
-    if mlflow_tracking:
-        import mlflow
-
-        mlflow.set_experiment("LineDetection")
-        mlflow.start_run()
-        mlflow.log_params(flatten(line_detection_params))
-    lines = {}
-    for pdf in selected_pdfs:
-        doc = fitz.open(DATAPATH / "Benchmark" / pdf)
-
-        for page in doc:
-            lines[pdf] = extract_lines(page, line_detection_params)
-            img = plot_lines(page, lines[pdf], scale_factor=line_detection_params["pdf_scale_factor"])
-            if mlflow_tracking:
-                mlflow.log_image(img, f"lines_{pdf}.png")