Feat/configure pipeline with arguments #21

Merged · 20 commits · Apr 5, 2024
README.md: 6 additions & 7 deletions
@@ -40,18 +40,18 @@ To execute the data extraction pipeline, follow these steps:

`conda activate boreholes-dev`

2. **Run the main script**
2. **Run the extraction script**

The main script for the extraction pipeline is located at `src/stratigraphy/main.py`. Run this script to start the extraction process.
The main script for the extraction pipeline is located at `src/stratigraphy/main.py`. A CLI command is provided to run this script.

This script will source all PDFs from the `data/Benchmark` directory and create PNG files in the `data/Benchmark/extract` directory.
Run `boreholes-extract-layers` to start the extraction. With the default options, the command sources all PDFs from the `data/Benchmark` directory and creates PNG files in the `data/Benchmark/extract` directory.

Use `boreholes-extract-layers --help` to see all options for the extraction script; a sketch of how such a command might be defined is shown after these steps.

3. **Check the results**

Once the script has finished running, you can check the results in the `data/Benchmark/extract` directory. The result is a `predictions.json` file as well as a png file for each page of each PDF in the `data/Benchmark` directory.

Please note that for now the pipeline assumes that all PDF files to be analyzed are placed in the `data/Benchmark` directory. If you want to analyze different files, please place them in this directory.
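To make the new command-line interface more concrete, below is a minimal sketch of how a `click`-based `start_pipeline` command might look in `src/stratigraphy/main.py`. The option names and defaults (`--input-directory`, `--out-directory`) are assumptions for illustration only; use `boreholes-extract-layers --help` to see the real options.

```python
# Hypothetical sketch of the click entry point; option names are illustrative,
# not the project's actual flags.
from pathlib import Path

import click


@click.command()
@click.option(
    "--input-directory",
    type=click.Path(exists=True, path_type=Path),
    default=Path("data/Benchmark"),
    help="Directory containing the PDF files to analyze.",
)
@click.option(
    "--out-directory",
    type=click.Path(path_type=Path),
    default=Path("data/Benchmark/extract"),
    help="Directory where predictions.json and the PNG files are written.",
)
def start_pipeline(input_directory: Path, out_directory: Path) -> None:
    """Run the borehole layer extraction pipeline."""
    # Source the PDFs, run the extraction, and write the outputs.
    ...
```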

### Output Structure
The `predictions.json` file contains the results of the data extraction process from the PDF files. Each key in the JSON object is the name of a PDF file, and the value is a list of extracted items, each represented as a dictionary-like object. For now, the extracted items are the material descriptions in their correct order (as given by their depths).
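As a rough illustration of this structure, the snippet below loads the file and iterates over its entries. The per-item field names (`material_description`, `depth_interval`) and the file location are assumptions, not a documented schema; inspect your own `predictions.json` for the actual keys.

```python
# Illustrative only: field names inside each extracted item are assumed.
import json
from pathlib import Path

predictions_path = Path("data/Benchmark/extract/predictions.json")  # assumed location
with open(predictions_path) as in_file:
    predictions = json.load(in_file)  # {"<pdf name>": [<extracted item>, ...], ...}

for pdf_name, items in predictions.items():
    print(pdf_name)
    for item in items:
        # Hypothetical keys, shown only to sketch the shape of the data.
        print(item.get("material_description"), item.get("depth_interval"))
```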

@@ -149,7 +149,7 @@ The project structure and the most important files are as follows:
- `src/` : The source code of the project.
- `stratigraphy/` : The main package of the project.
- `main.py` : The main script of the project. This script runs the data extraction pipeline.
- `line_detection.py`: This script runs the line detection on provided sample pdfs. Will be deprecated in the future.
- `line_detection.py`: Contains functionality for line detection on PDF pages.
- `util/` : Utility scripts and modules.
- `benchmark/` : Scripts to evaluate the data extraction.
- `data/` : The data used by the project.
@@ -164,7 +164,6 @@ The project structure and the most important files are as follows:

- `main.py` : This is the main script of the project. It runs the data extraction pipeline, which analyzes the PDF files in the `data/Benchmark` directory and saves the results in the `predictions.json` file.

- `line_detection.py` : Runs the line detection algorithm on pdfs using `lsd` from opencv. It is meant to find all lines that potentially separate two material descriptions. It is incorporated in the script `main.py` and will be deprecated as a standalone script in the future.
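For context on the line detection mentioned above, here is a minimal sketch of OpenCV's LSD detector applied to an already rendered page image; the real `line_detection.py` may preprocess, parameterize, and filter the segments differently.

```python
# Minimal LSD sketch; assumes the PDF page has already been rendered to a BGR image.
import cv2
import numpy as np


def detect_line_segments(page_image: np.ndarray) -> np.ndarray:
    """Return detected line segments as an (N, 4) array of x1, y1, x2, y2."""
    gray = cv2.cvtColor(page_image, cv2.COLOR_BGR2GRAY)
    lsd = cv2.createLineSegmentDetector()
    lines, _widths, _precisions, _nfas = lsd.detect(gray)
    if lines is None:
        return np.empty((0, 4))
    return lines.reshape(-1, 4)
```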

## Experiment Tracking
We perform experiment tracking using MLFlow. Each developer has their own local MLFlow instance.
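As a minimal sketch of what a tracked run against such a local instance could look like (the experiment name and logged values are placeholders, not the pipeline's actual ones):

```python
# Placeholder experiment name, parameters, and metrics; adjust to the real pipeline.
import mlflow

mlflow.set_tracking_uri("file:./mlruns")  # local, per-developer tracking store
mlflow.set_experiment("boreholes-extraction")

with mlflow.start_run():
    mlflow.log_param("input_directory", "data/Benchmark")
    mlflow.log_metric("f1", 0.0)  # e.g. the overall F1 from the evaluation
```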
environment-dev.yml: 2 additions & 1 deletion
@@ -10,14 +10,15 @@ dependencies:
- pathlib==1.0.1
- opencv==4.9.0
- python-dotenv==1.0.1
- pytest==8.1.1
- click==8.1.7
- pip
# dev dependencies
- matplotlib==3.8.0
- isort==5.13.2
- jupyterlab==4.1.3
- black==24.2.0
- pre-commit==3.6.2
- pytest==8.1.1
- pip:
# prod pip dependencies; needs to be a strict copy of environment-prod.yml
- amazon-textract-textractor
environment-prod.yml: 1 addition & 0 deletions
@@ -9,6 +9,7 @@ dependencies:
- pathlib==1.0.1
- opencv==4.9.0
- python-dotenv==1.0.1
- click==8.1.7
- pip
- pip:
- amazon-textract-textractor
pyproject.toml: 3 additions & 0 deletions
@@ -7,6 +7,9 @@ requires-python = ">=3.10"
dependencies = [
]

[project.scripts]
boreholes-extract-layers = "stratigraphy.main:start_pipeline"

[tool.ruff.lint]
select = [
# pydocstyle
src/stratigraphy/benchmark/score.py: 8 additions & 20 deletions
@@ -1,6 +1,5 @@
"""Evaluate the predictions against the ground truth."""

import json
import logging
import os
from pathlib import Path
@@ -9,7 +8,6 @@
from dotenv import load_dotenv
from stratigraphy import DATAPATH
from stratigraphy.benchmark.ground_truth import GroundTruth
from stratigraphy.util.draw import draw_predictions
from stratigraphy.util.util import parse_text

load_dotenv()
@@ -56,32 +54,20 @@ def f1(precision: float, recall: float) -> float:
return 0


def evaluate_matching(
predictions_path: Path, ground_truth_path: Path, directory: Path, out_directory: Path
) -> tuple[dict, pd.DataFrame]:
def evaluate_matching(predictions: dict, number_of_truth_values: dict) -> tuple[dict, pd.DataFrame]:
"""Calculate F1, precision and recall for the predictions.

Calculate F1, precision and recall for the individual documents as well as overall.
The individual document metrics are returned as a DataFrame.

Args:
predictions_path (Path): Path to the predictions.json file.
ground_truth_path (Path): Path to the ground truth annotated data.
directory (Path): Path to the directory containing the pdf files.
out_directory (Path): Path to the directory where the evaluation images should be saved.
predictions (dict): The predictions.
number_of_truth_values (dict): The number of ground truth values per file.

Returns:
tuple[dict, pd.DataFrame]: A tuple containing the overall F1, precision and recall as a dictionary and the
individual document metrics as a DataFrame.
"""
ground_truth = GroundTruth(ground_truth_path)
with open(predictions_path) as in_file:
predictions = json.load(in_file)

predictions, number_of_truth_values = _add_ground_truth_to_predictions(predictions, ground_truth)

draw_predictions(predictions, directory, out_directory)

document_level_metrics = {
"document_name": [],
"F1": [],
@@ -135,16 +121,18 @@ def evaluate_matching(
}, pd.DataFrame(document_level_metrics)


def _add_ground_truth_to_predictions(predictions: dict, ground_truth: GroundTruth) -> (dict, dict):
def add_ground_truth_to_predictions(predictions: dict, ground_truth_path: Path) -> tuple[dict, dict]:
"""Add the ground truth to the predictions.

Args:
predictions (dict): The predictions.
ground_truth (GroundTruth): The ground truth.
ground_truth_path (Path): The path to the ground truth file.

Returns:
(dict, dict): The predictions with the ground truth added, and the number of ground truth values per file.
tuple[dict, dict]: The predictions with the ground truth added, and the number of ground truth values per file.
"""
ground_truth = GroundTruth(ground_truth_path)

number_of_truth_values = {}
for file, file_predictions in predictions.items():
ground_truth_for_file = ground_truth.for_file(file)
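With this refactor, loading the predictions, attaching the ground truth, and drawing the evaluation images move out of `evaluate_matching` and into the caller (presumably `main.py`). A hedged sketch of how the pieces might be wired together, with the file paths as assumptions:

```python
# Sketch only: paths and the call site are assumptions; function names and
# signatures follow the refactored score.py shown above.
import json
from pathlib import Path

from stratigraphy.benchmark.score import add_ground_truth_to_predictions, evaluate_matching
from stratigraphy.util.draw import draw_predictions

predictions_path = Path("data/Benchmark/extract/predictions.json")  # assumed
ground_truth_path = Path("data/Benchmark/ground_truth.json")        # assumed
pdf_directory = Path("data/Benchmark")
out_directory = Path("data/Benchmark/extract")

with open(predictions_path) as in_file:
    predictions = json.load(in_file)

# Ground truth is now attached outside evaluate_matching.
predictions, number_of_truth_values = add_ground_truth_to_predictions(predictions, ground_truth_path)

# Drawing the per-page evaluation images also happens at the call site now.
draw_predictions(predictions, pdf_directory, out_directory)

# Overall metrics (F1 = 2 * precision * recall / (precision + recall)) plus a
# per-document DataFrame.
metrics, document_level_metrics = evaluate_matching(predictions, number_of_truth_values)
print(metrics)
```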