
Commit

Merge branch 'main' into feat/fix_material_descr_rect
redur committed Apr 8, 2024
2 parents 871ea6f + df7f9f6 commit a4554c8
Showing 11 changed files with 635 additions and 560 deletions.
24 changes: 24 additions & 0 deletions .github/workflows/pipeline_run.yml
@@ -0,0 +1,24 @@
name: pipeline_run

on:
pull_request:
push:
branches: [main]

jobs:
pipeline_run:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Create Environment and run pipeline
shell: bash
run: |
wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
bash Miniforge3.sh -b -p "${HOME}/conda"
source "${HOME}/conda/etc/profile.d/conda.sh"
conda init --all
source "${HOME}/.bash_profile"
conda env create -f environment-prod.yml
conda activate boreholes-prod
echo "Running pipeline"
boreholes-extract-layers -l -i example/example_borehole_profile.pdf -o example/ -p example/predictions.json
13 changes: 6 additions & 7 deletions README.md
@@ -40,18 +40,18 @@ To execute the data extraction pipeline, follow these steps:

`conda activate boreholes-dev`

2. **Run the main script**
2. **Run the extraction script**

The main script for the extraction pipeline is located at `src/stratigraphy/main.py`. Run this script to start the extraction process.
The main script for the extraction pipeline is located at `src/stratigraphy/main.py`. A CLI command is provided to run this script.

This script will source all PDFs from the `data/Benchmark` directory and create PNG files in the `data/Benchmark/extract` directory.
Use `boreholes-extract-layers` to run the main extraction script. With the default options, the command sources all PDFs from the `data/Benchmark` directory and creates PNG files in the `data/Benchmark/extract` directory.

Use `boreholes-extract-layers --help` to see all options for the extraction script.

3. **Check the results**

Once the script has finished running, you can check the results in the `data/Benchmark/extract` directory. The output is a `predictions.json` file as well as a PNG file for each page of each PDF in the `data/Benchmark` directory.

Please note that for now the pipeline assumes that all PDF files to be analyzed are placed in the `data/Benchmark` directory. If you want to analyze different files, please place them in this directory.

### Output Structure
The `predictions.json` file contains the results of the data extraction process from PDF files. Each key in the JSON object is the name of a PDF file, and the value is a list of extracted items as dictionary-like objects. For now, the extracted items are the material descriptions in their correct order (as given by their depths).
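The shape described above can be consumed directly with the standard library. A minimal sketch follows; the field name `material_description` and the sample values are illustrative assumptions, not confirmed by this diff:

```python
import json

# A predictions.json-shaped payload; file names and field names are
# illustrative assumptions based on the description above.
sample = json.loads("""
{
  "example_borehole_profile.pdf": [
    {"material_description": "Silty sand, brown"},
    {"material_description": "Gravel with cobbles"}
  ]
}
""")

# Collect the material descriptions per PDF, preserving depth order.
descriptions = {
    pdf_name: [item["material_description"] for item in items]
    for pdf_name, items in sample.items()
}
print(descriptions["example_borehole_profile.pdf"])
```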

@@ -149,7 +149,7 @@ The project structure and the most important files are as follows:
- `src/` : The source code of the project.
- `stratigraphy/` : The main package of the project.
- `main.py` : The main script of the project. This script runs the data extraction pipeline.
- `line_detection.py`: This script runs the line detection on provided sample pdfs. Will be deprecated in the future.
- `line_detection.py`: Contains functionality for line detection on PDF pages.
- `util/` : Utility scripts and modules.
- `benchmark/` : Scripts to evaluate the data extraction.
- `data/` : The data used by the project.
@@ -164,7 +164,6 @@ The project structure and the most important files are as follows:

- `main.py` : This is the main script of the project. It runs the data extraction pipeline, which analyzes the PDF files in the `data/Benchmark` directory and saves the results in the `predictions.json` file.

- `line_detection.py` : Runs the line detection algorithm on pdfs using `lsd` from opencv. It is meant to find all lines that potentially separate two material descriptions. It is incorporated in the script `main.py` and will be deprecated as a standalone script in the future.

## Experiment Tracking
We perform experiment tracking using MLflow. Each developer has their own local MLflow instance.
7 changes: 5 additions & 2 deletions environment-dev.yml
@@ -8,21 +8,24 @@ dependencies:
- pandas
- levenshtein
- pathlib==1.0.1
- opencv==4.9.0
- python-dotenv==1.0.1
- pytest==8.1.1
- click==8.1.7
- PyYAML==6.0.1
- scikit-learn==1.4.0
- pip
# dev dependencies
- matplotlib==3.8.0
- isort==5.13.2
- jupyterlab==4.1.3
- black==24.2.0
- pre-commit==3.6.2
- pytest==8.1.1
- pip:
# prod pip dependencies; needs to be a strict copy of environment-prod.yml
- amazon-textract-textractor
- backoff
- PyMuPDF==1.23.26
- opencv-python-headless
- -e .
# dev pip dependencies
- mlflow==2.11.1
5 changes: 4 additions & 1 deletion environment-prod.yml
@@ -7,11 +7,14 @@ dependencies:
- pandas
- levenshtein
- pathlib==1.0.1
- opencv==4.9.0
- python-dotenv==1.0.1
- scikit-learn==1.4.0
- click==8.1.7
- PyYAML==6.0.1
- pip
- pip:
- amazon-textract-textractor
- backoff
- PyMuPDF==1.23.26
- opencv-python-headless
- -e .
Binary file added example/example_borehole_profile.pdf
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -7,6 +7,9 @@ requires-python = ">=3.10"
dependencies = [
]

[project.scripts]
boreholes-extract-layers = "stratigraphy.main:start_pipeline"
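The `[project.scripts]` entry above maps the `boreholes-extract-layers` command to the `start_pipeline` callable. A minimal sketch of such a callable using click (which appears in the environment files); the short option names are taken from the workflow invocation (`-l -i … -o … -p …`), but their long forms, defaults, and behavior are assumptions:

```python
import click

@click.command()
@click.option("-i", "--input-directory", default="data/Benchmark",
              help="Directory containing the input PDF files.")  # long names are assumptions
@click.option("-o", "--out-directory", default="data/Benchmark/extract",
              help="Directory for the output PNG files.")
@click.option("-p", "--predictions-path", default="data/Benchmark/extract/predictions.json",
              help="Path for the predictions.json output.")
@click.option("-l", "--draw-lines", is_flag=True,
              help="Whether to draw detected lines on the output images.")
def start_pipeline(input_directory: str, out_directory: str,
                   predictions_path: str, draw_lines: bool):
    """Run the borehole data extraction pipeline."""
    click.echo(f"Extracting PDFs from {input_directory} into {out_directory}")
```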

[tool.ruff.lint]
select = [
# pydocstyle
33 changes: 13 additions & 20 deletions src/stratigraphy/benchmark/score.py
@@ -1,6 +1,5 @@
"""Evaluate the predictions against the ground truth."""

import json
import logging
import os
from pathlib import Path
@@ -9,7 +8,6 @@
from dotenv import load_dotenv
from stratigraphy import DATAPATH
from stratigraphy.benchmark.ground_truth import GroundTruth
from stratigraphy.util.draw import draw_predictions
from stratigraphy.util.util import parse_text

load_dotenv()
@@ -56,32 +54,20 @@ def f1(precision: float, recall: float) -> float:
return 0


def evaluate_matching(
predictions_path: Path, ground_truth_path: Path, directory: Path, out_directory: Path
) -> tuple[dict, pd.DataFrame]:
def evaluate_matching(predictions: dict, number_of_truth_values: dict) -> tuple[dict, pd.DataFrame]:
"""Calculate F1, precision and recall for the predictions.
Calculate F1, precision and recall for the individual documents as well as overall.
The individual document metrics are returned as a DataFrame.
Args:
predictions_path (Path): Path to the predictions.json file.
ground_truth_path (Path): Path to the ground truth annotated data.
directory (Path): Path to the directory containing the pdf files.
out_directory (Path): Path to the directory where the evaluation images should be saved.
predictions (dict): The predictions.
number_of_truth_values (dict): The number of ground truth values per file.
Returns:
tuple[dict, pd.DataFrame]: A tuple containing the overall F1, precision and recall as a dictionary and the
individual document metrics as a DataFrame.
"""
ground_truth = GroundTruth(ground_truth_path)
with open(predictions_path) as in_file:
predictions = json.load(in_file)

predictions, number_of_truth_values = _add_ground_truth_to_predictions(predictions, ground_truth)

draw_predictions(predictions, directory, out_directory)

document_level_metrics = {
"document_name": [],
"F1": [],
@@ -135,16 +121,23 @@ def evaluate_matching(
}, pd.DataFrame(document_level_metrics)


def _add_ground_truth_to_predictions(predictions: dict, ground_truth: GroundTruth) -> (dict, dict):
def add_ground_truth_to_predictions(predictions: dict, ground_truth_path: Path) -> tuple[dict, dict]:
"""Add the ground truth to the predictions.
Args:
predictions (dict): The predictions.
ground_truth (GroundTruth): The ground truth.
ground_truth_path (Path): The path to the ground truth file.
Returns:
(dict, dict): The predictions with the ground truth added, and the number of ground truth values per file.
tuple[dict, dict]: The predictions with the ground truth added, and the number of ground truth values per file.
"""
try: # for inference no ground truth is available
ground_truth = GroundTruth(ground_truth_path)

except FileNotFoundError:
logging.warning("Ground truth file not found.")
return predictions, {}

number_of_truth_values = {}
for file, file_predictions in predictions.items():
ground_truth_for_file = ground_truth.for_file(file)
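The `f1` helper whose tail (`return 0`) is visible in the hunk above computes the standard harmonic mean of precision and recall. A minimal reconstruction consistent with that fallback, plus an illustrative document-level computation (the counts are made up):

```python
def f1(precision: float, recall: float) -> float:
    """Harmonic mean of precision and recall; 0 when both are zero."""
    if precision + recall > 0:
        return 2 * precision * recall / (precision + recall)
    return 0

# Illustrative true-positive / false-positive / false-negative counts
# for a single document (values are hypothetical):
tp, fp, fn = 8, 2, 2
precision = tp / (tp + fp)  # 0.8
recall = tp / (tp + fn)     # 0.8
print(f1(precision, recall))
```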
