Add pipeline run to CI.

Add missing dependency for prod environment. Use an OpenCV version that does not require additional non-Python dependencies. Catch the case when there is no ground truth file.
swisstopo · Apr 5, 2024 · fa86d15 · fa86d15
1 parent 2166ad1
commit fa86d15
Show file tree

Hide file tree

Showing 7 changed files with 56 additions and 14 deletions.
diff --git a/.github/workflows/pipeline_run.yml b/.github/workflows/pipeline_run.yml
@@ -0,0 +1,24 @@
+name: pipeline_run
+
+on:
+  pull_request:
+  push:
+    branches: [main]
+
+jobs:
+  pipeline_run:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: Create Environment and run pipeline
+      shell: bash
+      run: |
+        wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
+        bash Miniforge3.sh -b -p "${HOME}/conda"
+        source "${HOME}/conda/etc/profile.d/conda.sh"
+        conda init --all
+        source "${HOME}/.bash_profile"
+        conda env create -f environment-prod.yml
+        conda activate boreholes-prod
+        echo "Running pipeline"
+        boreholes-extract-layers -l -i example/example_borehole_profile.pdf -o example/ -p example/predictions.json
diff --git a/environment-dev.yml b/environment-dev.yml
@@ -8,9 +8,10 @@ dependencies:
   - pandas
   - levenshtein
   - pathlib==1.0.1
-  - opencv==4.9.0
   - python-dotenv==1.0.1
   - click==8.1.7
+  - PyYAML==6.0.1
+  - scikit-learn==1.4.0
   - pip
 # dev dependencies
   - matplotlib==3.8.0
@@ -24,6 +25,7 @@ dependencies:
       - amazon-textract-textractor
       - backoff
       - PyMuPDF==1.23.26
+      - opencv-python-headless
       - -e .
 # dev pip dependencies
       - mlflow==2.11.1
diff --git a/environment-prod.yml b/environment-prod.yml
@@ -7,12 +7,14 @@ dependencies:
   - pandas
   - levenshtein
   - pathlib==1.0.1
-  - opencv==4.9.0
   - python-dotenv==1.0.1
+  - scikit-learn==1.4.0
   - click==8.1.7
+  - PyYAML==6.0.1
   - pip
   - pip:
       - amazon-textract-textractor
       - backoff
       - PyMuPDF==1.23.26
+      - opencv-python-headless
       - -e .
diff --git a/example/example_borehole_profile.pdf b/example/example_borehole_profile.pdf
diff --git a/src/stratigraphy/benchmark/score.py b/src/stratigraphy/benchmark/score.py
@@ -131,7 +131,12 @@ def add_ground_truth_to_predictions(predictions: dict, ground_truth_path: Path)
     Returns:
         tuple[dict, dict]: The predictions with the ground truth added, and the number of ground truth values per file.
     """
-    ground_truth = GroundTruth(ground_truth_path)
+    try:  # for inference no ground truth is available
+        ground_truth = GroundTruth(ground_truth_path)
+
+    except FileNotFoundError:
+        logging.warning("Ground truth file not found.")
+        return predictions, {}
 
     number_of_truth_values = {}
     for file, file_predictions in predictions.items():

diff --git a/src/stratigraphy/line_detection.py b/src/stratigraphy/line_detection.py
@@ -1,5 +1,6 @@
 """Script for line detection in pdf pages."""
 
+import logging
 import os
 
 import cv2
@@ -19,8 +20,9 @@
 
 load_dotenv()
 
-mlflow_tracking = os.getenv("MLFLOW_TRACKING") == "True"  # Checks whether MLFlow tracking is enabled
+logger = logging.getLogger(__name__)
 
+mlflow_tracking = os.getenv("MLFLOW_TRACKING") == "True"  # Checks whether MLFlow tracking is enabled
 
 line_detection_params = read_params("line_detection_params.yml")
 
@@ -98,8 +100,10 @@ def draw_lines_on_page(filename: str, page: fitz.Page, geometric_lines: list[Lin
         geometric_lines (list[Line]): The lines to draw on the pdf page.
     """
     if not mlflow_tracking:
-        raise Warning("MLFlow tracking is not enabled. MLFLow is required to store the images.")
-    import mlflow
+        logger.warning("MLFlow tracking is not enabled. MLFLow is required to store the images.")
+
+    else:
+        import mlflow
 
-    img = plot_lines(page, geometric_lines, scale_factor=line_detection_params["pdf_scale_factor"])
-    mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png")
+        img = plot_lines(page, geometric_lines, scale_factor=line_detection_params["pdf_scale_factor"])
+        mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png")
diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py
@@ -37,7 +37,7 @@
 @click.option(
     "-g",
     "--ground_truth_path",
-    type=click.Path(exists=True, path_type=Path),
+    type=click.Path(exists=False, path_type=Path),
     default=DATAPATH / "Benchmark" / "ground_truth.json",
     help="Path to the ground truth file.",
 )
@@ -143,12 +143,17 @@ def start_pipeline(
     if not skip_draw_predictions:
         draw_predictions(predictions, input_directory, out_directory)
 
-    metrics, document_level_metrics = evaluate_matching(predictions, number_of_truth_values)
-    document_level_metrics.to_csv(temp_directory / "document_level_metrics.csv")  # mlflow.log_artifact expects a file
+    if number_of_truth_values:  # only evaluate if ground truth is available
+        metrics, document_level_metrics = evaluate_matching(predictions, number_of_truth_values)
+        document_level_metrics.to_csv(
+            temp_directory / "document_level_metrics.csv"
+        )  # mlflow.log_artifact expects a file
 
-    if mlflow_tracking:
-        mlflow.log_metrics(metrics)
-        mlflow.log_artifact(temp_directory / "document_level_metrics.csv")
+        if mlflow_tracking:
+            mlflow.log_metrics(metrics)
+            mlflow.log_artifact(temp_directory / "document_level_metrics.csv")
+    else:
+        logger.warning("Ground truth file not found. Skipping evaluation.")
 
 
 if __name__ == "__main__":