Skip to content

Commit

Permalink
Add pipeline run to CI.
Browse files Browse the repository at this point in the history
Add missing dependency for prod environment.
Use an OpenCV version that does not require additional non-Python dependencies.
Catch the case when there is no ground truth file.
  • Loading branch information
redur committed Apr 5, 2024
1 parent 2166ad1 commit fa86d15
Show file tree
Hide file tree
Showing 7 changed files with 56 additions and 14 deletions.
24 changes: 24 additions & 0 deletions .github/workflows/pipeline_run.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# CI workflow: builds the production conda environment and runs the
# extraction pipeline once on the bundled example PDF.
name: pipeline_run

# Triggered by pull requests and by pushes; the branch filter lists main.
on:
pull_request:
push:
branches: [main]

# Single job on an Ubuntu runner: check out the repository, then run one bash
# script that installs Miniforge, creates the environment from
# environment-prod.yml, activates boreholes-prod, and invokes the
# boreholes-extract-layers CLI on example/example_borehole_profile.pdf,
# writing output to example/ and predictions to example/predictions.json.
jobs:
pipeline_run:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Create Environment and run pipeline
shell: bash
run: |
wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
bash Miniforge3.sh -b -p "${HOME}/conda"
source "${HOME}/conda/etc/profile.d/conda.sh"
conda init --all
source "${HOME}/.bash_profile"
conda env create -f environment-prod.yml
conda activate boreholes-prod
echo "Running pipeline"
boreholes-extract-layers -l -i example/example_borehole_profile.pdf -o example/ -p example/predictions.json
4 changes: 3 additions & 1 deletion environment-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@ dependencies:
- pandas
- levenshtein
- pathlib==1.0.1
- opencv==4.9.0
- python-dotenv==1.0.1
- click==8.1.7
- PyYAML==6.0.1
- scikit-learn==1.4.0
- pip
# dev dependencies
- matplotlib==3.8.0
Expand All @@ -24,6 +25,7 @@ dependencies:
- amazon-textract-textractor
- backoff
- PyMuPDF==1.23.26
- opencv-python-headless
- -e .
# dev pip dependencies
- mlflow==2.11.1
4 changes: 3 additions & 1 deletion environment-prod.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@ dependencies:
- pandas
- levenshtein
- pathlib==1.0.1
- opencv==4.9.0
- python-dotenv==1.0.1
- scikit-learn==1.4.0
- click==8.1.7
- PyYAML==6.0.1
- pip
- pip:
- amazon-textract-textractor
- backoff
- PyMuPDF==1.23.26
- opencv-python-headless
- -e .
Binary file added example/example_borehole_profile.pdf
Binary file not shown.
7 changes: 6 additions & 1 deletion src/stratigraphy/benchmark/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,12 @@ def add_ground_truth_to_predictions(predictions: dict, ground_truth_path: Path)
Returns:
tuple[dict, dict]: The predictions with the ground truth added, and the number of ground truth values per file.
"""
ground_truth = GroundTruth(ground_truth_path)
try: # for inference no ground truth is available
ground_truth = GroundTruth(ground_truth_path)

except FileNotFoundError:
logging.warning("Ground truth file not found.")
return predictions, {}

number_of_truth_values = {}
for file, file_predictions in predictions.items():
Expand Down
14 changes: 9 additions & 5 deletions src/stratigraphy/line_detection.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Script for line detection in pdf pages."""

import logging
import os

import cv2
Expand All @@ -19,8 +20,9 @@

load_dotenv()

mlflow_tracking = os.getenv("MLFLOW_TRACKING") == "True" # Checks whether MLFlow tracking is enabled
logger = logging.getLogger(__name__)

mlflow_tracking = os.getenv("MLFLOW_TRACKING") == "True" # Checks whether MLFlow tracking is enabled

line_detection_params = read_params("line_detection_params.yml")

Expand Down Expand Up @@ -98,8 +100,10 @@ def draw_lines_on_page(filename: str, page: fitz.Page, geometric_lines: list[Lin
geometric_lines (list[Line]): The lines to draw on the pdf page.
"""
if not mlflow_tracking:
raise Warning("MLFlow tracking is not enabled. MLFLow is required to store the images.")
import mlflow
logger.warning("MLFlow tracking is not enabled. MLFLow is required to store the images.")

else:
import mlflow

img = plot_lines(page, geometric_lines, scale_factor=line_detection_params["pdf_scale_factor"])
mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png")
img = plot_lines(page, geometric_lines, scale_factor=line_detection_params["pdf_scale_factor"])
mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png")
17 changes: 11 additions & 6 deletions src/stratigraphy/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
@click.option(
"-g",
"--ground_truth_path",
type=click.Path(exists=True, path_type=Path),
type=click.Path(exists=False, path_type=Path),
default=DATAPATH / "Benchmark" / "ground_truth.json",
help="Path to the ground truth file.",
)
Expand Down Expand Up @@ -143,12 +143,17 @@ def start_pipeline(
if not skip_draw_predictions:
draw_predictions(predictions, input_directory, out_directory)

metrics, document_level_metrics = evaluate_matching(predictions, number_of_truth_values)
document_level_metrics.to_csv(temp_directory / "document_level_metrics.csv") # mlflow.log_artifact expects a file
if number_of_truth_values: # only evaluate if ground truth is available
metrics, document_level_metrics = evaluate_matching(predictions, number_of_truth_values)
document_level_metrics.to_csv(
temp_directory / "document_level_metrics.csv"
) # mlflow.log_artifact expects a file

if mlflow_tracking:
mlflow.log_metrics(metrics)
mlflow.log_artifact(temp_directory / "document_level_metrics.csv")
if mlflow_tracking:
mlflow.log_metrics(metrics)
mlflow.log_artifact(temp_directory / "document_level_metrics.csv")
else:
logger.warning("Ground truth file not found. Skipping evaluation.")


if __name__ == "__main__":
Expand Down

0 comments on commit fa86d15

Please sign in to comment.