diff --git a/.github/workflows/pipeline_run.yml b/.github/workflows/pipeline_run.yml new file mode 100644 index 00000000..787dd1fd --- /dev/null +++ b/.github/workflows/pipeline_run.yml @@ -0,0 +1,24 @@ +name: pipeline_run + +on: + pull_request: + push: + branches: [main] + +jobs: + pipeline_run: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Create Environment and run pipeline + shell: bash + run: | + wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" + bash Miniforge3.sh -b -p "${HOME}/conda" + source "${HOME}/conda/etc/profile.d/conda.sh" + conda init --all + source "${HOME}/.bash_profile" + conda env create -f environment-prod.yml + conda activate boreholes-prod + echo "Running pipeline" + boreholes-extract-layers -l -i example/example_borehole_profile.pdf -o example/ -p example/predictions.json \ No newline at end of file diff --git a/environment-dev.yml b/environment-dev.yml index 552f7558..1d5acb5e 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -8,9 +8,10 @@ dependencies: - pandas - levenshtein - pathlib==1.0.1 - - opencv==4.9.0 - python-dotenv==1.0.1 - click==8.1.7 + - PyYAML==6.0.1 + - scikit-learn==1.4.0 - pip # dev dependencies - matplotlib==3.8.0 @@ -24,6 +25,7 @@ dependencies: - amazon-textract-textractor - backoff - PyMuPDF==1.23.26 + - opencv-python-headless - -e . # dev pip dependencies - mlflow==2.11.1 \ No newline at end of file diff --git a/environment-prod.yml b/environment-prod.yml index da30c50e..92b9ec03 100644 --- a/environment-prod.yml +++ b/environment-prod.yml @@ -7,12 +7,14 @@ dependencies: - pandas - levenshtein - pathlib==1.0.1 - - opencv==4.9.0 - python-dotenv==1.0.1 + - scikit-learn==1.4.0 - click==8.1.7 + - PyYAML==6.0.1 - pip - pip: - amazon-textract-textractor - backoff - PyMuPDF==1.23.26 + - opencv-python-headless - -e . diff --git a/example/example_borehole_profile.pdf b/example/example_borehole_profile.pdf new file mode 100644 index 00000000..89c154a2 Binary files /dev/null and b/example/example_borehole_profile.pdf differ diff --git a/src/stratigraphy/benchmark/score.py b/src/stratigraphy/benchmark/score.py index a655bc7e..0e48e6f2 100644 --- a/src/stratigraphy/benchmark/score.py +++ b/src/stratigraphy/benchmark/score.py @@ -131,7 +131,12 @@ def add_ground_truth_to_predictions(predictions: dict, ground_truth_path: Path) Returns: tuple[dict, dict]: The predictions with the ground truth added, and the number of ground truth values per file. """ - ground_truth = GroundTruth(ground_truth_path) + try: # for inference no ground truth is available + ground_truth = GroundTruth(ground_truth_path) + + except FileNotFoundError: + logging.warning("Ground truth file not found.") + return predictions, {} number_of_truth_values = {} for file, file_predictions in predictions.items(): diff --git a/src/stratigraphy/line_detection.py b/src/stratigraphy/line_detection.py index a5870cff..66e8fdbd 100644 --- a/src/stratigraphy/line_detection.py +++ b/src/stratigraphy/line_detection.py @@ -1,5 +1,6 @@ """Script for line detection in pdf pages.""" +import logging import os import cv2 @@ -19,8 +20,9 @@ load_dotenv() -mlflow_tracking = os.getenv("MLFLOW_TRACKING") == "True" # Checks whether MLFlow tracking is enabled +logger = logging.getLogger(__name__) +mlflow_tracking = os.getenv("MLFLOW_TRACKING") == "True" # Checks whether MLFlow tracking is enabled line_detection_params = read_params("line_detection_params.yml") @@ -98,8 +100,10 @@ def draw_lines_on_page(filename: str, page: fitz.Page, geometric_lines: list[Lin geometric_lines (list[Line]): The lines to draw on the pdf page. """ if not mlflow_tracking: - raise Warning("MLFlow tracking is not enabled. MLFLow is required to store the images.") - import mlflow + logger.warning("MLFlow tracking is not enabled. MLFLow is required to store the images.") + + else: + import mlflow - img = plot_lines(page, geometric_lines, scale_factor=line_detection_params["pdf_scale_factor"]) - mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png") + img = plot_lines(page, geometric_lines, scale_factor=line_detection_params["pdf_scale_factor"]) + mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png") diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index f7213a6f..eceda9c4 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -37,7 +37,7 @@ @click.option( "-g", "--ground_truth_path", - type=click.Path(exists=True, path_type=Path), + type=click.Path(exists=False, path_type=Path), default=DATAPATH / "Benchmark" / "ground_truth.json", help="Path to the ground truth file.", ) @@ -143,12 +143,17 @@ def start_pipeline( if not skip_draw_predictions: draw_predictions(predictions, input_directory, out_directory) - metrics, document_level_metrics = evaluate_matching(predictions, number_of_truth_values) - document_level_metrics.to_csv(temp_directory / "document_level_metrics.csv") # mlflow.log_artifact expects a file + if number_of_truth_values: # only evaluate if ground truth is available + metrics, document_level_metrics = evaluate_matching(predictions, number_of_truth_values) + document_level_metrics.to_csv( + temp_directory / "document_level_metrics.csv" + ) # mlflow.log_artifact expects a file - if mlflow_tracking: - mlflow.log_metrics(metrics) - mlflow.log_artifact(temp_directory / "document_level_metrics.csv") + if mlflow_tracking: + mlflow.log_metrics(metrics) + mlflow.log_artifact(temp_directory / "document_level_metrics.csv") + else: + logger.warning("Ground truth file not found. Skipping evaluation.") if __name__ == "__main__":