swisstopo · dcleres · Sep 18, 2024 · Sep 10, 2024 · Sep 10, 2024 · Sep 11, 2024
diff --git a/.github/workflows/pipeline_run.yml b/.github/workflows/pipeline_run.yml
@@ -21,4 +21,4 @@ jobs:
         source env/bin/activate
         pip install -e .
         echo "Running pipeline"
-        boreholes-extract-all -l -i example/example_borehole_profile.pdf -o example/ -p example/predictions.json
+        boreholes-extract-all -l -i example/example_borehole_profile.pdf -o example/ -p example/predictions.json -m example/metadata.json -g example/example_groundtruth.json -pa all
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -1,5 +1,6 @@
 {
     "cSpell.words": [
+        "dataframe",
         "DATAPATH",
         "depthcolumn",
         "depthcolumnentry",

diff --git a/example/example_groundtruth.json b/example/example_groundtruth.json
@@ -0,0 +1,18 @@
+{
+    "example_borehole_profile.pdf": {
+        "groundwater": [],
+        "layers": [],
+        "metadata": {
+            "coordinates": {
+                "E": 615790,
+                "N": 157500
+            },
+            "drilling_date": "1995-09-03",
+            "drilling_methods": null,
+            "original_name": "",
+            "project_name": "",
+            "reference_elevation": 788.6,
+            "total_depth": null
+        }
+    }
+}
diff --git a/pyproject.toml b/pyproject.toml
@@ -57,6 +57,7 @@ all = ["swissgeol-boreholes-dataextraction[test, lint, experiment-tracking, visu
 
 [project.scripts]
 boreholes-extract-all = "stratigraphy.main:click_pipeline"
+boreholes-extract-metadata = "stratigraphy.main:click_pipeline_metadata"
 boreholes-download-profiles = "stratigraphy.get_files:download_directory_froms3"
 
 [tool.ruff.lint]

diff --git a/src/app/api/v1/endpoints/extract_data.py b/src/app/api/v1/endpoints/extract_data.py
@@ -16,8 +16,8 @@
     FormatTypes,
     NotFoundResponse,
 )
-from stratigraphy.coordinates.coordinate_extraction import CoordinateExtractor, LV03Coordinate, LV95Coordinate
-from stratigraphy.util.extract_text import extract_text_lines_from_bbox
+from stratigraphy.metadata.coordinate_extraction import CoordinateExtractor, LV03Coordinate, LV95Coordinate
+from stratigraphy.text.extract_text import extract_text_lines_from_bbox
 
 
 def extract_data(extract_data_request: ExtractDataRequest) -> ExtractDataResponse:

diff --git a/src/app/common/schemas.py b/src/app/common/schemas.py
@@ -106,6 +106,7 @@ def to_fitz_rect(self) -> fitz.Rect:
         """
         return fitz.Rect(self.x0, self.y0, self.x1, self.y1)
 
+    @staticmethod
     def load_from_fitz_rect(rect: fitz.Rect) -> "BoundingBox":
         """Load the bounding box from a PyMuPDF rectangle.
 

diff --git a/src/scripts/label_studio_annotation_to_ground_truth.py b/src/scripts/label_studio_annotation_to_ground_truth.py
@@ -9,10 +9,11 @@
 
 import click
 import fitz
-from stratigraphy.coordinates.coordinate_extraction import Coordinate
+from stratigraphy.layer.layer import LayerPrediction
+from stratigraphy.metadata.coordinate_extraction import Coordinate
+from stratigraphy.text.textblock import MaterialDescription
 from stratigraphy.util.interval import AnnotatedInterval
-from stratigraphy.util.predictions import BoreholeMetaData, FilePredictions, LayerPrediction
-from stratigraphy.util.textblock import MaterialDescription
+from stratigraphy.util.predictions import BoreholeMetaData, FilePredictions
 
 logger = logging.getLogger(__name__)
 

diff --git a/src/stratigraphy/util/draw.py → src/stratigraphy/annotations/draw.py b/src/stratigraphy/util/draw.py → src/stratigraphy/annotations/draw.py
@@ -5,15 +5,15 @@
 from pathlib import Path
 
 import fitz
+import pandas as pd
 from dotenv import load_dotenv
-
-from stratigraphy.benchmark.metrics import Metrics
-from stratigraphy.coordinates.coordinate_extraction import Coordinate
-from stratigraphy.elevation.elevation_extraction import ElevationInformation
 from stratigraphy.groundwater.groundwater_extraction import GroundwaterInformationOnPage
+from stratigraphy.layer.layer import LayerPrediction
+from stratigraphy.metadata.coordinate_extraction import Coordinate
+from stratigraphy.metadata.elevation_extraction import Elevation
+from stratigraphy.text.textblock import TextBlock
 from stratigraphy.util.interval import BoundaryInterval
-from stratigraphy.util.predictions import FilePredictions, LayerPrediction
-from stratigraphy.util.textblock import TextBlock
+from stratigraphy.util.predictions import FilePredictions
 
 load_dotenv()
 
@@ -24,7 +24,12 @@
 logger = logging.getLogger(__name__)
 
 
-def draw_predictions(predictions: dict[str, FilePredictions], directory: Path, out_directory: Path) -> None:
+def draw_predictions(
+    predictions: dict[str, FilePredictions],
+    directory: Path,
+    out_directory: Path,
+    document_level_metadata_metrics: pd.DataFrame,
+) -> None:
     """Draw predictions on pdf pages.
 
     Draws various recognized information on the pdf pages present at directory and saves
@@ -42,6 +47,7 @@ def draw_predictions(predictions: dict[str, FilePredictions], directory: Path, o
         predictions (dict): Content of the predictions.json file.
         directory (Path): Path to the directory containing the pdf files.
         out_directory (Path): Path to the output directory where the images are saved.
+        document_level_metadata_metrics (pd.DataFrame): Document level metadata metrics.
     """
     if directory.is_file():  # deal with the case when we pass a file instead of a directory
         directory = directory.parent
@@ -51,6 +57,11 @@ def draw_predictions(predictions: dict[str, FilePredictions], directory: Path, o
         depths_materials_column_pairs = file_prediction.depths_materials_columns_pairs
         coordinates = file_prediction.metadata.coordinates
         elevation = file_prediction.metadata.elevation
+
+        # Assess the correctness of the metadata
+        is_coordinates_correct = document_level_metadata_metrics.loc[file_name].coordinate
+        is_elevation_correct = document_level_metadata_metrics.loc[file_name].elevation
+
         with fitz.Document(directory / file_name) as doc:
             for page_index, page in enumerate(doc):
                 page_number = page_index + 1
@@ -61,9 +72,9 @@ def draw_predictions(predictions: dict[str, FilePredictions], directory: Path, o
                         page.derotation_matrix,
                         page.rotation,
                         coordinates,
-                        file_prediction.metadata_is_correct.get("coordinates"),
+                        is_coordinates_correct,
                         elevation,
-                        file_prediction.metadata_is_correct.get("elevation"),
+                        is_elevation_correct,
                     )
                 if coordinates is not None and page_number == coordinates.page:
                     draw_coordinates(shape, coordinates)
@@ -107,9 +118,9 @@ def draw_metadata(
     derotation_matrix: fitz.Matrix,
     rotation: float,
     coordinates: Coordinate | None,
-    coordinates_is_correct: Metrics,
-    elevation_info: ElevationInformation | None,
-    elevation_is_correct: Metrics,
+    is_coordinate_correct: bool,
+    elevation_info: Elevation | None,
+    is_elevation_correct: bool,
 ) -> None:
     """Draw the extracted metadata on the top of the given PDF page.
 
@@ -121,17 +132,15 @@ def draw_metadata(
         derotation_matrix (fitz.Matrix): The derotation matrix of the page.
         rotation (float): The rotation of the page.
         coordinates (Coordinate | None): The coordinate object to draw.
-        coordinates_is_correct (Metrics): Whether the coordinates are correct.
+        is_coordinate_correct (bool): Whether the coordinate information is correct.
         elevation_info (ElevationInformation | None): The elevation information to draw.
-        elevation_is_correct (Metrics): Whether the elevation information is correct.
+        is_elevation_correct (bool): Whether the elevation information is correct.
     """
     # TODO associate correctness with the extracted coordinates in a better way
-    coordinate_correct = coordinates_is_correct is not None and coordinates_is_correct.tp > 0
-    coordinate_color = "green" if coordinate_correct else "red"
+    coordinate_color = "green" if is_coordinate_correct else "red"
     coordinate_rect = fitz.Rect([5, 5, 200, 25])
 
-    elevation_correct = elevation_is_correct is not None and elevation_is_correct.tp > 0
-    elevation_color = "green" if elevation_correct else "red"
+    elevation_color = "green" if is_elevation_correct else "red"
     elevation_rect = fitz.Rect([5, 25, 200, 45])
 
     shape.draw_rect(coordinate_rect * derotation_matrix)
@@ -185,12 +194,12 @@ def draw_groundwater(shape: fitz.Shape, groundwater_entry: GroundwaterInformatio
     shape.finish(color=fitz.utils.getColor("pink"))
 
 
-def draw_elevation(shape: fitz.Shape, elevation: ElevationInformation) -> None:
+def draw_elevation(shape: fitz.Shape, elevation: Elevation) -> None:
     """Draw a bounding box around the area of the page where the coordinates were extracted from.
 
     Args:
         shape (fitz.Shape): The shape object for drawing.
-        elevation (ElevationInformation): The elevation information to draw.
+        elevation (Elevation): The elevation information to draw.
     """
     shape.draw_rect(elevation.rect)
     shape.finish(color=fitz.utils.getColor("blue"))

diff --git a/src/stratigraphy/util/plot_utils.py → src/stratigraphy/annotations/plot_utils.py b/src/stratigraphy/util/plot_utils.py → src/stratigraphy/annotations/plot_utils.py
@@ -5,9 +5,8 @@
 import cv2
 import fitz
 import numpy as np
-
+from stratigraphy.text.textblock import TextBlock
 from stratigraphy.util.dataclasses import Line
-from stratigraphy.util.textblock import TextBlock
 
 logger = logging.getLogger(__name__)
 

diff --git a/src/stratigraphy/benchmark/ground_truth.py b/src/stratigraphy/benchmark/ground_truth.py
@@ -16,8 +16,11 @@ class GroundTruth:
     def __init__(self, path: Path):
         self.ground_truth = defaultdict(dict)
 
-        with open(path) as in_file:
+        # Load the ground truth data
+        with open(path, encoding="utf-8") as in_file:
             ground_truth = json.load(in_file)
+
+        # Parse the ground truth data
         for borehole_profile, ground_truth_item in ground_truth.items():
             layers = ground_truth_item["layers"]
             self.ground_truth[borehole_profile]["layers"] = [
@@ -42,6 +45,6 @@ def for_file(self, file_name: str) -> dict:
         """
         if file_name in self.ground_truth:
             return self.ground_truth[file_name]
-        else:
-            logger.warning(f"No ground truth data found for {file_name}.")
-            return {}
+
+        logger.warning("No ground truth data found for %s.", file_name)
+        return {}
diff --git a/src/stratigraphy/benchmark/metrics.py b/src/stratigraphy/benchmark/metrics.py
@@ -1,66 +1,21 @@
 """Classes for keeping track of metrics such as the F1-score, precision and recall."""
 
 from collections.abc import Callable
-from dataclasses import dataclass
 
 import pandas as pd
-
-
-@dataclass
-class Metrics:
-    """Computes F-score metrics.
-
-    See also https://en.wikipedia.org/wiki/F-score
-
-    Args:
-        tp (int): The true positive count
-        fp (int): The false positive count
-        fn (int): The false negative count
-    """
-
-    tp: int
-    fp: int
-    fn: int
-
-    @property
-    def precision(self) -> float:
-        """Calculate the precision."""
-        if self.tp + self.fp > 0:
-            return self.tp / (self.tp + self.fp)
-        else:
-            return 0
-
-    @property
-    def recall(self) -> float:
-        """Calculate the recall."""
-        if self.tp + self.fn > 0:
-            return self.tp / (self.tp + self.fn)
-        else:
-            return 0
-
-    @property
-    def f1(self) -> float:
-        """Calculate the F1 score."""
-        if self.precision + self.recall > 0:
-            return 2 * self.precision * self.recall / (self.precision + self.recall)
-        else:
-            return 0
+from stratigraphy.evaluation.evaluation_dataclasses import Metrics
 
 
 class DatasetMetrics:
     """Keeps track of a particular metrics for all documents in a dataset."""
 
+    # TODO: Currently, some methods for averaging metrics are in the Metrics class
+    # (see micro_average(metric_list: list["Metrics"])). In the long run, we should refactor
+    # this to have a single place where these averaging computations are implemented.
+
     def __init__(self):
         self.metrics: dict[str, Metrics] = {}
 
-    def overall_metrics(self) -> Metrics:
-        """Can be used to compute micro averages."""
-        return Metrics(
-            tp=sum(metric.tp for metric in self.metrics.values()),
-            fp=sum(metric.fp for metric in self.metrics.values()),
-            fn=sum(metric.fn for metric in self.metrics.values()),
-        )
-
     def macro_f1(self) -> float:
         """Compute the macro F1 score."""
         if self.metrics:
@@ -93,6 +48,7 @@ def pseudo_macro_f1(self) -> float:
             return 0
 
     def to_dataframe(self, name: str, fn: Callable[[Metrics], float]) -> pd.DataFrame:
+        """Convert the metrics to a DataFrame."""
         series = pd.Series({filename: fn(metric) for filename, metric in self.metrics.items()})
         return series.to_frame(name=name)
 
@@ -104,15 +60,14 @@ def __init__(self):
         self.metrics: dict[str, DatasetMetrics] = {}
 
     def document_level_metrics_df(self) -> pd.DataFrame:
+        """Return a DataFrame with all the document level metrics."""
         all_series = [
             self.metrics["layer"].to_dataframe("F1", lambda metric: metric.f1),
             self.metrics["layer"].to_dataframe("precision", lambda metric: metric.precision),
             self.metrics["layer"].to_dataframe("recall", lambda metric: metric.recall),
             self.metrics["depth_interval"].to_dataframe("Depth_interval_accuracy", lambda metric: metric.precision),
             self.metrics["layer"].to_dataframe("Number Elements", lambda metric: metric.tp + metric.fn),
             self.metrics["layer"].to_dataframe("Number wrong elements", lambda metric: metric.fp + metric.fn),
-            self.metrics["coordinates"].to_dataframe("coordinates", lambda metric: metric.f1),
-            self.metrics["elevation"].to_dataframe("elevation", lambda metric: metric.f1),
             self.metrics["groundwater"].to_dataframe("groundwater", lambda metric: metric.f1),
             self.metrics["groundwater_depth"].to_dataframe("groundwater_depth", lambda metric: metric.f1),
         ]
@@ -122,10 +77,9 @@ def document_level_metrics_df(self) -> pd.DataFrame:
         return document_level_metrics
 
     def metrics_dict(self) -> dict[str, float]:
-        coordinates_metrics = self.metrics["coordinates"].overall_metrics()
-        groundwater_metrics = self.metrics["groundwater"].overall_metrics()
-        groundwater_depth_metrics = self.metrics["groundwater_depth"].overall_metrics()
-        elevation_metrics = self.metrics["elevation"].overall_metrics()
+        """Return a dictionary with the overall metrics."""
+        groundwater_metrics = Metrics.micro_average(self.metrics["groundwater"].metrics.values())
+        groundwater_depth_metrics = Metrics.micro_average(self.metrics["groundwater_depth"].metrics.values())
 
         return {
             "F1": self.metrics["layer"].pseudo_macro_f1(),
@@ -140,16 +94,10 @@ def metrics_dict(self) -> dict[str, float]:
             "fr_recall": self.metrics["fr_layer"].macro_recall(),
             "fr_precision": self.metrics["fr_layer"].macro_precision(),
             "fr_depth_interval_accuracy": self.metrics["fr_depth_interval"].macro_precision(),
-            "coordinate_f1": coordinates_metrics.f1,
-            "coordinate_recall": coordinates_metrics.recall,
-            "coordinate_precision": coordinates_metrics.precision,
             "groundwater_f1": groundwater_metrics.f1,
             "groundwater_recall": groundwater_metrics.recall,
             "groundwater_precision": groundwater_metrics.precision,
             "groundwater_depth_f1": groundwater_depth_metrics.f1,
             "groundwater_depth_recall": groundwater_depth_metrics.recall,
             "groundwater_depth_precision": groundwater_depth_metrics.precision,
-            "elevation_f1": elevation_metrics.f1,
-            "elevation_recall": elevation_metrics.recall,
-            "elevation_precision": elevation_metrics.precision,
         }