Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Close #LGVISIUM-73: Create a metadata object and look into the file organisation #80

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
e7cbdd6
Close #LGVISIUM-73: Moved the util files into more specific directori…
dcleres Sep 10, 2024
ff36d85
Added refactored metadata class
dcleres Sep 10, 2024
6002790
BAckup for today
dcleres Sep 11, 2024
540f82a
Added code to evaluate the metadata individually
dcleres Sep 12, 2024
a613f41
minor edits
dcleres Sep 12, 2024
aa171af
Merge branch 'main' of https://github.com/swisstopo/swissgeol-borehol…
dcleres Sep 16, 2024
af0fd12
Edited the pipeline command
dcleres Sep 16, 2024
750194d
Fixed typo in the eval file
dcleres Sep 16, 2024
0bcd51f
Added groundtruth file
dcleres Sep 16, 2024
d2509c8
Minor improvements
dcleres Sep 16, 2024
a7a98a5
minor changes
dcleres Sep 16, 2024
f9411d5
Merge branch 'main' of https://github.com/swisstopo/swissgeol-borehol…
dcleres Sep 17, 2024
d2920e3
Addressed the comments made during the PR
dcleres Sep 17, 2024
4c1dc87
Addressed the comments raised during the review
dcleres Sep 17, 2024
a92c2da
Merge branch 'LGVISIUM-73-Create-a-Metadata-Object-and-look-into-the-…
dcleres Sep 17, 2024
bedf096
Edited the yaml file for the CI
dcleres Sep 17, 2024
3b4c61b
Removed duplicated code
dcleres Sep 17, 2024
3b0773d
Fixed the issue with the metrics
dcleres Sep 17, 2024
da8ba85
Address the elevation metric difference by passing none to provide th…
dcleres Sep 18, 2024
ffd3fef
Addressed review comments and fix the issue with the drawing of the c…
dcleres Sep 18, 2024
af75c57
Updated the document level metric computation
dcleres Sep 18, 2024
1cc4818
update comments
stijnvermeeren-swisstopo Sep 18, 2024
6e7ef2a
code review
stijnvermeeren-swisstopo Sep 18, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/pipeline_run.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@ jobs:
source env/bin/activate
pip install -e .
echo "Running pipeline"
boreholes-extract-all -l -i example/example_borehole_profile.pdf -o example/ -p example/predictions.json
boreholes-extract-all -l -i example/example_borehole_profile.pdf -o example/ -p example/predictions.json -m example/metadata.json -g example/example_groundtruth.json -pa all
1 change: 1 addition & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"cSpell.words": [
"dataframe",
"DATAPATH",
"depthcolumn",
"depthcolumnentry",
Expand Down
18 changes: 18 additions & 0 deletions example/example_groundtruth.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"example_borehole_profile.pdf": {
"groundwater": [],
"layers": [],
"metadata": {
"coordinates": {
"E": 615790,
"N": 157500
},
"drilling_date": "1995-09-03",
"drilling_methods": null,
"original_name": "",
"project_name": "",
"reference_elevation": 788.6,
"total_depth": null
}
}
}
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ all = ["swissgeol-boreholes-dataextraction[test, lint, experiment-tracking, visu

[project.scripts]
boreholes-extract-all = "stratigraphy.main:click_pipeline"
boreholes-extract-metadata = "stratigraphy.main:click_pipeline_metadata"
boreholes-download-profiles = "stratigraphy.get_files:download_directory_froms3"

[tool.ruff.lint]
Expand Down
4 changes: 2 additions & 2 deletions src/app/api/v1/endpoints/extract_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
FormatTypes,
NotFoundResponse,
)
from stratigraphy.coordinates.coordinate_extraction import CoordinateExtractor, LV03Coordinate, LV95Coordinate
from stratigraphy.util.extract_text import extract_text_lines_from_bbox
from stratigraphy.metadata.coordinate_extraction import CoordinateExtractor, LV03Coordinate, LV95Coordinate
from stratigraphy.text.extract_text import extract_text_lines_from_bbox


def extract_data(extract_data_request: ExtractDataRequest) -> ExtractDataResponse:
Expand Down
1 change: 1 addition & 0 deletions src/app/common/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ def to_fitz_rect(self) -> fitz.Rect:
"""
return fitz.Rect(self.x0, self.y0, self.x1, self.y1)

@staticmethod
def load_from_fitz_rect(rect: fitz.Rect) -> "BoundingBox":
"""Load the bounding box from a PyMuPDF rectangle.

Expand Down
7 changes: 4 additions & 3 deletions src/scripts/label_studio_annotation_to_ground_truth.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@

import click
import fitz
from stratigraphy.coordinates.coordinate_extraction import Coordinate
from stratigraphy.layer.layer import LayerPrediction
from stratigraphy.metadata.coordinate_extraction import Coordinate
from stratigraphy.text.textblock import MaterialDescription
from stratigraphy.util.interval import AnnotatedInterval
from stratigraphy.util.predictions import BoreholeMetaData, FilePredictions, LayerPrediction
from stratigraphy.util.textblock import MaterialDescription
from stratigraphy.util.predictions import BoreholeMetaData, FilePredictions

logger = logging.getLogger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
from pathlib import Path

import fitz
import pandas as pd
from dotenv import load_dotenv

from stratigraphy.benchmark.metrics import Metrics
from stratigraphy.coordinates.coordinate_extraction import Coordinate
from stratigraphy.elevation.elevation_extraction import ElevationInformation
from stratigraphy.groundwater.groundwater_extraction import GroundwaterInformationOnPage
from stratigraphy.layer.layer import LayerPrediction
from stratigraphy.metadata.coordinate_extraction import Coordinate
from stratigraphy.metadata.elevation_extraction import Elevation
from stratigraphy.text.textblock import TextBlock
from stratigraphy.util.interval import BoundaryInterval
from stratigraphy.util.predictions import FilePredictions, LayerPrediction
from stratigraphy.util.textblock import TextBlock
from stratigraphy.util.predictions import FilePredictions

load_dotenv()

Expand All @@ -24,7 +24,12 @@
logger = logging.getLogger(__name__)


def draw_predictions(predictions: dict[str, FilePredictions], directory: Path, out_directory: Path) -> None:
def draw_predictions(
predictions: dict[str, FilePredictions],
directory: Path,
out_directory: Path,
document_level_metadata_metrics: pd.DataFrame,
) -> None:
"""Draw predictions on pdf pages.

Draws various recognized information on the pdf pages present at directory and saves
Expand All @@ -42,6 +47,7 @@ def draw_predictions(predictions: dict[str, FilePredictions], directory: Path, o
predictions (dict): Content of the predictions.json file.
directory (Path): Path to the directory containing the pdf files.
out_directory (Path): Path to the output directory where the images are saved.
document_level_metadata_metrics (pd.DataFrame): Document level metadata metrics.
"""
if directory.is_file(): # deal with the case when we pass a file instead of a directory
directory = directory.parent
Expand All @@ -51,6 +57,11 @@ def draw_predictions(predictions: dict[str, FilePredictions], directory: Path, o
depths_materials_column_pairs = file_prediction.depths_materials_columns_pairs
coordinates = file_prediction.metadata.coordinates
elevation = file_prediction.metadata.elevation

# Assess the correctness of the metadata
is_coordinates_correct = document_level_metadata_metrics.loc[file_name].coordinate
is_elevation_correct = document_level_metadata_metrics.loc[file_name].elevation

with fitz.Document(directory / file_name) as doc:
for page_index, page in enumerate(doc):
page_number = page_index + 1
Expand All @@ -61,9 +72,9 @@ def draw_predictions(predictions: dict[str, FilePredictions], directory: Path, o
page.derotation_matrix,
page.rotation,
coordinates,
file_prediction.metadata_is_correct.get("coordinates"),
is_coordinates_correct,
elevation,
file_prediction.metadata_is_correct.get("elevation"),
is_elevation_correct,
)
if coordinates is not None and page_number == coordinates.page:
draw_coordinates(shape, coordinates)
Expand Down Expand Up @@ -107,9 +118,9 @@ def draw_metadata(
derotation_matrix: fitz.Matrix,
rotation: float,
coordinates: Coordinate | None,
coordinates_is_correct: Metrics,
elevation_info: ElevationInformation | None,
elevation_is_correct: Metrics,
is_coordinate_correct: bool,
elevation_info: Elevation | None,
is_elevation_correct: bool,
) -> None:
"""Draw the extracted metadata on the top of the given PDF page.

Expand All @@ -121,17 +132,15 @@ def draw_metadata(
derotation_matrix (fitz.Matrix): The derotation matrix of the page.
rotation (float): The rotation of the page.
coordinates (Coordinate | None): The coordinate object to draw.
coordinates_is_correct (Metrics): Whether the coordinates are correct.
is_coordinate_correct (bool): Whether the coordinate information is correct.
elevation_info (Elevation | None): The elevation information to draw.
elevation_is_correct (Metrics): Whether the elevation information is correct.
is_elevation_correct (bool): Whether the elevation information is correct.
"""
# TODO associate correctness with the extracted coordinates in a better way
coordinate_correct = coordinates_is_correct is not None and coordinates_is_correct.tp > 0
coordinate_color = "green" if coordinate_correct else "red"
coordinate_color = "green" if is_coordinate_correct else "red"
coordinate_rect = fitz.Rect([5, 5, 200, 25])

elevation_correct = elevation_is_correct is not None and elevation_is_correct.tp > 0
elevation_color = "green" if elevation_correct else "red"
elevation_color = "green" if is_elevation_correct else "red"
elevation_rect = fitz.Rect([5, 25, 200, 45])

shape.draw_rect(coordinate_rect * derotation_matrix)
Expand Down Expand Up @@ -185,12 +194,12 @@ def draw_groundwater(shape: fitz.Shape, groundwater_entry: GroundwaterInformatio
shape.finish(color=fitz.utils.getColor("pink"))


def draw_elevation(shape: fitz.Shape, elevation: ElevationInformation) -> None:
def draw_elevation(shape: fitz.Shape, elevation: Elevation) -> None:
"""Draw a bounding box around the area of the page where the coordinates were extracted from.

Args:
shape (fitz.Shape): The shape object for drawing.
elevation (ElevationInformation): The elevation information to draw.
elevation (Elevation): The elevation information to draw.
"""
shape.draw_rect(elevation.rect)
shape.finish(color=fitz.utils.getColor("blue"))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
import cv2
import fitz
import numpy as np

from stratigraphy.text.textblock import TextBlock
from stratigraphy.util.dataclasses import Line
from stratigraphy.util.textblock import TextBlock

logger = logging.getLogger(__name__)

Expand Down
11 changes: 7 additions & 4 deletions src/stratigraphy/benchmark/ground_truth.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@ class GroundTruth:
def __init__(self, path: Path):
self.ground_truth = defaultdict(dict)

with open(path) as in_file:
# Load the ground truth data
with open(path, encoding="utf-8") as in_file:
ground_truth = json.load(in_file)

# Parse the ground truth data
for borehole_profile, ground_truth_item in ground_truth.items():
layers = ground_truth_item["layers"]
self.ground_truth[borehole_profile]["layers"] = [
Expand All @@ -42,6 +45,6 @@ def for_file(self, file_name: str) -> dict:
"""
if file_name in self.ground_truth:
return self.ground_truth[file_name]
else:
logger.warning(f"No ground truth data found for {file_name}.")
return {}

logger.warning("No ground truth data found for %s.", file_name)
return {}
72 changes: 10 additions & 62 deletions src/stratigraphy/benchmark/metrics.py
Original file line number Diff line number Diff line change
@@ -1,66 +1,21 @@
"""Classes for keeping track of metrics such as the F1-score, precision and recall."""

from collections.abc import Callable
from dataclasses import dataclass

import pandas as pd


@dataclass
class Metrics:
    """Computes F-score metrics from true positive, false positive and false negative counts.

    See also https://en.wikipedia.org/wiki/F-score

    Args:
        tp (int): The true positive count
        fp (int): The false positive count
        fn (int): The false negative count
    """

    tp: int
    fp: int
    fn: int

    @property
    def precision(self) -> float:
        """Calculate the precision, i.e. tp / (tp + fp).

        Returns:
            float: The precision, or 0.0 when tp + fp is not positive.
        """
        predicted_positives = self.tp + self.fp
        # Guard against division by zero; always return a float, as annotated.
        return self.tp / predicted_positives if predicted_positives > 0 else 0.0

    @property
    def recall(self) -> float:
        """Calculate the recall, i.e. tp / (tp + fn).

        Returns:
            float: The recall, or 0.0 when tp + fn is not positive.
        """
        actual_positives = self.tp + self.fn
        # Guard against division by zero; always return a float, as annotated.
        return self.tp / actual_positives if actual_positives > 0 else 0.0

    @property
    def f1(self) -> float:
        """Calculate the F1 score, the harmonic mean of precision and recall.

        Returns:
            float: The F1 score, or 0.0 when precision + recall is not positive.
        """
        # Evaluate each property once instead of re-computing it for every use.
        precision = self.precision
        recall = self.recall
        if precision + recall > 0:
            return 2 * precision * recall / (precision + recall)
        return 0.0
from stratigraphy.evaluation.evaluation_dataclasses import Metrics


class DatasetMetrics:
"""Keeps track of a particular metrics for all documents in a dataset."""

# TODO: Currently, some methods for averaging metrics are in the Metrics class
# (see micro_average(metric_list: list["Metrics"])). In the long run, we should refactor
# this to have a single place where these averaging computations are implemented.

def __init__(self):
self.metrics: dict[str, Metrics] = {}

def overall_metrics(self) -> Metrics:
"""Can be used to compute micro averages."""
return Metrics(
tp=sum(metric.tp for metric in self.metrics.values()),
fp=sum(metric.fp for metric in self.metrics.values()),
fn=sum(metric.fn for metric in self.metrics.values()),
)

def macro_f1(self) -> float:
"""Compute the macro F1 score."""
if self.metrics:
Expand Down Expand Up @@ -93,6 +48,7 @@ def pseudo_macro_f1(self) -> float:
return 0

def to_dataframe(self, name: str, fn: Callable[[Metrics], float]) -> pd.DataFrame:
"""Convert the metrics to a DataFrame."""
series = pd.Series({filename: fn(metric) for filename, metric in self.metrics.items()})
return series.to_frame(name=name)

Expand All @@ -104,15 +60,14 @@ def __init__(self):
self.metrics: dict[str, DatasetMetrics] = {}

def document_level_metrics_df(self) -> pd.DataFrame:
"""Return a DataFrame with all the document level metrics."""
all_series = [
self.metrics["layer"].to_dataframe("F1", lambda metric: metric.f1),
self.metrics["layer"].to_dataframe("precision", lambda metric: metric.precision),
self.metrics["layer"].to_dataframe("recall", lambda metric: metric.recall),
self.metrics["depth_interval"].to_dataframe("Depth_interval_accuracy", lambda metric: metric.precision),
self.metrics["layer"].to_dataframe("Number Elements", lambda metric: metric.tp + metric.fn),
self.metrics["layer"].to_dataframe("Number wrong elements", lambda metric: metric.fp + metric.fn),
self.metrics["coordinates"].to_dataframe("coordinates", lambda metric: metric.f1),
self.metrics["elevation"].to_dataframe("elevation", lambda metric: metric.f1),
self.metrics["groundwater"].to_dataframe("groundwater", lambda metric: metric.f1),
self.metrics["groundwater_depth"].to_dataframe("groundwater_depth", lambda metric: metric.f1),
]
Expand All @@ -122,10 +77,9 @@ def document_level_metrics_df(self) -> pd.DataFrame:
return document_level_metrics

def metrics_dict(self) -> dict[str, float]:
coordinates_metrics = self.metrics["coordinates"].overall_metrics()
groundwater_metrics = self.metrics["groundwater"].overall_metrics()
groundwater_depth_metrics = self.metrics["groundwater_depth"].overall_metrics()
elevation_metrics = self.metrics["elevation"].overall_metrics()
"""Return a dictionary with the overall metrics."""
groundwater_metrics = Metrics.micro_average(self.metrics["groundwater"].metrics.values())
groundwater_depth_metrics = Metrics.micro_average(self.metrics["groundwater_depth"].metrics.values())

return {
"F1": self.metrics["layer"].pseudo_macro_f1(),
Expand All @@ -140,16 +94,10 @@ def metrics_dict(self) -> dict[str, float]:
"fr_recall": self.metrics["fr_layer"].macro_recall(),
"fr_precision": self.metrics["fr_layer"].macro_precision(),
"fr_depth_interval_accuracy": self.metrics["fr_depth_interval"].macro_precision(),
"coordinate_f1": coordinates_metrics.f1,
"coordinate_recall": coordinates_metrics.recall,
"coordinate_precision": coordinates_metrics.precision,
"groundwater_f1": groundwater_metrics.f1,
"groundwater_recall": groundwater_metrics.recall,
"groundwater_precision": groundwater_metrics.precision,
"groundwater_depth_f1": groundwater_depth_metrics.f1,
"groundwater_depth_recall": groundwater_depth_metrics.recall,
"groundwater_depth_precision": groundwater_depth_metrics.precision,
"elevation_f1": elevation_metrics.f1,
"elevation_recall": elevation_metrics.recall,
"elevation_precision": elevation_metrics.precision,
}
Loading
Loading