Skip to content

Commit

Permalink
Merge pull request #101 from swisstopo/LGVISIUM-80/review-stijn
Browse files Browse the repository at this point in the history
LGVISIUM-80: review proposal Stijn
  • Loading branch information
stijnvermeeren-swisstopo authored Oct 30, 2024
2 parents 54631f1 + aa07938 commit 5c6db29
Show file tree
Hide file tree
Showing 14 changed files with 176 additions and 135 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pipeline_run.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'
python-version: '3.11'
- name: Create Environment and run pipeline
shell: bash
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pre-commit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v3
with:
python-version: 3.10.14
python-version: '3.11'
- uses: pre-commit/[email protected]
2 changes: 1 addition & 1 deletion .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'
python-version: '3.11'
- name: Create Environment and run tests
shell: bash
run: |
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ name = "swissgeol-boreholes-dataextraction"
version = "0.0.1-dev"
description = "Python project to analyse borehole profiles."
readme = "README.md"
requires-python = ">=3.10"
requires-python = ">=3.11"
dependencies = [
"boto3",
"pandas",
Expand Down
9 changes: 5 additions & 4 deletions src/stratigraphy/annotations/draw.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
import fitz
import pandas as pd
from dotenv import load_dotenv
from stratigraphy.data_extractor.data_extractor import FeatureOnPage
from stratigraphy.depthcolumn.depthcolumn import DepthColumn
from stratigraphy.depths_materials_column_pairs.depths_materials_column_pairs import DepthsMaterialsColumnPairs
from stratigraphy.groundwater.groundwater_extraction import GroundwaterOnPage
from stratigraphy.groundwater.groundwater_extraction import Groundwater
from stratigraphy.layer.layer import Layer
from stratigraphy.metadata.coordinate_extraction import Coordinate
from stratigraphy.metadata.elevation_extraction import Elevation
Expand Down Expand Up @@ -90,7 +91,7 @@ def draw_predictions(
draw_coordinates(shape, coordinates)
if elevation is not None and page_number == elevation.page:
draw_elevation(shape, elevation)
for groundwater_entry in file_prediction.get_groundwater_entries():
for groundwater_entry in file_prediction.groundwater.groundwater:
if page_number == groundwater_entry.page:
draw_groundwater(shape, groundwater_entry)
draw_depth_columns_and_material_rect(
Expand Down Expand Up @@ -197,12 +198,12 @@ def draw_coordinates(shape: fitz.Shape, coordinates: Coordinate) -> None:
shape.finish(color=fitz.utils.getColor("purple"))


def draw_groundwater(shape: fitz.Shape, groundwater_entry: GroundwaterOnPage) -> None:
def draw_groundwater(shape: fitz.Shape, groundwater_entry: FeatureOnPage[Groundwater]) -> None:
"""Draw a bounding box around the area of the page where the groundwater information was extracted from.
Args:
shape (fitz.Shape): The shape object for drawing.
groundwater_entry (GroundwaterOnPage): The groundwater information to draw.
groundwater_entry (FeatureOnPage[Groundwater]): The groundwater information to draw.
"""
shape.draw_rect(groundwater_entry.rect)
shape.finish(color=fitz.utils.getColor("pink"))
Expand Down
79 changes: 75 additions & 4 deletions src/stratigraphy/data_extractor/data_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import logging
from abc import ABC, ABCMeta, abstractmethod
from dataclasses import dataclass
from typing import Generic, Self, TypeVar

import fitz
import regex
Expand All @@ -16,13 +17,9 @@
logger = logging.getLogger(__name__)


@dataclass(kw_only=True)
class ExtractedFeature(metaclass=ABCMeta):
"""Class for extracted feature information."""

rect: fitz.Rect # The rectangle that contains the extracted information
page: int # The page number of the PDF document

@abstractmethod
def is_valid(self) -> bool:
"""Checks if the information is valid.
Expand All @@ -32,6 +29,72 @@ def is_valid(self) -> bool:
"""
pass

@abstractmethod
def to_json(self) -> dict:
    """Serialise the extracted feature into a plain dictionary.

    Concrete subclasses must override this method.

    Returns:
        dict: The object as a dictionary.
    """

@classmethod
@abstractmethod
def from_json(cls, data: dict) -> Self:
"""Converts a dictionary to an object.
Args:
data (dict): A dictionary representing the information.
Returns:
Self: An instance of the class.
"""
pass


T = TypeVar("T", bound=ExtractedFeature)


@dataclass
class FeatureOnPage(Generic[T]):
    """Class for an extracted feature, together with the page and where on that page the feature was extracted from."""

    feature: T  # The extracted feature itself
    rect: fitz.Rect  # The rectangle that contains the extracted information
    page: int  # The page number of the PDF document

    def to_json(self) -> dict:
        """Converts the object to a dictionary.

        The dictionary produced by the wrapped feature's ``to_json`` is extended with the keys ``"page"`` and
        ``"rect"``. A new dictionary is built, so the dictionary returned by the feature is never modified.

        Returns:
            dict: The feature's dictionary plus ``"page"`` and ``"rect"`` (``[x0, y0, x1, y1]``, or None when no
                rectangle is set).
        """
        rect = self.rect
        return {
            **self.feature.to_json(),
            # Written as-is: a truthiness check would turn a page value of 0 into None.
            "page": self.page,
            # Compare against None explicitly rather than relying on truthiness: per the PyMuPDF documentation
            # an all-zero Rect is falsy, which would serialise a real rectangle as None and make the result
            # unreadable for from_json.
            "rect": [rect.x0, rect.y0, rect.x1, rect.y1] if rect is not None else None,
        }

    @classmethod
    def from_json(cls, data: dict, feature_cls: type[T]) -> Self:
        """Converts a dictionary to an object.

        Args:
            data (dict): A dictionary representing the feature on a page information. It must contain the keys
                ``"page"`` and ``"rect"``; the whole dictionary is also passed on to ``feature_cls.from_json``.
            feature_cls (type[T]): The class of the wrapped feature, used to rebuild the feature from ``data``.

        Returns:
            Self: The resulting FeatureOnPage object.
        """
        return cls(
            feature=feature_cls.from_json(data),
            page=data["page"],
            rect=fitz.Rect(data["rect"]),
        )


class DataExtractor(ABC):
"""Abstract class for data extraction from stratigraphy data files.
Expand Down Expand Up @@ -70,6 +133,14 @@ def __init__(self, document: fitz.Document):
self.feature_fp_keys = read_params("matching_params.yml")[f"{self.feature_name}_fp_keys"] or []

def preprocess(self, value: str) -> str:
    """Applies the configured text replacements to a value before it is searched for the feature.

    Args:
        value (str): The value to preprocess.

    Returns:
        str: The value after every entry of ``self.preprocess_replacements`` has been applied, one after the
            other in the mapping's iteration order.
    """
    cleaned = value
    for pattern, substitute in self.preprocess_replacements.items():
        cleaned = cleaned.replace(pattern, substitute)
    return cleaned
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ def from_json(cls, json_depths_materials_column_pairs: dict) -> "DepthsMaterials
Returns:
DepthsMaterialsColumnPairs: The depths materials column pairs object.
"""
depth_column = DepthColumnFactory.create(json_depths_materials_column_pairs["depth_column"])
depth_column_entry = json_depths_materials_column_pairs["depth_column"]
depth_column = DepthColumnFactory.create(depth_column_entry) if depth_column_entry else None
material_description_rect = fitz.Rect(json_depths_materials_column_pairs["material_description_rect"])
page = json_depths_materials_column_pairs["page"]

Expand Down
12 changes: 6 additions & 6 deletions src/stratigraphy/evaluation/groundwater_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,24 +94,24 @@ def evaluate(self) -> OverallGroundwaterMetrics:
groundwater_metrics = count_against_ground_truth(
[
(
entry.groundwater.depth,
entry.groundwater.format_date(),
entry.groundwater.elevation,
entry.feature.depth,
entry.feature.format_date(),
entry.feature.elevation,
)
for entry in groundwater_in_doc.groundwater
],
[(entry.depth, entry.format_date(), entry.elevation) for entry in gt_groundwater],
)
groundwater_depth_metrics = count_against_ground_truth(
[entry.groundwater.depth for entry in groundwater_in_doc.groundwater],
[entry.feature.depth for entry in groundwater_in_doc.groundwater],
[entry.depth for entry in gt_groundwater],
)
groundwater_elevation_metrics = count_against_ground_truth(
[entry.groundwater.elevation for entry in groundwater_in_doc.groundwater],
[entry.feature.elevation for entry in groundwater_in_doc.groundwater],
[entry.elevation for entry in gt_groundwater],
)
groundwater_date_metrics = count_against_ground_truth(
[entry.groundwater.format_date() for entry in groundwater_in_doc.groundwater],
[entry.feature.format_date() for entry in groundwater_in_doc.groundwater],
[entry.format_date() for entry in gt_groundwater],
)

Expand Down
Loading

0 comments on commit 5c6db29

Please sign in to comment.