Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lgvisium 100/extract depth value error handling #107

Merged
merged 12 commits into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/app/api/v1/endpoints/extract_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def create_response(coord, srs):
),
)

coord_extractor = CoordinateExtractor(pdf_page)
coord_extractor = CoordinateExtractor()
extracted_coord = coord_extractor.extract_coordinates_from_bbox(
pdf_page, extract_data_request.page_number, user_defined_bbox
)
Expand Down
45 changes: 29 additions & 16 deletions src/stratigraphy/data_extractor/data_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

import fitz
import regex
from stratigraphy.data_extractor.utility import get_lines_near_rect
from stratigraphy.lines.line import TextLine
from stratigraphy.util.util import read_params

Expand Down Expand Up @@ -96,7 +95,6 @@ class DataExtractor(ABC):
This class defines the interface for extracting data from stratigraphy data files.
"""

doc: fitz.Document = None
feature_keys: list[str] = None
feature_fp_keys: list[str] = None
feature_name: str = None
Expand All @@ -112,17 +110,15 @@ class DataExtractor(ABC):

preprocess_replacements: dict[str, str] = {}

def __init__(self, document: fitz.Document):
def __init__(self):
"""Initializes the DataExtractor object.

Args:
document (fitz.Document): A PDF document.
feature_name (str): The name of the feature to extract.
"""
if not self.feature_name:
raise ValueError("Feature name must be specified.")

self.doc = document
self.feature_keys = read_params("matching_params.yml")[f"{self.feature_name}_keys"]
self.feature_fp_keys = read_params("matching_params.yml")[f"{self.feature_name}_fp_keys"] or []

Expand Down Expand Up @@ -193,7 +189,7 @@ def get_lines_near_key(self, lines, key_line: TextLine) -> list[TextLine]:
list[TextLine]: The lines close to the key.
"""
key_rect = key_line.rect
feature_lines = self.get_lines_near_rect(lines, key_rect)
feature_lines = self.get_axis_aligned_lines(lines, key_rect)

# Insert key_line first and remove duplicates
feature_lines.insert(0, key_line)
Expand All @@ -204,21 +200,38 @@ def get_lines_near_key(self, lines, key_line: TextLine) -> list[TextLine]:

return feature_lines_sorted

def get_lines_near_rect(self, lines, rect: fitz.Rect) -> list[TextLine]:
"""Find the lines of the text that are close to a given rectangle.
def get_axis_aligned_lines(self, lines: list[TextLine], rect: fitz.Rect) -> list[TextLine]:
stijnvermeeren-swisstopo marked this conversation as resolved.
Show resolved Hide resolved
"""Find the lines of text that are horizontally and vertically close to a given rectangle.

Lines that are found both horizontally and vertically are included only once.

Args:
lines (list[TextLine]): Arbitrary text lines to search in.
rect (fitz.Rect): The rectangle to search around.

Returns:
list[TextLine]: The lines close to the rectangle.
list[TextLine]: A combined list of lines close to the rectangle within the horizontal
(left/right) and vertical (above/below) regions, with intersection included only once.
"""
return get_lines_near_rect(
self.search_left_factor,
self.search_right_factor,
self.search_above_factor,
self.search_below_factor,
lines,
rect,
# Horizontal rectangle (left-right limits)
horizontal_rect = fitz.Rect(
rect.x0 - self.search_left_factor * rect.width,
rect.y0,
rect.x1 + self.search_right_factor * rect.width,
rect.y1,
)

# Vertical rectangle (above-below limits)
vertical_rect = fitz.Rect(
rect.x0,
rect.y0 - self.search_above_factor * rect.height,
rect.x1,
rect.y1 + self.search_below_factor * rect.height,
)

horizontal_lines = {line for line in lines if line.rect.intersects(horizontal_rect)}
vertical_lines = {line for line in lines if line.rect.intersects(vertical_rect)}

feature_lines = horizontal_lines | vertical_lines

return list(feature_lines)
137 changes: 73 additions & 64 deletions src/stratigraphy/groundwater/groundwater_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
from dataclasses import dataclass
from datetime import date as dt
from datetime import datetime
from pathlib import Path

import fitz
import numpy as np
from stratigraphy.data_extractor.data_extractor import DataExtractor, ExtractedFeature, FeatureOnPage
from stratigraphy.data_extractor.utility import get_lines_near_rect
from stratigraphy.depths_materials_column_pairs.bounding_boxes import BoundingBox
from stratigraphy.groundwater.utility import extract_date, extract_depth, extract_elevation
from stratigraphy.lines.line import TextLine
from stratigraphy.metadata.elevation_extraction import Elevation
from stratigraphy.text.extract_text import extract_text_lines

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -114,30 +114,43 @@ class GroundwaterInDocument:
filename: str

@classmethod
def from_document(cls, doc: fitz.Document, terrain_elevation: Elevation | None = None) -> "GroundwaterInDocument":
"""Initializes the GroundwaterInDocument object and extracts the groundwater from the document.
def near_material_description(
cls,
document: fitz.Document,
page_number: int,
lines: list[TextLine],
material_description_bbox: BoundingBox,
terrain_elevation: Elevation | None = None,
) -> list[FeatureOnPage[Groundwater]]:
"""Extracts groundwater information from a near material description bounding box on a page.

Args:
doc (fitz.Document): The PDF document.
document (fitz.Document): The PDF document.
page_number (int): The page number (1-based) to process.
lines (list[TextLine]): The list of text lines to retrieve the groundwater from.
material_description_bbox (BoundingBox): The material description bounding box near which to search for
groundwater information.
terrain_elevation (Elevation | None): The elevation of the terrain.

Returns:
GroundwaterInDocument: The extracted groundwater information from the document.
list[FeatureOnPage[Groundwater]]: The groundwater information near a material description bounding box.
"""
filename = Path(doc.name).name

groundwater_extractor = GroundwaterLevelExtractor(document=doc)
groundwater: list[FeatureOnPage[Groundwater]] = groundwater_extractor.extract_groundwater(terrain_elevation)

return GroundwaterInDocument(groundwater=groundwater, filename=filename)

def get_groundwater_per_page(self) -> list[FeatureOnPage[Groundwater]]:
"""Returns the groundwater information in the document.
groundwater_extractor = GroundwaterLevelExtractor()

lines_for_groundwater_key = get_lines_near_rect(
search_left_factor=4,
search_right_factor=4,
search_above_factor=2,
search_below_factor=3,
lines=lines,
rect=material_description_bbox.rect,
)

Returns:
list[FeatureOnPage[Groundwater]]: The groundwater information in the document.
"""
return self.groundwater
return groundwater_extractor.extract_groundwater(
page_number=page_number,
lines=lines_for_groundwater_key,
document=document,
terrain_elevation=terrain_elevation,
)

def to_json(self) -> list[dict]:
"""Converts the object to a list of dictionaries.
Expand All @@ -157,14 +170,14 @@ class GroundwaterLevelExtractor(DataExtractor):

# look for elevation values to the left, right and/or immediately below the key
search_left_factor: float = 2
search_right_factor: float = 10
search_below_factor: float = 4
search_right_factor: float = 8
search_below_factor: float = 2
search_above_factor: float = 0

preprocess_replacements = {",": ".", "'": ".", "o": "0", "\n": " ", "ü": "u"}

def __init__(self, document):
super().__init__(document)
def __init__(self):
super().__init__()

self.is_searching_groundwater_illustration = os.getenv("IS_SEARCHING_GROUNDWATER_ILLUSTRATION") == "True"
if self.is_searching_groundwater_illustration:
Expand Down Expand Up @@ -194,18 +207,14 @@ def get_groundwater_near_key(self, lines: list[TextLine], page: int) -> list[Fea
key_center = (key_rect.x0 + key_rect.x1) / 2
groundwater_info_lines.sort(key=lambda line: abs((line.rect.x0 + line.rect.x1) / 2 - key_center))

try:
extracted_gw = self.get_groundwater_info_from_lines(groundwater_info_lines, page)
if extracted_gw.feature.depth or extracted_gw.feature.elevation:
# if the depth or elevation is extracted, add the extracted groundwater information to the list
extracted_groundwater_list.append(extracted_gw)
except ValueError as error:
logger.warning("ValueError: %s", error)
logger.warning("Could not extract groundwater information from the lines near the key.")
extracted_groundwater = self.get_groundwater_info_from_lines(groundwater_info_lines, page)
if extracted_groundwater:
# if the depth or elevation is extracted, add the extracted groundwater information to the list
extracted_groundwater_list.append(extracted_groundwater)

return extracted_groundwater_list

def get_groundwater_info_from_lines(self, lines: list[TextLine], page: int) -> FeatureOnPage[Groundwater]:
def get_groundwater_info_from_lines(self, lines: list[TextLine], page: int) -> FeatureOnPage[Groundwater] | None:
"""Extracts the groundwater information from a list of text lines.

Args:
Expand All @@ -219,7 +228,6 @@ def get_groundwater_info_from_lines(self, lines: list[TextLine], page: int) -> F
elevation: float | None = None

matched_lines_rect = []

for idx, line in enumerate(lines):
text = self.preprocess(line.text)

Expand Down Expand Up @@ -303,9 +311,11 @@ def get_groundwater_info_from_lines(self, lines: list[TextLine], page: int) -> F
page=page,
)
else:
raise ValueError("Could not extract all required information from the lines provided.")
logger.warning("Could not extract groundwater depth nor elevation from the lines near the key.")

def extract_groundwater(self, terrain_elevation: Elevation | None) -> list[FeatureOnPage[Groundwater]]:
def extract_groundwater(
self, page_number: int, lines: list[TextLine], document: fitz.Document, terrain_elevation: Elevation | None
) -> list[FeatureOnPage[Groundwater]]:
"""Extracts the groundwater information from a borehole profile.

Processes the borehole profile page by page and tries to find the groundwater information in the respective text of the
Expand All @@ -314,41 +324,40 @@ def extract_groundwater(self, terrain_elevation: Elevation | None) -> list[Featu
1. if that gives no results, search for groundwater information close to an explicit "groundwater" label (e.g. "Gswp")

Args:
terrain_elevation (ElevationInformation | None): The elevation of the borehole.
page_number (int): The page number (1-based) of the PDF document.
lines (list[TextLine]): The lines of text to extract the groundwater information from.
document (fitz.Document): The document used to extract groundwater from illustration.
terrain_elevation (Elevation | None): The elevation of the borehole.

Returns:
list[FeatureOnPage[Groundwater]]: the extracted groundwater information (if any)
"""
for page in self.doc:
lines = extract_text_lines(page)
page_number = page.number + 1 # NOTE: page.number is 0-based

found_groundwater = self.get_groundwater_near_key(lines, page_number)
if not found_groundwater and self.is_searching_groundwater_illustration:
from stratigraphy.groundwater.gw_illustration_template_matching import (
get_groundwater_from_illustration,
)

# Extract groundwater from illustration
found_groundwater, confidence_list = get_groundwater_from_illustration(
self, lines, page_number, terrain_elevation
)
if found_groundwater:
logger.info("Confidence list: %s", confidence_list)
logger.info("Found groundwater from illustration on page %s: %s", page_number, found_groundwater)

if terrain_elevation:
# If the elevation is provided, calculate the depth of the groundwater
for entry in found_groundwater:
if not entry.feature.depth and entry.feature.elevation:
entry.feature.depth = round(terrain_elevation.elevation - entry.feature.elevation, 2)
if not entry.feature.elevation and entry.feature.depth:
entry.feature.elevation = round(terrain_elevation.elevation - entry.feature.depth, 2)
found_groundwater = self.get_groundwater_near_key(lines, page_number)
if not found_groundwater and self.is_searching_groundwater_illustration:
from stratigraphy.groundwater.gw_illustration_template_matching import (
get_groundwater_from_illustration,
)

# Extract groundwater from illustration
found_groundwater, confidence_list = get_groundwater_from_illustration(
self, lines, page_number, document, terrain_elevation
)
if found_groundwater:
groundwater_output = ", ".join([str(entry.feature) for entry in found_groundwater])
logger.info("Found groundwater information on page %s: %s", page_number, groundwater_output)
return found_groundwater
logger.info("Confidence list: %s", confidence_list)
logger.info("Found groundwater from illustration on page %s: %s", page_number, found_groundwater)

if terrain_elevation:
# If the elevation is provided, calculate the depth of the groundwater
for entry in found_groundwater:
if not entry.feature.depth and entry.feature.elevation:
entry.feature.depth = round(terrain_elevation.elevation - entry.feature.elevation, 2)
if not entry.feature.elevation and entry.feature.depth:
entry.feature.elevation = round(terrain_elevation.elevation - entry.feature.depth, 2)

if found_groundwater:
groundwater_output = ", ".join([str(entry.feature) for entry in found_groundwater])
logger.info("Found groundwater information on page %s: %s", page_number, groundwater_output)
return found_groundwater

logger.info("No groundwater found in this borehole profile.")
return []
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,16 @@ def get_groundwater_from_illustration(
groundwater_extractor: GroundwaterLevelExtractor,
lines: list[TextLine],
page_number: int,
document: fitz.Document,
terrain_elevation: Elevation | None,
) -> tuple[list[FeatureOnPage[Groundwater]], list[float]]:
"""Extracts the groundwater information from an illustration.

Args:
groundwater_extractor (GroundwaterLevelExtractor): the groundwater level extractor
lines (list[TextLine]): the lines of text to extract the groundwater information from
page_number (int): the page number (1-based) of the PDF document
groundwater_extractor (GroundwaterLevelExtractor): the groundwater level extractor.
lines (list[TextLine]): The lines of text to extract the groundwater information from.
page_number (int): The page number (1-based) of the PDF document.
document (fitz.Document): The PDF document whose page is rendered to search the illustration for groundwater.
terrain_elevation (Elevation | None): The elevation of the terrain.

Returns:
Expand All @@ -57,8 +59,8 @@ def get_groundwater_from_illustration(
confidence_list = []

# convert the doc to an image
page = groundwater_extractor.doc.load_page(page_number - 1)
filename = Path(groundwater_extractor.doc.name).stem
page = document.load_page(page_number - 1)
filename = Path(document.name).stem
png_filename = f"{filename}-{page_number + 1}.png"
png_path = f"/tmp/{png_filename}" # Local path to save the PNG
fitz.utils.get_pixmap(page, matrix=fitz.Matrix(2, 2), clip=page.rect).save(png_path)
Expand Down
19 changes: 11 additions & 8 deletions src/stratigraphy/groundwater/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,23 @@ def extract_depth(text: str, max_depth: int) -> float | None:
depth_patterns = [
r"([\d.]+)\s*m\s*u\.t\.",
r"([\d.]+)\s*m\s*u\.t",
r"(\d+.\d+)",
r"(\d+\.\d+)",
]

depth = None
corrected_text = correct_ocr_text(text).lower()
for pattern in depth_patterns:
depth_match = regex.search(pattern, corrected_text)
if depth_match:
depth = float(depth_match.group(1).replace(",", "."))
if depth > max_depth:
# If the extracted depth is greater than the max depth, set it to None and continue searching.
depth = None
else:
break
try:
if depth_match:
depth = float(depth_match.group(1).replace(",", "."))
if depth > max_depth:
# If the extracted depth is greater than the max depth, set it to None and continue searching.
depth = None
else:
break
except ValueError:
continue
return depth


Expand Down
Loading
Loading