Skip to content

Commit

Permalink
Merge pull request #88 from swisstopo/LGVISIUM-83-Extract-coordinates…
Browse files Browse the repository at this point in the history
…-with-non-integer-values

Close #LGVISIUM-83: Extract coordinates with non-integer values
  • Loading branch information
dcleres authored Oct 7, 2024
2 parents b164bb8 + 066307e commit 2771506
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 15 deletions.
Binary file added example/A7367.pdf
Binary file not shown.
4 changes: 2 additions & 2 deletions src/stratigraphy/annotations/draw.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,10 +138,10 @@ def draw_metadata(
"""
# TODO associate correctness with the extracted coordinates in a better way
coordinate_color = "green" if is_coordinate_correct else "red"
coordinate_rect = fitz.Rect([5, 5, 200, 25])
coordinate_rect = fitz.Rect([5, 5, 250, 30])

elevation_color = "green" if is_elevation_correct else "red"
elevation_rect = fitz.Rect([5, 25, 200, 45])
elevation_rect = fitz.Rect([5, 30, 250, 55])

shape.draw_rect(coordinate_rect * derotation_matrix)
shape.finish(fill=fitz.utils.getColor("gray"), fill_opacity=0.5)
Expand Down
35 changes: 26 additions & 9 deletions src/stratigraphy/metadata/coordinate_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@

logger = logging.getLogger(__name__)

COORDINATE_ENTRY_REGEX = r"(?:([12])[\.\s'‘’]{0,2})?(\d{3})[\.\s'‘’]{0,2}(\d{3})\.?\d?"
COORDINATE_ENTRY_REGEX = r"(?:([12])[\.\s'‘’]{0,2})?(\d{3})[\.\s'‘’]{0,2}(\d{3})(?:\.(\d{1,}))?"


@dataclass(kw_only=True)
class CoordinateEntry:
"""Dataclass to represent a coordinate entry."""

coordinate_value: int
coordinate_value: float

def __repr__(self):
if self.coordinate_value > 1e5:
Expand All @@ -44,7 +44,7 @@ def __post_init__(self):
self.north, self.east = self.east, self.north

def __str__(self):
return f"E: {self.east}, N: {self.north}"
return f"E: {self.east.coordinate_value}, N: {self.north.coordinate_value}"

def to_json(self) -> dict:
"""Converts the object to a dictionary.
Expand All @@ -64,7 +64,18 @@ def is_valid(self):
pass

@staticmethod
def from_values(east: int, north: int, rect: fitz.Rect, page: int) -> Coordinate | None:
def from_values(east: float, north: float, rect: fitz.Rect, page: int) -> Coordinate | None:
"""Creates a Coordinate object from the given values.
Args:
east (float): The east coordinate value.
north (float): The north coordinate value.
rect (fitz.Rect): The rectangle that contains the extracted information.
page (int): The page number of the PDF document.
Returns:
Coordinate | None: The coordinate object.
"""
if 1e6 < east < 1e7:
return LV95Coordinate(
east=CoordinateEntry(coordinate_value=east),
Expand Down Expand Up @@ -104,7 +115,9 @@ class LV95Coordinate(Coordinate):

def is_valid(self):
"""Reference: https://de.wikipedia.org/wiki/Schweizer_Landeskoordinaten#Beispielkoordinaten."""
return 2324800 < self.east.coordinate_value < 2847500 and 1074000 < self.north.coordinate_value < 1302000
return (
2324800.0 < self.east.coordinate_value < 2847500.0 and 1074000.0 < self.north.coordinate_value < 1302000.0
)


@dataclass
Expand All @@ -116,7 +129,7 @@ def is_valid(self):
To account for uncertainties in the conversion of LV03 to LV95, we allow a margin of 2.
"""
return 324798 < self.east.coordinate_value < 847502 and 73998 < self.north.coordinate_value < 302002
return 324798.0 < self.east.coordinate_value < 847502.0 and 73998.0 < self.north.coordinate_value < 302002.0


class CoordinateExtractor(DataExtractor):
Expand Down Expand Up @@ -224,12 +237,16 @@ def get_coordinates_from_lines(self, lines: list[TextLine], page: int) -> list[C
list[Coordinate]: A list of potential coordinates
"""
full_regex = regex.compile(
r"(?:[XY][=:\s]{0,2})?" + COORDINATE_ENTRY_REGEX + r".{0,4}?[XY]?[=:\s]{0,2}" + COORDINATE_ENTRY_REGEX
r"(?:[XY][=:\s]{0,2})?"
+ COORDINATE_ENTRY_REGEX
+ r".{0,4}?[XY]?[=:\s]{0,2}"
+ COORDINATE_ENTRY_REGEX
+ r"\b"
)
potential_coordinates = [
Coordinate.from_values(
east=int("".join(match.groups(default="")[:3])),
north=int("".join(match.groups(default="")[3:])),
east=float("{}.{}".format("".join(match.groups(default="")[:3]), match.groups(default="")[3])),
north=float("{}.{}".format("".join(match.groups(default="")[4:-1]), match.groups(default="")[-1])),
rect=rect,
page=page,
)
Expand Down
51 changes: 47 additions & 4 deletions tests/test_coordinate_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def test_strLV95(): # noqa: D103
rect=fitz.Rect(),
page=1,
)
assert str(coord) == "E: 2'789'456, N: 1'123'012"
assert str(coord) == "E: 2789456, N: 1123012"


def test_to_jsonLV95(): # noqa: D103
Expand Down Expand Up @@ -52,7 +52,7 @@ def test_strLV03(): # noqa: D103
rect=fitz.Rect(),
page=1,
)
assert str(coord) == "E: 789'456, N: 123'012"
assert str(coord) == "E: 789456, N: 123012"


def test_to_jsonLV03(): # noqa: D103
Expand All @@ -67,6 +67,7 @@ def test_to_jsonLV03(): # noqa: D103


doc = fitz.open(DATAPATH.parent / "example" / "example_borehole_profile.pdf")
doc_with_digits_in_coordinates = fitz.open(DATAPATH.parent / "example" / "A7367.pdf")
extractor = CoordinateExtractor(doc)


Expand All @@ -76,8 +77,18 @@ def test_CoordinateExtractor_extract_coordinates(): # noqa: D103
coordinates = extractor.extract_coordinates()
# Check if the returned value is a list
assert isinstance(coordinates, Coordinate)
assert repr(coordinates.east) == "615'790"
assert repr(coordinates.north) == "157'500"
assert repr(coordinates.east) == "615'790.0"
assert repr(coordinates.north) == "157'500.0"


def test_CoordinateExtractor_extract_coordinates_with_digits_in_coordinates():  # noqa: D103
    """Extract coordinates from a PDF whose coordinate values contain decimal digits."""
    # Use a dedicated extractor for the sample document with decimal coordinates.
    decimal_extractor = CoordinateExtractor(doc_with_digits_in_coordinates)
    coordinates = decimal_extractor.extract_coordinates()

    # The extractor is expected to return a single Coordinate instance.
    assert isinstance(coordinates, Coordinate)
    # The expected repr strings include the apostrophe separator and the fractional part.
    assert repr(coordinates.east) == "607'562.0"
    assert repr(coordinates.north) == "187'087.5"


def _create_simple_lines(text_lines: list[str]) -> list[TextLine]:
Expand Down Expand Up @@ -211,3 +222,35 @@ def test_CoordinateExtractor_get_coordinates_from_lines_rect(): # noqa: D103
expected_rect.include_rect(lines[2].rect)
assert coordinates[0].rect == expected_rect
assert coordinates[0].page == 1

# Example from 269126143-bp.pdf (a slash in the middle of the coordinates was misread by OCR as the digit 1)
lines = _create_simple_lines(["269578211260032"])
coordinates = extractor.get_coordinates_from_lines(lines, page=1)
assert coordinates[0].east.coordinate_value == 2695782
assert coordinates[0].north.coordinate_value == 1260032


def test_get_single_decimal_coordinates():
    """Check that coordinates with one decimal digit are extracted from text lines."""
    # Each case: (input text, expected east value, expected north value).
    cases = [
        ("615.790.6 / 157.500.5", 615790.6, 157500.5),
        ("2600000.6 / 1200000.5", 2600000.6, 1200000.5),
    ]
    for text, expected_east, expected_north in cases:
        found = extractor.get_coordinates_from_lines(_create_simple_lines([text]), page=1)
        # Only the first extracted coordinate is checked.
        assert found[0].east.coordinate_value == expected_east
        assert found[0].north.coordinate_value == expected_north


def test_get_double_decimal_coordinates():
    """Check that coordinates with two decimal digits are extracted from text lines."""
    # Each case: (input text, expected east value, expected north value).
    cases = [
        ("615.790.64 / 157.500.55", 615790.64, 157500.55),
        ("2600000.64 / 1200000.55", 2600000.64, 1200000.55),
    ]
    for text, expected_east, expected_north in cases:
        found = extractor.get_coordinates_from_lines(_create_simple_lines([text]), page=1)
        # Only the first extracted coordinate is checked.
        assert found[0].east.coordinate_value == expected_east
        assert found[0].north.coordinate_value == expected_north

1 comment on commit 2771506

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
FileStmtsMissCoverMissing
src/stratigraphy
   __init__.py8188%11
   extract.py1861860%3–483
   get_files.py19190%3–47
   main.py1191190%3–310
src/stratigraphy/data_extractor
   data_extractor.py50394%32, 62, 98
src/stratigraphy/depthcolumn
   boundarydepthcolumnvalidator.py412051%47, 57, 60, 81–84, 110–128, 140–149
   depthcolumn.py1946467%25, 29, 50, 56, 59–60, 84, 87, 94, 101, 109–110, 120, 137–153, 191, 228, 247–255, 266, 271, 278, 309, 314–321, 336–337, 380–422
   depthcolumnentry.py28679%17, 21, 36, 39, 56, 65
   find_depth_columns.py1061982%42–43, 73, 86, 180–181, 225–245
src/stratigraphy/layer
   layer_identifier_column.py745230%16–17, 20, 28, 43, 47, 51, 59–63, 66, 74, 91–96, 99, 112, 125–126, 148–158, 172–199
src/stratigraphy/lines
   geometric_line_utilities.py86298%81, 131
   line.py51492%25, 50, 60, 110
   linesquadtree.py46198%75
src/stratigraphy/metadata
   coordinate_extraction.py108595%30, 64, 94–95, 107
src/stratigraphy/text
   description_block_splitter.py70297%24, 139
   extract_text.py29390%19, 53–54
   find_description.py642856%27–35, 50–63, 79–95, 172–175
   textblock.py80989%28, 56, 64, 89, 101, 124, 145, 154, 183
src/stratigraphy/util
   dataclasses.py32391%37–39
   interval.py1045547%29–32, 37–40, 46, 52, 56, 66–68, 107–153, 174, 180–196
   predictions.py1071070%3–282
   util.py391756%41, 69–76, 90–92, 116–117, 129–133
TOTAL164172556% 

Tests Skipped Failures Errors Time
82 0 💤 0 ❌ 0 🔥 5.954s ⏱️

Please sign in to comment.