Merge pull request #79 from swisstopo/LGVISIUM-74-Update-the-API-based-on-Feedback

Close #LGVISIUM-74: Update the API based on the new s3 specs & feedback
swisstopo · Sep 11, 2024 · b7d34ce · b7d34ce · github-actions · Sep 11, 2024
2 parents 6451af6 + 07d5348
commit b7d34ce
Show file tree

Hide file tree

Showing 5 changed files with 13 additions and 13 deletions.
diff --git a/src/app/api/v1/endpoints/create_pngs.py b/src/app/api/v1/endpoints/create_pngs.py
@@ -14,7 +14,7 @@ def create_pngs(aws_filename: Path):
     """Convert a PDF document to PNG images. Please note that this function will overwrite any existing PNG files.
 
     Args:
-        aws_filename (str): The name of the PDF document in the S3 bucket. For example, "pdfs/10012.pdf".
+        aws_filename (str): The key of the PDF document in the S3 bucket. For example, "10012.pdf".
 
     Returns:
         PNGResponse: The URLs of the PNG images in the S3 bucket.
@@ -38,7 +38,7 @@ def create_pngs(aws_filename: Path):
             pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))
             png_filename = f"{filename}-{page_number + 1}.png"
             png_path = f"/tmp/{png_filename}"  # Local path to save the PNG
-            s3_bucket_png_path = f"pngs/{png_filename}"
+            s3_bucket_png_path = f"dataextraction/{png_filename}"
 
             pix.save(png_path)
 

diff --git a/src/app/api/v1/endpoints/extract_data.py b/src/app/api/v1/endpoints/extract_data.py
@@ -44,7 +44,7 @@ def extract_data(extract_data_request: ExtractDataRequest) -> ExtractDataRespons
     pdf_page_height = pdf_page.rect.height
 
     # Load the PNG image the boreholes app is showing to the user
-    # Convert the PDF filename to a PNG filename: "pdfs/geoquat/train/10012.pdf" -> 'pngs/geoquat/train/10012_0.png'
+    # Convert the PDF filename to a PNG filename: "10012.pdf" -> 'dataextraction/10012-1.png'
     # Remove the file extension and replace it with '.png'
     base_filename = extract_data_request.filename.stem
     png_filename = Path(f"{base_filename}-{extract_data_request.page_number}.png")

diff --git a/src/app/common/aws.py b/src/app/common/aws.py
@@ -26,7 +26,7 @@ def load_pdf_from_aws(filename: Path) -> fitz.Document:
     """
     # Load the PDF from the S3 object
     try:
-        data = load_data_from_aws(filename, "pdfs")
+        data = load_data_from_aws(filename)
         pdf_document = fitz.open(stream=data, filetype="pdf")
     except Exception:
         raise HTTPException(
@@ -45,7 +45,7 @@ def load_png_from_aws(filename: Path) -> np.ndarray:
     Returns:
         ndarray: The loaded PNG image.
     """
-    data = load_data_from_aws(filename, "pngs")
+    data = load_data_from_aws(filename, "dataextraction")
 
     # Convert the PNG data to an image using PIL
     image = Image.open(io.BytesIO(data))
@@ -54,21 +54,21 @@ def load_png_from_aws(filename: Path) -> np.ndarray:
     return np.array(image)
 
 
-def load_data_from_aws(filename: Path, format: str) -> bytes:
+def load_data_from_aws(filename: Path, prefix: str = "") -> bytes:
     """Load a document from AWS S3.
 
     Args:
         filename (str): The filename of the PNG image.
-        format (str): The format of the file.
+        prefix (str): The prefix of the file in the bucket.
 
     Returns:
         bytes: The loaded PNG image.
     """
     # Check if the PNG exists in S3
     try:
-        png_object = s3_client.get_object(Bucket=config.bucket_name, Key=str(format / filename))
+        png_object = s3_client.get_object(Bucket=config.bucket_name, Key=str(prefix / filename))
     except s3_client.exceptions.NoSuchKey:
-        raise HTTPException(status_code=404, detail=f"Document {format + filename} not found in S3 bucket.") from None
+        raise HTTPException(status_code=404, detail=f"Document {prefix + filename} not found in S3 bucket.") from None
 
     # Load the PNG from the S3 object
     try:

diff --git a/tests/test_create_pngs.py b/tests/test_create_pngs.py
@@ -13,9 +13,9 @@
 from botocore.exceptions import ClientError
 from fastapi.testclient import TestClient
 
-TEST_PDF_KEY = "pdfs/sample.pdf"
+TEST_PDF_KEY = "sample.pdf"
 TEST_PDF_PATH = Path(__file__).parent.parent / "example" / "example_borehole_profile.pdf"
-TEST_PNG_KEY = "pngs/sample-1.png"
+TEST_PNG_KEY = "dataextraction/sample-1.png"
 TEST_PNG_PATH = Path(__file__).parent.parent / "example" / "sample-1.png"
 
 

diff --git a/tests/test_data_extraction_from_bbox.py b/tests/test_data_extraction_from_bbox.py
@@ -16,9 +16,9 @@
 from app.common.schemas import ExtractDataRequest, FormatTypes
 from fastapi.testclient import TestClient
 
-TEST_PDF_KEY = Path("pdfs/sample.pdf")
+TEST_PDF_KEY = Path("sample.pdf")
 TEST_PDF_PATH = Path(__file__).parent.parent / "example" / "example_borehole_profile.pdf"
-TEST_PNG_KEY = Path("pngs/sample-1.png")
+TEST_PNG_KEY = Path("dataextraction/sample-1.png")
 TEST_PNG_PATH = Path(__file__).parent.parent / "example" / "sample-1.png"
File	Stmts	Miss	Cover	Missing
src/stratigraphy
__init__.py	8	1	88%	11
extract.py	188	188	0%	3–491
get_files.py	19	19	0%	3–47
line_detection.py	26	26	0%	3–76
main.py	119	119	0%	3–274
src/stratigraphy/coordinates
coordinate_extraction.py	108	5	95%	30, 64, 83–84, 96
src/stratigraphy/data_extractor
data_extractor.py	50	3	94%	32, 62, 98
src/stratigraphy/util
boundarydepthcolumnvalidator.py	41	20	51%	47, 57, 60, 81–84, 110–128, 140–149
dataclasses.py	32	3	91%	37–39
depthcolumn.py	194	64	67%	26, 30, 51, 57, 60–61, 85, 88, 95, 102, 110–111, 121, 138–154, 192, 229, 248–256, 267, 272, 279, 310, 315–322, 337–338, 381–423
depthcolumnentry.py	28	6	79%	17, 21, 36, 39, 56, 65
description_block_splitter.py	70	2	97%	25, 140
draw.py	117	117	0%	3–349
duplicate_detection.py	51	51	0%	3–146
extract_text.py	31	4	87%	20, 36, 57–58
find_depth_columns.py	91	6	93%	42–43, 73, 86, 180–181
find_description.py	63	28	56%	27–35, 50–63, 79–95, 172–175
geometric_line_utilities.py	86	2	98%	82, 132
interval.py	104	55	47%	25–28, 33–36, 42, 48, 52, 62–64, 101–147, 168, 174–190
language_detection.py	18	18	0%	3–45
layer_identifier_column.py	91	91	0%	3–234
line.py	51	4	92%	26, 51, 61, 111
linesquadtree.py	46	1	98%	76
plot_utils.py	43	43	0%	3–120
predictions.py	154	154	0%	3–364
textblock.py	80	9	89%	29, 57, 65, 90, 102, 125, 146, 155, 184
util.py	39	17	56%	22, 40–47, 61–63, 87–88, 100–104
TOTAL	1948	1056	46%