Skip to content

Commit

Permalink
Merge pull request #79 from swisstopo/LGVISIUM-74-Update-the-API-based-on-Feedback
Browse files Browse the repository at this point in the history

Close #LGVISIUM-74: Update the API based on the new s3 specs & feedback
  • Loading branch information
dcleres authored Sep 11, 2024
2 parents 6451af6 + 07d5348 commit b7d34ce
Show file tree
Hide file tree
Showing 5 changed files with 13 additions and 13 deletions.
4 changes: 2 additions & 2 deletions src/app/api/v1/endpoints/create_pngs.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def create_pngs(aws_filename: Path):
"""Convert a PDF document to PNG images. Please note that this function will overwrite any existing PNG files.
Args:
aws_filename (str): The name of the PDF document in the S3 bucket. For example, "pdfs/10012.pdf".
aws_filename (str): The key of the PDF document in the S3 bucket. For example, "10012.pdf".
Returns:
PNGResponse: The URLs of the PNG images in the S3 bucket.
Expand All @@ -38,7 +38,7 @@ def create_pngs(aws_filename: Path):
pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))
png_filename = f"{filename}-{page_number + 1}.png"
png_path = f"/tmp/{png_filename}" # Local path to save the PNG
s3_bucket_png_path = f"pngs/{png_filename}"
s3_bucket_png_path = f"dataextraction/{png_filename}"

pix.save(png_path)

Expand Down
2 changes: 1 addition & 1 deletion src/app/api/v1/endpoints/extract_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def extract_data(extract_data_request: ExtractDataRequest) -> ExtractDataRespons
pdf_page_height = pdf_page.rect.height

# Load the PNG image the boreholes app is showing to the user
# Convert the PDF filename to a PNG filename: "pdfs/geoquat/train/10012.pdf" -> 'pngs/geoquat/train/10012_0.png'
# Convert the PDF filename to a PNG filename: "10012.pdf" -> 'dataextraction/10012-1.png'
# Remove the file extension and replace it with '.png'
base_filename = extract_data_request.filename.stem
png_filename = Path(f"{base_filename}-{extract_data_request.page_number}.png")
Expand Down
12 changes: 6 additions & 6 deletions src/app/common/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def load_pdf_from_aws(filename: Path) -> fitz.Document:
"""
# Load the PDF from the S3 object
try:
data = load_data_from_aws(filename, "pdfs")
data = load_data_from_aws(filename)
pdf_document = fitz.open(stream=data, filetype="pdf")
except Exception:
raise HTTPException(
Expand All @@ -45,7 +45,7 @@ def load_png_from_aws(filename: Path) -> np.ndarray:
Returns:
ndarray: The loaded PNG image.
"""
data = load_data_from_aws(filename, "pngs")
data = load_data_from_aws(filename, "dataextraction")

# Convert the PNG data to an image using PIL
image = Image.open(io.BytesIO(data))
Expand All @@ -54,21 +54,21 @@ def load_png_from_aws(filename: Path) -> np.ndarray:
return np.array(image)


def load_data_from_aws(filename: Path, format: str) -> bytes:
def load_data_from_aws(filename: Path, prefix: str = "") -> bytes:
"""Load a document from AWS S3.
Args:
filename (str): The filename of the PNG image.
format (str): The format of the file.
prefix (str): The prefix of the file in the bucket.
Returns:
bytes: The loaded PNG image.
"""
# Check if the PNG exists in S3
try:
png_object = s3_client.get_object(Bucket=config.bucket_name, Key=str(format / filename))
png_object = s3_client.get_object(Bucket=config.bucket_name, Key=str(prefix / filename))
except s3_client.exceptions.NoSuchKey:
raise HTTPException(status_code=404, detail=f"Document {format + filename} not found in S3 bucket.") from None
raise HTTPException(status_code=404, detail=f"Document {prefix + filename} not found in S3 bucket.") from None

# Load the PNG from the S3 object
try:
Expand Down
4 changes: 2 additions & 2 deletions tests/test_create_pngs.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@
from botocore.exceptions import ClientError
from fastapi.testclient import TestClient

TEST_PDF_KEY = "pdfs/sample.pdf"
TEST_PDF_KEY = "sample.pdf"
TEST_PDF_PATH = Path(__file__).parent.parent / "example" / "example_borehole_profile.pdf"
TEST_PNG_KEY = "pngs/sample-1.png"
TEST_PNG_KEY = "dataextraction/sample-1.png"
TEST_PNG_PATH = Path(__file__).parent.parent / "example" / "sample-1.png"


Expand Down
4 changes: 2 additions & 2 deletions tests/test_data_extraction_from_bbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@
from app.common.schemas import ExtractDataRequest, FormatTypes
from fastapi.testclient import TestClient

TEST_PDF_KEY = Path("pdfs/sample.pdf")
TEST_PDF_KEY = Path("sample.pdf")
TEST_PDF_PATH = Path(__file__).parent.parent / "example" / "example_borehole_profile.pdf"
TEST_PNG_KEY = Path("pngs/sample-1.png")
TEST_PNG_KEY = Path("dataextraction/sample-1.png")
TEST_PNG_PATH = Path(__file__).parent.parent / "example" / "sample-1.png"


Expand Down

1 comment on commit b7d34ce

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
| File | Stmts | Miss | Cover | Missing |
|---|---:|---:|---:|---|
| **src/stratigraphy** | | | | |
| `__init__.py` | 8 | 1 | 88% | 11 |
| `extract.py` | 188 | 188 | 0% | 3–491 |
| `get_files.py` | 19 | 19 | 0% | 3–47 |
| `line_detection.py` | 26 | 26 | 0% | 3–76 |
| `main.py` | 119 | 119 | 0% | 3–274 |
| **src/stratigraphy/coordinates** | | | | |
| `coordinate_extraction.py` | 108 | 5 | 95% | 30, 64, 83–84, 96 |
| **src/stratigraphy/data_extractor** | | | | |
| `data_extractor.py` | 50 | 3 | 94% | 32, 62, 98 |
| **src/stratigraphy/util** | | | | |
| `boundarydepthcolumnvalidator.py` | 41 | 20 | 51% | 47, 57, 60, 81–84, 110–128, 140–149 |
| `dataclasses.py` | 32 | 3 | 91% | 37–39 |
| `depthcolumn.py` | 194 | 64 | 67% | 26, 30, 51, 57, 60–61, 85, 88, 95, 102, 110–111, 121, 138–154, 192, 229, 248–256, 267, 272, 279, 310, 315–322, 337–338, 381–423 |
| `depthcolumnentry.py` | 28 | 6 | 79% | 17, 21, 36, 39, 56, 65 |
| `description_block_splitter.py` | 70 | 2 | 97% | 25, 140 |
| `draw.py` | 117 | 117 | 0% | 3–349 |
| `duplicate_detection.py` | 51 | 51 | 0% | 3–146 |
| `extract_text.py` | 31 | 4 | 87% | 20, 36, 57–58 |
| `find_depth_columns.py` | 91 | 6 | 93% | 42–43, 73, 86, 180–181 |
| `find_description.py` | 63 | 28 | 56% | 27–35, 50–63, 79–95, 172–175 |
| `geometric_line_utilities.py` | 86 | 2 | 98% | 82, 132 |
| `interval.py` | 104 | 55 | 47% | 25–28, 33–36, 42, 48, 52, 62–64, 101–147, 168, 174–190 |
| `language_detection.py` | 18 | 18 | 0% | 3–45 |
| `layer_identifier_column.py` | 91 | 91 | 0% | 3–234 |
| `line.py` | 51 | 4 | 92% | 26, 51, 61, 111 |
| `linesquadtree.py` | 46 | 1 | 98% | 76 |
| `plot_utils.py` | 43 | 43 | 0% | 3–120 |
| `predictions.py` | 154 | 154 | 0% | 3–364 |
| `textblock.py` | 80 | 9 | 89% | 29, 57, 65, 90, 102, 125, 146, 155, 184 |
| `util.py` | 39 | 17 | 56% | 22, 40–47, 61–63, 87–88, 100–104 |
| **TOTAL** | 1948 | 1056 | 46% | |

Tests Skipped Failures Errors Time
79 0 💤 0 ❌ 0 🔥 5.367s ⏱️

Please sign in to comment.