diff --git a/src/app/api/v1/endpoints/create_pngs.py b/src/app/api/v1/endpoints/create_pngs.py index 2254b05c..912c3893 100644 --- a/src/app/api/v1/endpoints/create_pngs.py +++ b/src/app/api/v1/endpoints/create_pngs.py @@ -14,7 +14,7 @@ def create_pngs(aws_filename: Path): """Convert a PDF document to PNG images. Please note that this function will overwrite any existing PNG files. Args: - aws_filename (str): The name of the PDF document in the S3 bucket. For example, "pdfs/10012.pdf". + aws_filename (str): The key of the PDF document in the S3 bucket. For example, "10012.pdf". Returns: PNGResponse: The URLs of the PNG images in the S3 bucket. @@ -38,7 +38,7 @@ def create_pngs(aws_filename: Path): pix = page.get_pixmap(matrix=fitz.Matrix(3, 3)) png_filename = f"{filename}-{page_number + 1}.png" png_path = f"/tmp/{png_filename}" # Local path to save the PNG - s3_bucket_png_path = f"pngs/{png_filename}" + s3_bucket_png_path = f"dataextraction/{png_filename}" pix.save(png_path) diff --git a/src/app/api/v1/endpoints/extract_data.py b/src/app/api/v1/endpoints/extract_data.py index 221f8c6a..f0e2ec88 100644 --- a/src/app/api/v1/endpoints/extract_data.py +++ b/src/app/api/v1/endpoints/extract_data.py @@ -44,7 +44,7 @@ def extract_data(extract_data_request: ExtractDataRequest) -> ExtractDataRespons pdf_page_height = pdf_page.rect.height # Load the PNG image the boreholes app is showing to the user - # Convert the PDF filename to a PNG filename: "pdfs/geoquat/train/10012.pdf" -> 'pngs/geoquat/train/10012_0.png' + # Convert the PDF filename to a PNG filename: "10012.pdf" -> 'dataextraction/10012-1.png' # Remove the file extension and replace it with '.png' base_filename = extract_data_request.filename.stem png_filename = Path(f"{base_filename}-{extract_data_request.page_number}.png") diff --git a/src/app/common/aws.py b/src/app/common/aws.py index f2cc05a4..c930697e 100644 --- a/src/app/common/aws.py +++ b/src/app/common/aws.py @@ -26,7 +26,7 @@ def 
load_pdf_from_aws(filename: Path) -> fitz.Document: """ # Load the PDF from the S3 object try: - data = load_data_from_aws(filename, "pdfs") + data = load_data_from_aws(filename) pdf_document = fitz.open(stream=data, filetype="pdf") except Exception: raise HTTPException( @@ -45,7 +45,7 @@ def load_png_from_aws(filename: Path) -> np.ndarray: Returns: ndarray: The loaded PNG image. """ - data = load_data_from_aws(filename, "pngs") + data = load_data_from_aws(filename, "dataextraction") # Convert the PNG data to an image using PIL image = Image.open(io.BytesIO(data)) @@ -54,21 +54,21 @@ def load_png_from_aws(filename: Path) -> np.ndarray: return np.array(image) -def load_data_from_aws(filename: Path, format: str) -> bytes: +def load_data_from_aws(filename: Path, prefix: str = "") -> bytes: """Load a document from AWS S3. Args: filename (str): The filename of the PNG image. - format (str): The format of the file. + prefix (str): The key prefix prepended to the filename in the bucket. Defaults to "" (no prefix). Returns: bytes: The loaded PNG image. 
""" # Check if the PNG exists in S3 try: - png_object = s3_client.get_object(Bucket=config.bucket_name, Key=str(format / filename)) + png_object = s3_client.get_object(Bucket=config.bucket_name, Key=str(prefix / filename)) except s3_client.exceptions.NoSuchKey: - raise HTTPException(status_code=404, detail=f"Document {format + filename} not found in S3 bucket.") from None + raise HTTPException(status_code=404, detail=f"Document {prefix / filename} not found in S3 bucket.") from None # Load the PNG from the S3 object try: diff --git a/tests/test_create_pngs.py b/tests/test_create_pngs.py index dc735e29..231c71c9 100644 --- a/tests/test_create_pngs.py +++ b/tests/test_create_pngs.py @@ -13,9 +13,9 @@ from botocore.exceptions import ClientError from fastapi.testclient import TestClient -TEST_PDF_KEY = "pdfs/sample.pdf" +TEST_PDF_KEY = "sample.pdf" TEST_PDF_PATH = Path(__file__).parent.parent / "example" / "example_borehole_profile.pdf" -TEST_PNG_KEY = "pngs/sample-1.png" +TEST_PNG_KEY = "dataextraction/sample-1.png" TEST_PNG_PATH = Path(__file__).parent.parent / "example" / "sample-1.png" diff --git a/tests/test_data_extraction_from_bbox.py b/tests/test_data_extraction_from_bbox.py index d6655028..43bed8d0 100644 --- a/tests/test_data_extraction_from_bbox.py +++ b/tests/test_data_extraction_from_bbox.py @@ -16,9 +16,9 @@ from app.common.schemas import ExtractDataRequest, FormatTypes from fastapi.testclient import TestClient -TEST_PDF_KEY = Path("pdfs/sample.pdf") +TEST_PDF_KEY = Path("sample.pdf") TEST_PDF_PATH = Path(__file__).parent.parent / "example" / "example_borehole_profile.pdf" -TEST_PNG_KEY = Path("pngs/sample-1.png") +TEST_PNG_KEY = Path("dataextraction/sample-1.png") TEST_PNG_PATH = Path(__file__).parent.parent / "example" / "sample-1.png"