Merge branch 'main' into LGVISIUM-66/metrics-refactoring

# Conflicts: # src/stratigraphy/benchmark/score.py
swisstopo · Sep 11, 2024 · 3048782 · 3048782
2 parents d9dab88 + 6451af6
commit 3048782
Show file tree

Hide file tree

Showing 24 changed files with 1,272 additions and 27 deletions.
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -34,7 +34,24 @@
             "python": "./swisstopo/bin/python3",
         },
         {
-            "name": "Python: Run pytest",
+            "name": "API",
+            "type": "debugpy",
+            "request": "launch",
+            "cwd": "${workspaceFolder}/",
+            "module": "uvicorn",
+            "args": [
+                "src.app.main:app",
+                "--reload",
+                "--host",
+                "0.0.0.0",
+                "--port",
+                "8002",
+            ],
+            "console": "integratedTerminal",
+            "justMyCode": true
+        },
+        {
+            "name": "Python: Run pytests",
             "type": "debugpy",
             "request": "launch",
             "module": "pytest",

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -7,6 +7,7 @@
         "fitz",
         "mlflow",
         "pixmap",
+        "pydantic",
         "pyproject",
         "swissgeol",
         "swisstopo",

diff --git a/README.md b/README.md
@@ -214,6 +214,11 @@ The project structure and the most important files are as follows:
 
 - `root/` : The root directory of the project.
   - `src/` : The source code of the project.
+    - `app/`: The API of the project.
+      - `main.py`: The main script that launches the API.
+      - `common/config.py`: Config file for the API.
+      - `v1/`: Contains all the code for version 1 of the API.
+      - `v1/router.py`: Presents at a glance all the available endpoints.
     - `stratigraphy/` : The main package of the project.
       - `main.py` : The main script of the project. This script runs the data extraction pipeline.
       - `line_detection.py`: Contains functionalities for line detection on pdf pages.
@@ -231,6 +236,55 @@ The project structure and the most important files are as follows:
 
 - `main.py` : This is the main script of the project. It runs the data extraction pipeline, which analyzes the PDF files in the `data/Benchmark` directory and saves the results in the `predictions.json` file.
 
+## API
+
+The API for this project is built using FastAPI, a modern, fast (high-performance), web framework for building APIs with Python.
+
+To launch the API and access its endpoints, follow these steps:
+
+1. **Activate the virtual environment**
+
+    Activate your virtual environment. On Unix systems, this can be done with the following command:
+
+    ```bash
+    source env/bin/activate
+    ```
+
+2. **Environment variables**
+
+    Please make sure to define the environment variables needed for the API to access the S3 Bucket of interest.
+
+    ```python
+    aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
+    aws_secret_key_access = os.environ.get("AWS_SECRET_ACCESS_KEY")
+    aws_session_token = os.environ.get("AWS_SESSION_TOKEN")
+    aws_endpoint = os.environ.get("AWS_ENDPOINT")
+    ```
+
+3. **Start the FastAPI server**
+
+    Run the following command to start the FastAPI server:
+
+    ```bash
+    uvicorn src.app.main:app --reload --host 0.0.0.0 --port 8002
+    ```
+
+    This will start the server on port 8002, listening on all network interfaces (`0.0.0.0`), and enable automatic reloading whenever changes are made to the code. You can view the OpenAPI Specification (formerly Swagger Specification) by opening `http://127.0.0.1:8002/docs#/` in your favorite browser.
+
+4. **Access the API endpoints**
+
+    Once the server is running, you can access the API endpoints using a web browser or an API testing tool like Postman.
+
+    The main endpoint for the data extraction pipeline is `http://localhost:8002/extract-data`. You can send a POST request to this endpoint with the PDF file you want to extract data from.
+
+    Additional endpoints and their functionalities can be found in the project's source code.
+
+    **Note:** Make sure to replace `localhost` with the appropriate hostname or IP address if you are running the server on a remote machine.
+
+5. **Stop the server**
+
+    To stop the FastAPI server, press `Ctrl + C` in the terminal where the server is running. Please refer to the [FastAPI documentation](https://fastapi.tiangolo.com) for more information on how to work with FastAPI and build APIs using this framework.
+
 
 ## Experiment Tracking
 We perform experiment tracking using MLFlow. Each developer has his own local MLFlow instance. 

diff --git a/example/sample-1.png b/example/sample-1.png
diff --git a/pyproject.toml b/pyproject.toml
@@ -16,6 +16,13 @@ dependencies = [
     "python-dotenv",
     "setuptools",
     "tqdm",
+    "fastapi",
+    "uvicorn",
+    "pydantic_settings",
+    "pydantic",
+    "httpx",
+    "moto",
+    "pillow",
     "scikit-learn>=1.4.0",
     "click>=8.0.0",
     "PyYAML>=6.0.1",

diff --git a/src/app/api/__init__.py b/src/app/api/__init__.py
@@ -0,0 +1 @@
+"""API Module for the borehole ML application."""
diff --git a/src/app/api/v1/__init__.py b/src/app/api/v1/__init__.py
@@ -0,0 +1 @@
+"""API V1 Module for the borehole ML application."""
diff --git a/src/app/api/v1/endpoints/__init__.py b/src/app/api/v1/endpoints/__init__.py
@@ -0,0 +1 @@
+"""Endpoint module."""
diff --git a/src/app/api/v1/endpoints/create_pngs.py b/src/app/api/v1/endpoints/create_pngs.py
@@ -0,0 +1,60 @@
+"""This module defines the FastAPI endpoint for converting a PDF document to PNG images."""
+
+import os
+from pathlib import Path
+
+import fitz
+from app.common.aws import load_pdf_from_aws, upload_file_to_s3
+from app.common.config import config
+from app.common.schemas import PNGResponse
+from fastapi import HTTPException
+
+
+def create_pngs(aws_filename: Path):
+    """Convert a PDF document to PNG images. Please note that this function will overwrite any existing PNG files.
+
+    Args:
+        aws_filename (str): The name of the PDF document in the S3 bucket. For example, "pdfs/10012.pdf".
+
+    Returns:
+        PNGResponse: The URLs of the PNG images in the S3 bucket.
+    """
+    # Check if the PDF name is valid
+    if not aws_filename.suffix == ".pdf":
+        raise HTTPException(status_code=400, detail="Invalid request. The filename must end with '.pdf'.")
+
+    # Get the filename from the path
+    filename = aws_filename.stem
+
+    # Initialize the S3 client
+    pdf_document = load_pdf_from_aws(aws_filename)
+
+    png_urls = []
+
+    # Convert each page of the PDF to PNG
+    try:
+        for page_number in range(pdf_document.page_count):
+            page = pdf_document.load_page(page_number)
+            pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))
+            png_filename = f"{filename}-{page_number + 1}.png"
+            png_path = f"/tmp/{png_filename}"  # Local path to save the PNG
+            s3_bucket_png_path = f"pngs/{png_filename}"
+
+            pix.save(png_path)
+
+            # Upload the PNG to S3
+            upload_file_to_s3(
+                png_path,  # The local path to the file to upload
+                s3_bucket_png_path,  # The key (name) of the file in the bucket
+            )
+
+            # Generate the S3 URL
+            png_url = f"https://{config.bucket_name}.s3.amazonaws.com/{s3_bucket_png_path}"
+            png_urls.append(png_url)
+
+            # Clean up the local file
+            os.remove(png_path)
+    except Exception:
+        raise HTTPException(status_code=500, detail="An error occurred while processing the PDF.") from None
+
+    return PNGResponse(png_urls=png_urls)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""API Module for the borehole ML application."""
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""API V1 Module for the borehole ML application."""