[#105] File type schemas #116

Draft · wants to merge 22 commits into base: develop

Commits (22)
ee96ab4
[#105] schemas for netcdf and raster datasets
pkdash Apr 3, 2024
2a07531
[#105] update to schema adapters to support specific schema types
pkdash Apr 3, 2024
0ddf19c
[#105] update to api endpoints to support metadata of different schemas
pkdash Apr 3, 2024
4bb3f21
[#105] generating schema json using generic dataset schema for now
pkdash Apr 3, 2024
aa8ff27
[#105] updating scheduler to handle metadata catalog records of diffe…
pkdash Apr 3, 2024
4b28d25
[#105] updating tests and new tests for netcdf and raster metadata sc…
pkdash Apr 3, 2024
db02e85
[#105] removing commented code
pkdash Apr 3, 2024
9ca3e1d
[#105] async pytest fixtures to yield
pkdash Apr 4, 2024
de7e208
[#105] cleaning of testing code
pkdash Apr 4, 2024
041b22d
[#105] using generic schema type for type hints to handle multiple sc…
pkdash Apr 4, 2024
d8ea4d4
[#105] using a test bucket for testing
pkdash Apr 4, 2024
677575b
[#105] code changes to handle metadata registration from AWS S3
pkdash Apr 5, 2024
e7342a8
[#105] test for registering metadata from AWS S3
pkdash Apr 5, 2024
c75cc0d
[#105] generating schema.json file for each of the schema types suppo…
pkdash Apr 5, 2024
9b9a20c
[#105] using RepositoryException for s3 fetch object operation errors
pkdash Apr 9, 2024
0c1142d
[#105] refactoring catalog route
pkdash Apr 9, 2024
138aa8f
[#105] adding endpoint for registering s3 generic dataset
pkdash Apr 9, 2024
86cda56
[#105] endpoint for retrieving netcdf metadata catalog record
pkdash Apr 9, 2024
1429d52
[#105] test for registering s3 generic dataset
pkdash Apr 9, 2024
aa5d85c
Merge branch 'develop' of https://github.com/I-GUIDE/catalogapi into …
pkdash Apr 23, 2024
e12d918
[#105] updating with develop along with some code refactoring
pkdash Jun 5, 2024
bb5eaa3
[#105] fixing the aws s3 endpoint url paths in tests
pkdash Jun 6, 2024
12 changes: 6 additions & 6 deletions api/adapters/base.py
@@ -1,15 +1,15 @@
import abc

from typing import Type
from api.config import Settings, get_settings
from api.models.catalog import DatasetMetadataDOC
from api.models.catalog import T
from api.models.user import Submission


class AbstractRepositoryRequestHandler(abc.ABC):
settings: Settings = get_settings()

@abc.abstractmethod
def get_metadata(self, record_id: str):
def get_metadata(self, record_id: str) -> dict:
"""Returns the metadata for the specified record from a repository"""
...

@@ -19,13 +19,13 @@ class AbstractRepositoryMetadataAdapter(abc.ABC):

@staticmethod
@abc.abstractmethod
def to_catalog_record(metadata: dict) -> DatasetMetadataDOC:
def to_catalog_record(metadata: dict, meta_model_type: Type[T]) -> T:
"""Converts repository metadata to a catalog dataset record"""
...

@staticmethod
@abc.abstractmethod
def to_repository_record(catalog_record: DatasetMetadataDOC):
def to_repository_record(catalog_record: T):
"""Converts dataset catalog dataset record to repository metadata"""
...

@@ -35,7 +35,7 @@ def update_submission(submission: Submission, repo_record_id: str) -> Submission
"""Sets additional repository specific metadata to submission record"""
...

async def get_metadata(self, record_id: str):
async def get_metadata(self, record_id: str) -> dict:
"""Returns the metadata for the specified record from a repository"""

return self.repo_api_handler.get_metadata(record_id)
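
For context, a concrete adapter now implements this interface against the generic catalog document type rather than a single DatasetMetadataDOC. Below is a minimal sketch of what a new repository adapter could look like under the revised interface; the ExampleMetadataAdapter class, its request handler, and the placeholder metadata are hypothetical and not part of this PR.

from typing import Type

from api.adapters.base import (
    AbstractRepositoryMetadataAdapter,
    AbstractRepositoryRequestHandler,
)
from api.models.catalog import T
from api.models.user import Submission


class _ExampleRequestHandler(AbstractRepositoryRequestHandler):
    def get_metadata(self, record_id: str) -> dict:
        # placeholder: a real handler would fetch the repository's raw metadata here
        return {"name": f"Example record {record_id}"}


class ExampleMetadataAdapter(AbstractRepositoryMetadataAdapter):
    repo_api_handler = _ExampleRequestHandler()

    @staticmethod
    def to_catalog_record(metadata: dict, meta_model_type: Type[T]) -> T:
        # the caller chooses the concrete catalog document class (generic, netcdf, raster, ...)
        return meta_model_type(**metadata)

    @staticmethod
    def to_repository_record(catalog_record: T):
        raise NotImplementedError

    @staticmethod
    def update_submission(submission: Submission, repo_record_id: str) -> Submission:
        submission.repository_identifier = repo_record_id
        return submission
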
13 changes: 7 additions & 6 deletions api/adapters/hydroshare.py
@@ -7,8 +7,8 @@
from api.adapters.utils import RepositoryType, register_adapter
from api.exceptions import RepositoryException
from api.models import schema
from api.models.catalog import DatasetMetadataDOC
from api.models.user import Submission, SubmissionType
from api.models.catalog import HSResourceMetadataDOC
from api.models.user import Submission


class Creator(BaseModel):
@@ -163,7 +163,8 @@ def to_dataset_license(self):


class _HydroshareRequestHandler(AbstractRepositoryRequestHandler):
def get_metadata(self, record_id: str):

def get_metadata(self, record_id: str) -> dict:
hs_meta_url = self.settings.hydroshare_meta_read_url % record_id
hs_file_url = self.settings.hydroshare_file_read_url % record_id

@@ -194,13 +195,13 @@ class HydroshareMetadataAdapter(AbstractRepositoryMetadataAdapter):
repo_api_handler = _HydroshareRequestHandler()

@staticmethod
def to_catalog_record(metadata: dict) -> DatasetMetadataDOC:
def to_catalog_record(metadata: dict, meta_model_type: HSResourceMetadataDOC) -> HSResourceMetadataDOC:
"""Converts hydroshare resource metadata to a catalog dataset record"""
hs_metadata_model = _HydroshareResourceMetadata(**metadata)
return hs_metadata_model.to_catalog_dataset()

@staticmethod
def to_repository_record(catalog_record: DatasetMetadataDOC):
def to_repository_record(catalog_record: HSResourceMetadataDOC):
"""Converts dataset catalog record to hydroshare resource metadata"""
raise NotImplementedError

@@ -293,7 +294,7 @@ def to_dataset_provider():
return provider

def to_catalog_dataset(self):
dataset = DatasetMetadataDOC.construct()
dataset = HSResourceMetadataDOC.construct()
dataset.provider = self.to_dataset_provider()
dataset.name = self.title
dataset.description = self.abstract
65 changes: 49 additions & 16 deletions api/adapters/s3.py
@@ -1,41 +1,74 @@
import boto3
import json
from botocore.client import Config
from http import HTTPStatus
from typing import Type

import boto3
from botocore import UNSIGNED
from botocore.client import Config
from botocore.exceptions import ClientError as S3ClientError

from api.adapters.base import AbstractRepositoryMetadataAdapter, AbstractRepositoryRequestHandler
from api.adapters.base import (
AbstractRepositoryMetadataAdapter,
AbstractRepositoryRequestHandler,
)
from api.adapters.utils import RepositoryType, register_adapter
from api.models.catalog import DatasetMetadataDOC
from api.models.user import Submission, SubmissionType
from api.exceptions import RepositoryException
from api.models.catalog import T
from api.models.user import Submission


class _S3RequestHandler(AbstractRepositoryRequestHandler):
def get_metadata(self, record_id: str):
def get_metadata(self, record_id: str) -> dict:
endpoint_url = record_id.split("+")[0]
bucket_name = record_id.split("+")[1]
file_key = record_id.split("+")[2]

s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED), endpoint_url=endpoint_url)

response = s3.get_object(Bucket=bucket_name, Key=file_key)
json_content = response['Body'].read().decode('utf-8')
# TODO: Should we be expecting the path for the data file and then compute the metadata file path from that?
# Or should we be expecting the metadata file path directly? Maybe we should get the path for both the
# data file and the metadata file. If we have the path for the data file, we can check that the data file
# exists and then retrieve the metadata file and catalog the metadata.

# check if the endpoint URL is an AWS S3 URL
if endpoint_url.endswith("amazonaws.com"):
endpoint_url = None
s3 = boto3.client(
"s3", config=Config(signature_version=UNSIGNED), endpoint_url=endpoint_url
)
try:
response = s3.get_object(Bucket=bucket_name, Key=file_key)
except S3ClientError as ex:
if ex.response["Error"]["Code"] == "NoSuchKey":
raise RepositoryException(
detail=f"Specified metadata file was not found in S3: {bucket_name}/{file_key}",
status_code=HTTPStatus.NOT_FOUND
)
else:
err_msg = f"Error accessing S3 file({bucket_name}/{file_key}): {str(ex)}"
raise RepositoryException(detail=err_msg, status_code=HTTPStatus.BAD_REQUEST)

# Parse the JSON content
data = json.loads(json_content)
json_content = response["Body"].read().decode("utf-8")
# parse the JSON content
try:
data = json.loads(json_content)
except json.JSONDecodeError as ex:
err_msg = f"Invalid JSON content in S3 file ({file_key}). Error: {str(ex)}"
raise RepositoryException(detail=err_msg, status_code=HTTPStatus.BAD_REQUEST)

# remove additionalType field - this will be set by the schema model
data.pop("additionalType", None)
return data


class S3MetadataAdapter(AbstractRepositoryMetadataAdapter):
repo_api_handler = _S3RequestHandler()

@staticmethod
def to_catalog_record(metadata: dict) -> DatasetMetadataDOC:
return DatasetMetadataDOC(**metadata)
def to_catalog_record(metadata: dict, meta_model_type: Type[T]) -> T:
return meta_model_type(**metadata)

@staticmethod
def to_repository_record(catalog_record: DatasetMetadataDOC):
"""Converts dataset catalog record to hydroshare resource metadata"""
def to_repository_record(catalog_record: T):
"""Converts dataset catalog record to repository resource/dataset metadata"""
raise NotImplementedError

@staticmethod
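
As a usage illustration (not part of the PR), the S3 handler above expects a single "+"-delimited record identifier. A hedged sketch, with invented endpoint, bucket, and key values:

from api.adapters.s3 import _S3RequestHandler

endpoint_url = "https://s3.us-east-1.amazonaws.com"  # AWS endpoints are dropped so boto3 uses its default
bucket_name = "example-catalog-bucket"
file_key = "datasets/example/metadata.json"

# the handler parses the identifier as endpoint+bucket+key
record_id = f"{endpoint_url}+{bucket_name}+{file_key}"

handler = _S3RequestHandler()
# raises RepositoryException with 404 if the key is missing, or 400 for other S3/JSON errors
metadata = handler.get_metadata(record_id)
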
5 changes: 3 additions & 2 deletions api/adapters/utils.py
@@ -1,5 +1,5 @@
from enum import Enum
from typing import Type, Union
from typing import Type, Union, Dict

from api.adapters.base import AbstractRepositoryMetadataAdapter

@@ -9,7 +9,7 @@ class RepositoryType(str, Enum):
S3 = 'S3'


_adapter_registry = {}
_adapter_registry: Dict[RepositoryType, Type[AbstractRepositoryMetadataAdapter]] = {}


def register_adapter(repository_type: RepositoryType, adapter_class: Type[AbstractRepositoryMetadataAdapter]) -> None:
@@ -21,3 +21,4 @@ def get_adapter_by_type(repository_type: RepositoryType) -> Union[AbstractReposi
if adapter_cls:
return adapter_cls()
return None

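
To show how the registry and the generic conversion work together, here is a hedged caller-side sketch; only RepositoryType.S3 appears because that member is visible in this diff, and the record identifier in the comment is made up.

import asyncio
from typing import Type

from api.adapters.utils import RepositoryType, get_adapter_by_type
from api.models.catalog import GenericDatasetMetadataDOC, T


async def fetch_catalog_record(record_id: str, meta_model_type: Type[T]) -> T:
    adapter = get_adapter_by_type(RepositoryType.S3)
    if adapter is None:
        raise ValueError("no adapter registered for this repository type")
    metadata = await adapter.get_metadata(record_id)
    # the adapter maps the raw repository metadata onto the requested document class
    return adapter.to_catalog_record(metadata, meta_model_type)


# record = asyncio.run(
#     fetch_catalog_record("https://s3.example.org+my-bucket+meta.json", GenericDatasetMetadataDOC)
# )
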
14 changes: 12 additions & 2 deletions api/main.py
@@ -14,7 +14,13 @@
from starlette.responses import PlainTextResponse

from api.config import get_settings
from api.models.catalog import DatasetMetadataDOC
from api.models.catalog import (
CoreMetadataDOC,
HSResourceMetadataDOC,
GenericDatasetMetadataDOC,
NetCDFMetadataDOC,
RasterMetadataDOC,
)
from api.models.user import Submission, User
from api.routes.catalog import router as catalog_router
from api.routes.discovery import router as discovery_router
@@ -48,7 +54,11 @@ async def startup_db_client():
settings = get_settings()
app.mongodb_client = AsyncIOMotorClient(settings.db_connection_string)
app.mongodb = app.mongodb_client[settings.database_name]
await init_beanie(database=app.mongodb, document_models=[DatasetMetadataDOC, User, Submission])
await init_beanie(
database=app.mongodb,
document_models=[CoreMetadataDOC, HSResourceMetadataDOC, GenericDatasetMetadataDOC, NetCDFMetadataDOC,
RasterMetadataDOC, User, Submission]
)


@app.on_event("shutdown")
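
For reference, the same document-model registration could be done outside the FastAPI startup hook, e.g. in a test fixture or a one-off script; the connection string and database name below are placeholders, not values from this PR.

from beanie import init_beanie
from motor.motor_asyncio import AsyncIOMotorClient

from api.models.catalog import (
    CoreMetadataDOC,
    GenericDatasetMetadataDOC,
    HSResourceMetadataDOC,
    NetCDFMetadataDOC,
    RasterMetadataDOC,
)
from api.models.user import Submission, User


async def init_db(connection_string: str = "mongodb://localhost:27017", db_name: str = "iguide_test"):
    client = AsyncIOMotorClient(connection_string)
    await init_beanie(
        database=client[db_name],
        document_models=[
            CoreMetadataDOC,
            HSResourceMetadataDOC,
            GenericDatasetMetadataDOC,
            NetCDFMetadataDOC,
            RasterMetadataDOC,
            User,
            Submission,
        ],
    )
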
42 changes: 35 additions & 7 deletions api/models/catalog.py
@@ -1,16 +1,23 @@
import datetime
from typing import Optional
from typing import TypeVar, Optional

from beanie import Document

from api.models.user import Submission, S3Path
from .schema import CoreMetadata, DatasetMetadata
from .schema import (
CoreMetadata,
GenericDatasetMetadata,
HSResourceMetadata,
HSNetCDFMetadata,
HSRasterMetadata,
)


class CoreMetadataDOC(Document, CoreMetadata):
# this field is not stored in the database, but is populated from the corresponding submission record
# using the type field in the submission record
# these fields are not stored in the database, but are populated from the corresponding submission record
submission_type: str = None
repository_identifier: str = None
s3_path: Optional[S3Path] = None

class Settings:
# name is the collection name in database (iguide) where the Metadata Record documents will be stored
@@ -36,6 +43,27 @@ def as_submission(self) -> Submission:
)


class DatasetMetadataDOC(CoreMetadataDOC, DatasetMetadata):
repository_identifier: str = None
s3_path: Optional[S3Path] = None
class HSResourceMetadataDOC(CoreMetadataDOC, HSResourceMetadata):

def as_submission(self) -> Submission:
submission = super().as_submission()
submission.repository = "HydroShare"
submission.repository_identifier = self.repository_identifier
return submission


class GenericDatasetMetadataDOC(CoreMetadataDOC, GenericDatasetMetadata):
pass


class NetCDFMetadataDOC(CoreMetadataDOC, HSNetCDFMetadata):
pass


class RasterMetadataDOC(CoreMetadataDOC, HSRasterMetadata):
pass

# T is a type variable that can be used for type hinting for any schema model that inherits from CoreMetadataDOC


T = TypeVar("T", bound=CoreMetadataDOC)
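
The new TypeVar lets helpers be written whose return type tracks the document class they are given. A small illustrative helper (hypothetical, not part of the PR):

from typing import Type

from api.models.catalog import NetCDFMetadataDOC, RasterMetadataDOC, T


def build_catalog_record(meta_model_type: Type[T], metadata: dict) -> T:
    # static type checkers infer the concrete return type from the class passed in
    return meta_model_type(**metadata)


# build_catalog_record(NetCDFMetadataDOC, {...}) is typed as NetCDFMetadataDOC,
# build_catalog_record(RasterMetadataDOC, {...}) as RasterMetadataDOC.
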
83 changes: 59 additions & 24 deletions api/models/management/generate_schema.py
@@ -3,30 +3,65 @@

import typer

from api.models.schema import DatasetMetadata


def main(output_name: str = "api/models/schemas/schema.json"):
schema = DatasetMetadata.schema()
json_schema = DatasetMetadata.schema_json()#indent=2)
# Have to run it a few times for the definitions to get updated before inserted into another model
while "#/definitions/" in json_schema:
for definition in schema["definitions"]:
class_definition = schema["definitions"][definition]
# replace allOf with a single definition
json_schema = json_schema.replace(
f'"allOf": [{{"$ref": "#/definitions/{definition}"}}]',
json.dumps(class_definition)[1:-1]
)
#replace definition directly
json_schema = json_schema.replace(
f'"$ref": "#/definitions/{definition}"',
json.dumps(class_definition)[1:-1]
)
embedded_schema = json.loads(json_schema)
current_directory = absolute_directory(output_name)
with open(current_directory, "w") as f:
f.write(json.dumps(embedded_schema, indent=2))
from api.models.schema import (
GenericDatasetMetadata,
HSNetCDFMetadata,
HSRasterMetadata,
HSResourceMetadata,
)


def main():
def generate_schema_json(schema_model, folder_name):
base_directory = "api/models/schemas"
schema_file_path = os.path.join(base_directory, folder_name, "schema.json")
schema = schema_model.schema()
json_schema = schema_model.schema_json()

# Have to run it a few times for the definitions to get updated before inserted into another model
while "#/definitions/" in json_schema:
for definition in schema["definitions"]:
class_definition = schema["definitions"][definition]
# replace allOf with a single definition
json_schema = json_schema.replace(
f'"allOf": [{{"$ref": "#/definitions/{definition}"}}]',
json.dumps(class_definition)[1:-1]
)
#replace definition directly
json_schema = json_schema.replace(
f'"$ref": "#/definitions/{definition}"',
json.dumps(class_definition)[1:-1]
)
embedded_schema = json.loads(json_schema)
current_directory = absolute_directory(schema_file_path)
with open(current_directory, "w") as f:
f.write(json.dumps(embedded_schema, indent=2))

schemas = get_schemas()
for schema_item in schemas:
generate_schema_json(schema_model=schema_item["model"], folder_name=schema_item["folder_name"])


def get_schemas():
schemas = [
{
"model": GenericDatasetMetadata,
"folder_name": "generic",
},
{
"model": HSResourceMetadata,
"folder_name": "hs_resource",
},
{
"model": HSNetCDFMetadata,
"folder_name": "netcdf",
},
{
"model": HSRasterMetadata,
"folder_name": "raster",
},
]
return schemas


def absolute_directory(output_name):
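
To make the $ref-inlining loop in generate_schema_json concrete, here is a toy sketch with invented Pydantic models; it mirrors the replacement logic above and assumes the Pydantic v1 schema()/schema_json() API used in this module.

import json

from pydantic import BaseModel


class Pet(BaseModel):
    name: str


class Owner(BaseModel):
    pet: Pet


schema = Owner.schema()
json_schema = Owner.schema_json()

# repeatedly splice each definition body in place of its "#/definitions/..." reference
while "#/definitions/" in json_schema:
    for definition, class_definition in schema["definitions"].items():
        json_schema = json_schema.replace(
            f'"allOf": [{{"$ref": "#/definitions/{definition}"}}]',
            json.dumps(class_definition)[1:-1],
        )
        json_schema = json_schema.replace(
            f'"$ref": "#/definitions/{definition}"',
            json.dumps(class_definition)[1:-1],
        )

embedded_schema = json.loads(json_schema)
print(json.dumps(embedded_schema, indent=2))  # no "$ref" entries remain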