diff --git a/.gitignore b/.gitignore
index c907531198ef..67bf620e983f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -130,6 +130,7 @@ tool_test_output.json
 client/**/jsconfig.json
 vetur.config.js
 .pre-commit-config.yaml
+galaxy.code-workspace
 
 # Chrom len files
 *.len
diff --git a/client/src/utils/upload-payload.js b/client/src/utils/upload-payload.js
index 2fbca26c06da..4a841c0e1ffb 100644
--- a/client/src/utils/upload-payload.js
+++ b/client/src/utils/upload-payload.js
@@ -12,6 +12,7 @@ export const URI_PREFIXES = [
     "drs://",
     "invenio://",
     "zenodo://",
+    "dataverse://",
 ];
 
 export function isUrl(content) {
diff --git a/lib/galaxy/config/sample/file_sources_conf.yml.sample b/lib/galaxy/config/sample/file_sources_conf.yml.sample
index 0bc5e9e10aed..74bc157f6773 100644
--- a/lib/galaxy/config/sample/file_sources_conf.yml.sample
+++ b/lib/galaxy/config/sample/file_sources_conf.yml.sample
@@ -229,6 +229,24 @@
   public_name: ${user.preferences['zenodo_sandbox|public_name']}
   writable: true
 
+- type: dataverse
+  id: dataverse
+  doc: Dataverse is an open-source data repository platform designed for sharing, preserving, and managing research data, offering tools for data citation, exploration, and collaboration.
+  label: Dataverse
+  url: https://dataverse.org
+  token: ${user.user_vault.read_secret('preferences/dataverse/token')}
+  public_name: ${user.preferences['dataverse|public_name']}
+  writable: true
+
+- type: dataverse
+  id: dataverse_sandbox
+  doc: This is the sandbox instance of Dataverse. It is used for testing purposes only; content is NOT preserved. DOIs created in this instance are not real and will not resolve.
+  label: Dataverse Sandbox (use only for testing purposes)
+  url: https://demo.dataverse.org
+  token: ${user.user_vault.read_secret('preferences/dataverse_sandbox/token')}
+  public_name: ${user.preferences['dataverse_sandbox|public_name']}
+  writable: true
+
 # Note for developers: you can easily set up a minimal, dockerized Onedata environment
 # using the so-called "demo-mode": https://onedata.org/#/home/documentation/topic/stable/demo-mode
 - type: onedata
diff --git a/lib/galaxy/config/sample/user_preferences_extra_conf.yml.sample b/lib/galaxy/config/sample/user_preferences_extra_conf.yml.sample
index 9f38f40bba1d..fb73fe16c2ea 100644
--- a/lib/galaxy/config/sample/user_preferences_extra_conf.yml.sample
+++ b/lib/galaxy/config/sample/user_preferences_extra_conf.yml.sample
@@ -135,6 +135,32 @@ preferences:
         label: Creator name to associate with new records (formatted as "Last name, First name"). If left blank "Anonymous Galaxy User" will be used. You can always change this by editing your record directly.
         type: text
         required: False
+
+    dataverse:
+        description: Your Dataverse Integration Settings
+        inputs:
+            - name: token
+              label: API Token used to create draft datasets and to upload files. You can manage your tokens at https://YOUR_INSTANCE/dataverseuser.xhtml?selectTab=apiTokenTab (replace YOUR_INSTANCE with the URL of your Dataverse instance).
+              type: secret
+              # store: vault # Requires setting up vault_config_file in your galaxy.yml
+              required: False
+            - name: public_name
+              label: Creator name to associate with new datasets (formatted as "Last name, First name"). If left blank "Anonymous Galaxy User" will be used. You can always change this by editing your dataset directly.
+              type: text
+              required: False
+
+    dataverse_sandbox:
+        description: Your Dataverse Integration Settings (TESTING ONLY)
+        inputs:
+            - name: token
+              label: API Token used to create draft datasets and to upload files. You can manage your tokens at https://demo.dataverse.org/dataverseuser.xhtml?selectTab=apiTokenTab.
+              type: secret
+              # store: vault # Requires setting up vault_config_file in your galaxy.yml
+              required: False
+            - name: public_name
+              label: Creator name to associate with new datasets (formatted as "Last name, First name"). If left blank "Anonymous Galaxy User" will be used. You can always change this by editing your dataset directly.
+              type: text
+              required: False
 
     # Used in file_sources_conf.yml
     onedata:
diff --git a/lib/galaxy/files/sources/_rdm.py b/lib/galaxy/files/sources/_rdm.py
index 1848cf57cb24..025b24fc684f 100644
--- a/lib/galaxy/files/sources/_rdm.py
+++ b/lib/galaxy/files/sources/_rdm.py
@@ -25,9 +25,10 @@ class RDMFilesSourceProperties(FilesSourceProperties):
     public_name: str
 
 
-class RecordFilename(NamedTuple):
-    record_id: str
-    filename: str
+class ContainerAndFileIdentifier(NamedTuple):
+    """The file_identifier could be a filename or a file_id."""
+    container_id: str
+    file_identifier: str
 
 
 class RDMRepositoryInteractor:
@@ -35,6 +36,10 @@ class RDMRepositoryInteractor:
 
     This class is not intended to be used directly, but rather to be subclassed
     by file sources that interact with RDM repositories.
+
+    Different RDM repositories use different terminology, and the same term can mean
+    different things in different repositories. To prevent confusion, these base classes
+    use the abstract term "container" for the entity that holds multiple files.
     """
 
     def __init__(self, repository_url: str, plugin: "RDMFilesSource"):
@@ -54,13 +59,13 @@ def repository_url(self) -> str:
         """
         return self._repository_url
 
-    def to_plugin_uri(self, record_id: str, filename: Optional[str] = None) -> str:
-        """Creates a valid plugin URI to reference the given record_id.
+    def to_plugin_uri(self, container_id: str, filename: Optional[str] = None) -> str:
+        """Creates a valid plugin URI to reference the given container_id.
 
-        If a filename is provided, the URI will reference the specific file in the record."""
+        If a filename is provided, the URI will reference the specific file in the container."""
         raise NotImplementedError()
 
-    def get_records(
+    def get_file_containers(
         self,
         writeable: bool,
         user_context: OptionalUserContext = None,
@@ -69,54 +74,56 @@ def get_file_containers(
         query: Optional[str] = None,
         sort_by: Optional[str] = None,
     ) -> Tuple[List[RemoteDirectory], int]:
-        """Returns the list of records in the repository and the total count of records.
+        """Returns the list of file containers in the repository and the total count of containers.
 
-        If writeable is True, only records that the user can write to will be returned.
+        If writeable is True, only containers that the user can write to will be returned.
         The user_context might be required to authenticate the user in the repository.
         """
         raise NotImplementedError()
 
-    def get_files_in_record(
-        self, record_id: str, writeable: bool, user_context: OptionalUserContext = None
+    def get_files_in_container(
+        self, container_id: str, writeable: bool, user_context: OptionalUserContext = None, query: Optional[str] = None
     ) -> List[RemoteFile]:
-        """Returns the list of files contained in the given record.
+        """Returns the list of files contained in the given container.
 
-        If writeable is True, we are signaling that the user intends to write to the record.
+        If writeable is True, we are signaling that the user intends to write to the container.
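+        The query parameter can be used to filter the returned files by name.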
""" raise NotImplementedError() - def create_draft_record( + def create_draft_file_container( + self, title: str, public_name: Optional[str] = None, user_context: OptionalUserContext = None ): - """Creates a draft record (directory) in the repository with basic metadata. + """Creates a draft file container in the repository with basic metadata. - The metadata is usually just the title of the record and the user that created it. + The metadata is usually just the title of the container and the user that created it. Some plugins might also provide additional metadata defaults in the user settings.""" raise NotImplementedError() - def upload_file_to_draft_record( + def upload_file_to_draft_container( self, - record_id: str, + container_id: str, filename: str, file_path: str, user_context: OptionalUserContext = None, ) -> None: - """Uploads a file with the provided filename (from file_path) to a draft record with the given record_id. + """Uploads a file with the provided filename (from file_path) to a draft container with the given container_id. + + The draft container must have been created in advance with the `create_draft_file_container` method. - The draft record must have been created in advance with the `create_draft_record` method. The file must exist in the file system at the given file_path. The user_context might be required to authenticate the user in the repository. """ raise NotImplementedError() - def download_file_from_record( + def download_file_from_container( self, - record_id: str, - filename: str, + container_id: str, + file_identifier: str, file_path: str, user_context: OptionalUserContext = None, ) -> None: - """Downloads a file with the provided filename from the record with the given record_id. + """Downloads a file with the provided filename from the container with the given container_id. The file will be downloaded to the file system at the given file_path. The user_context might be required to authenticate the user in the repository if the @@ -132,13 +139,11 @@ class RDMFilesSource(BaseFilesSource): by file sources that interact with RDM repositories. A RDM file source is similar to a regular file source, but instead of tree of - files and directories, it provides a (one level) list of records (representing directories) + files and directories, it provides a (one level) list of containers that can contain only files (no subdirectories). - In addition, RDM file sources might need to create a new record (directory) in advance in the - repository, and then upload a file to it. This is done by calling the `create_entry` - method. - + In addition, RDM file sources might need to create a new container in advance in the + repository, and then upload a file to it. This is done by calling the `_create_entry` method. """ plugin_kind = PluginKind.rdm @@ -164,35 +169,16 @@ def get_repository_interactor(self, repository_url: str) -> RDMRepositoryInterac This must be implemented by subclasses.""" raise NotImplementedError() - def parse_path(self, source_path: str, record_id_only: bool = False) -> RecordFilename: - """Parses the given source path and returns the record_id and filename. + def parse_path(self, source_path: str, container_id_only: bool = False) -> ContainerAndFileIdentifier: + """Parses the given source path and returns the container_id and filename. + + If container_id_only is True, an empty filename will be returned. - The source path must have the format '//'. 
- If record_id_only is True, the source path must have the format '/' and an - empty filename will be returned. - """ + This must be implemented by subclasses.""" + raise NotImplementedError() - def get_error_msg(details: str) -> str: - return f"Invalid source path: '{source_path}'. Expected format: '{expected_format}'. {details}" - - expected_format = "/" - if not source_path.startswith("/"): - raise ValueError(get_error_msg("Must start with '/'.")) - parts = source_path[1:].split("/", 2) - if record_id_only: - if len(parts) != 1: - raise ValueError(get_error_msg("Please provide the record_id only.")) - return RecordFilename(record_id=parts[0], filename="") - expected_format = "//" - if len(parts) < 2: - raise ValueError(get_error_msg("Please provide both the record_id and file_name.")) - if len(parts) > 2: - raise ValueError(get_error_msg("Too many parts. Please provide the record_id and file_name only.")) - record_id, file_name = parts - return RecordFilename(record_id=record_id, filename=file_name) - - def get_record_id_from_path(self, source_path: str) -> str: - return self.parse_path(source_path, record_id_only=True).record_id + def get_container_id_from_path(self, source_path: str) -> str: + raise NotImplementedError() def _serialization_props(self, user_context: OptionalUserContext = None): effective_props = {} diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py new file mode 100644 index 000000000000..c0c1d286e9c7 --- /dev/null +++ b/lib/galaxy/files/sources/dataverse.py @@ -0,0 +1,571 @@ +import json +import re +import urllib.request +from urllib.parse import quote + +from typing import ( + Any, + cast, + Dict, + List, + Optional, + Tuple, +) + +from typing_extensions import ( + Literal, + TypedDict, + Unpack, +) + +from galaxy.exceptions import AuthenticationRequired +from galaxy.files import OptionalUserContext +from galaxy.files.sources import ( + AnyRemoteEntry, + DEFAULT_PAGE_LIMIT, + DEFAULT_SCHEME, + Entry, + EntryData, + FilesSourceOptions, + RemoteDirectory, + RemoteFile, +) +from galaxy.files.sources._rdm import ( + RDMFilesSource, + RDMFilesSourceProperties, + RDMRepositoryInteractor, + ContainerAndFileIdentifier, +) +from galaxy.util import ( + DEFAULT_SOCKET_TIMEOUT, + get_charset_from_http_headers, + requests, + stream_to_open_named_file, +) + +class NotFoundException(Exception): + def __init__(self, message): + super().__init__(message) + +class DataverseDataset(TypedDict): + name: str + type: str + url: str + global_id: str + description: str + published_at: str + storageIdentifier: str + fileCount: int + versionState: str + createdAt: str + updatedAt: str + publication_date: str + + +class DataverseRDMFilesSource(RDMFilesSource): + """A files source for Dataverse turn-key research data management repository. 
+
+    In Dataverse, a "dataset" represents what the RDM base classes refer to as a container.
+    """
+
+    plugin_type = "dataverse"
+    supports_pagination = True
+    supports_search = True
+
+    def __init__(self, **kwd: Unpack[RDMFilesSourceProperties]):
+        super().__init__(**kwd)
+        self._scheme_regex = re.compile(rf"^{self.get_scheme()}?://{self.id}|^{DEFAULT_SCHEME}://{self.id}")
+        self.repository: DataverseRepositoryInteractor
+
+    def get_scheme(self) -> str:
+        return "dataverse"
+
+    def score_url_match(self, url: str) -> int:
+        if match := self._scheme_regex.match(url):
+            return match.span()[1]
+        else:
+            return 0
+
+    def to_relative_path(self, url: str) -> str:
+        legacy_uri_root = f"{DEFAULT_SCHEME}://{self.id}"
+        if url.startswith(legacy_uri_root):
+            return url[len(legacy_uri_root) :]
+        else:
+            return super().to_relative_path(url)
+
+    def get_repository_interactor(self, repository_url: str) -> RDMRepositoryInteractor:
+        return DataverseRepositoryInteractor(repository_url, self)
+
+    def parse_path(self, source_path: str, container_id_only: bool = False) -> ContainerAndFileIdentifier:
+        """Parses the given source path and returns the dataset_id and/or the file_id.
+
+        The source path must have the format '/<dataset_id>' or '/<dataset_id>/<file_id>',
+        where <dataset_id> is a prefix of <file_id>.
+        If container_id_only is True, the source path must have the format '/<dataset_id>'
+        and an empty file_id will be returned.
+
+        Example dataset_id:
+        doi:10.70122/FK2/DIG2DG
+
+        Example file_id:
+        doi:10.70122/FK2/DIG2DG/AVNCLL
+        """
+        if not source_path.startswith("/"):
+            raise ValueError(f"Invalid source path: '{source_path}'. Must start with '/'.")
+
+        parts = source_path[1:].split("/", 3)
+        dataset_id = "/".join(parts[:3])
+
+        if container_id_only:
+            if len(parts) != 3:
+                raise ValueError(f"Invalid source path: '{source_path}'. Expected format: '/<dataset_id>'.")
+            return ContainerAndFileIdentifier(container_id=dataset_id, file_identifier="")
+
+        if len(parts) != 4:
+            raise ValueError(f"Invalid source path: '{source_path}'. Expected format: '/<dataset_id>/<file_id>'.")
+
+        file_id = dataset_id + "/" + parts[3]
+        return ContainerAndFileIdentifier(container_id=dataset_id, file_identifier=file_id)
+
+    def get_container_id_from_path(self, source_path: str) -> str:
+        return self.parse_path(source_path, container_id_only=True).container_id
+
+    def _list(
+        self,
+        path="/",
+        recursive=True,
+        user_context: OptionalUserContext = None,
+        opts: Optional[FilesSourceOptions] = None,
+        limit: Optional[int] = None,
+        offset: Optional[int] = None,
+        query: Optional[str] = None,
+        sort_by: Optional[str] = None,
+    ) -> Tuple[List[AnyRemoteEntry], int]:
+        """Lists the datasets in the repository, or the files of a dataset, depending on the given path."""
+        writeable = opts and opts.writeable or False
+        is_root_path = path == "/"
+        if is_root_path:
+            datasets, total_hits = self.repository.get_file_containers(
+                writeable, user_context, limit=limit, offset=offset, query=query
+            )
+            return cast(List[AnyRemoteEntry], datasets), total_hits
+        dataset_id = self.get_container_id_from_path(path)
+        files = self.repository.get_files_in_container(dataset_id, writeable, user_context, query)
+        return cast(List[AnyRemoteEntry], files), len(files)
+
+    def _create_entry(
+        self,
+        entry_data: EntryData,
+        user_context: OptionalUserContext = None,
+        opts: Optional[FilesSourceOptions] = None,
+    ) -> Entry:
+        """Creates a draft dataset in the repository."""
+        public_name = self.get_public_name(user_context) or "Anonymous Galaxy User"
+        dataset = self.repository.create_draft_file_container(entry_data.get("name"), public_name, user_context)
+        return {
+            "uri": self.repository.to_plugin_uri(dataset.get("persistentId")),
+            "name": dataset.get("name") or "No title",
+            "external_link": self.repository.public_dataset_url(dataset.get("persistentId")),
+        }
+
+    def _realize_to(
+        self,
+        source_path: str,
+        native_path: str,
+        user_context: OptionalUserContext = None,
+        opts: Optional[FilesSourceOptions] = None,
+    ):
+        """Used when downloading files from Dataverse."""
+        # TODO: user_context is always None here when called from a data fetch. (same problem as in invenio.py)
+        # This prevents downloading files that require authentication even if the user provided a token.
+
+        dataset_id, file_id = self.parse_path(source_path)
+        try:
+            self.repository.download_file_from_container(dataset_id, file_id, native_path, user_context=user_context)
+        except NotFoundException:
+            filename = file_id.split("/")[-1]
+            is_zip_file = self._is_zip_archive(filename)
+            if not is_zip_file:
+                raise
+            # Workaround explanation:
+            # When we archive our history to Dataverse, the zip sent from Galaxy to Dataverse is extracted automatically.
+            # Only the contents are stored, not the zip itself.
+            # So, if a zip is not found, we assume we are trying to reimport an archived history
+            # and make an API call to Dataverse to download the dataset as a zip.
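+            # A sketch of the request this helper issues (the URL is built by
+            # download_dataset_as_zip_url in the interactor below), assuming a
+            # published dataset:
+            #   GET {api_base_url}/access/dataset/:persistentId/?persistentId={dataset_id}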
+            self.repository._download_dataset_as_zip(dataset_id, native_path, user_context)
+
+    def _is_zip_archive(self, file_name: str) -> bool:
+        return file_name.endswith(".zip")
+
+    def _write_from(
+        self,
+        target_path: str,
+        native_path: str,
+        user_context: OptionalUserContext = None,
+        opts: Optional[FilesSourceOptions] = None,
+    ):
+        """Used when uploading files to Dataverse."""
+        dataset_id, file_id = self.parse_path(target_path)
+        self.repository.upload_file_to_draft_container(dataset_id, file_id, native_path, user_context=user_context)
+
+
+class DataverseRepositoryInteractor(RDMRepositoryInteractor):
+    """In Dataverse, a "dataset" represents what the RDM base classes refer to as a container."""
+
+    @property
+    def api_base_url(self) -> str:
+        return f"{self.repository_url}/api/v1"
+
+    @property
+    def search_url(self) -> str:
+        return f"{self.api_base_url}/search"
+
+    def file_access_url(self, file_id: str) -> str:
+        encoded_file_id = quote(file_id, safe="")
+        return f"{self.api_base_url}/access/datafile/:persistentId?persistentId={encoded_file_id}"
+
+    def files_of_dataset_url(self, dataset_id: str, dataset_version: str = ":latest") -> str:
+        return f"{self.api_base_url}/datasets/:persistentId/versions/{dataset_version}/files?persistentId={dataset_id}"
+
+    def create_collection_url(self, parent_alias: str) -> str:
+        return f"{self.api_base_url}/dataverses/{parent_alias}"
+
+    def create_dataset_url(self, parent_alias: str) -> str:
+        return f"{self.api_base_url}/dataverses/{parent_alias}/datasets"
+
+    def download_dataset_as_zip_url(self, dataset_id: str) -> str:
+        return f"{self.api_base_url}/access/dataset/:persistentId/?persistentId={dataset_id}"
+
+    def add_files_to_dataset_url(self, dataset_id: str) -> str:
+        return f"{self.api_base_url}/datasets/:persistentId/add?persistentId={dataset_id}"
+
+    def public_dataset_url(self, dataset_id: str) -> str:
+        return f"{self.repository_url}/dataset.xhtml?persistentId={dataset_id}"
+
+    def to_plugin_uri(self, dataset_id: str, file_identifier: Optional[str] = None) -> str:
+        return f"{self.plugin.get_uri_root()}/{file_identifier if file_identifier else dataset_id}"
+
+    def _is_api_url(self, url: str) -> bool:
+        return "/api/" in url
+
+    def get_file_containers(
+        self,
+        writeable: bool,
+        user_context: OptionalUserContext = None,
+        limit: Optional[int] = None,
+        offset: Optional[int] = None,
+        query: Optional[str] = None,
+        sort_by: Optional[str] = None,
+    ) -> Tuple[List[RemoteDirectory], int]:
+        """Lists the Dataverse datasets in the repository."""
+        request_url = self.search_url
+        params = {
+            "type": "dataset",
+            "per_page": limit or DEFAULT_PAGE_LIMIT,
+            "start": offset,
+            "q": f"title:{query}" if query else "*",
+            "sort": sort_by or "date",
+        }
+        if writeable:
+            params["fq"] = "publicationStatus:Draft"
+        response_data = self._get_response(user_context, request_url, params)
+        total_hits = response_data["data"]["total_count"]
+        return self._get_datasets_from_response(response_data["data"]), total_hits
+
+    def get_files_in_container(
+        self,
+        dataset_id: str,
+        writeable: bool,
+        user_context: OptionalUserContext = None,
+        query: Optional[str] = None,
+    ) -> List[RemoteFile]:
+        """Lists the files in a Dataverse dataset, optionally filtered by filename."""
+        request_url = self.files_of_dataset_url(dataset_id=dataset_id)
+        response_data = self._get_response(user_context, request_url)
+        files = self._get_files_from_response(dataset_id, response_data["data"])
+        files = self._filter_files_by_name(files, query)
+        return files
+
+    def _filter_files_by_name(self, files: List[RemoteFile], query: Optional[str] = None) -> List[RemoteFile]:
+        if not query:
+            return files
+        return [file for file in files if query in file["name"]]
+
+    def create_draft_file_container(
+        self, title: str, public_name: str, user_context: OptionalUserContext = None
+    ) -> RemoteDirectory:
+        """Creates a draft dataset in the repository.
+
+        Dataverse datasets are contained in collections, and collections can be nested.
+        We create a collection inside the root collection and then a dataset inside that collection.
+        """
+        collection_payload = self._prepare_collection_data(title, public_name, user_context)
+        collection = self._create_collection(":root", collection_payload, user_context)
+        if collection and collection.get("data"):
+            collection_alias = collection["data"].get("alias")
+        else:
+            raise Exception("Could not create collection in Dataverse or the response has an unexpected format.")
+        dataset_payload = self._prepare_dataset_data(title, public_name, user_context)
+        dataset = self._create_dataset(collection_alias, dataset_payload, user_context)
+        if dataset and dataset.get("data"):
+            dataset["data"]["name"] = title
+            return dataset["data"]
+        else:
+            raise Exception("Could not create dataset in Dataverse or the response has an unexpected format.")
+
+    def _create_collection(
+        self, parent_alias: str, collection_payload: str, user_context: OptionalUserContext = None
+    ) -> dict:
+        headers = self._get_request_headers(user_context, auth_required=True)
+        response = requests.post(self.create_collection_url(parent_alias), data=collection_payload, headers=headers)
+        self._ensure_response_has_expected_status_code(response, 201)
+        return response.json()
+
+    def _create_dataset(
+        self, parent_alias: str, dataset_payload: str, user_context: OptionalUserContext = None
+    ) -> dict:
+        headers = self._get_request_headers(user_context, auth_required=True)
+        response = requests.post(self.create_dataset_url(parent_alias), data=dataset_payload, headers=headers)
+        self._ensure_response_has_expected_status_code(response, 201)
+        return response.json()
+
+    def upload_file_to_draft_container(
+        self,
+        dataset_id: str,
+        filename: str,
+        file_path: str,
+        user_context: OptionalUserContext = None,
+    ):
+        """Uploads a file to a draft dataset in the repository."""
+        headers = self._get_request_headers(user_context, auth_required=True)
+
+        with open(file_path, "rb") as file:
+            # TODO: For some reason tar.gz files are not uploaded successfully to Dataverse.
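+            # The multipart request below targets Dataverse's add-file endpoint
+            # (the URL is built by add_files_to_dataset_url above):
+            #   POST {api_base_url}/datasets/:persistentId/add?persistentId={dataset_id}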
+            files = {"file": (filename, file)}
+            add_files_url = self.add_files_to_dataset_url(dataset_id)
+            response = requests.post(add_files_url, files=files, headers=headers)
+            self._ensure_response_has_expected_status_code(response, 200)
+
+    def download_file_from_container(
+        self,
+        container_id: str,
+        file_identifier: str,
+        file_path: str,
+        user_context: OptionalUserContext = None,
+    ):
+        download_file_content_url = self.file_access_url(file_identifier)
+        self._download_file(file_path, download_file_content_url, user_context)
+
+    def _download_dataset_as_zip(self, dataset_id: str, file_path: str, user_context: OptionalUserContext = None):
+        download_dataset_url = self.download_dataset_as_zip_url(dataset_id)
+        self._download_file(file_path, download_dataset_url, user_context)
+
+    def _download_file(
+        self,
+        file_path: str,
+        download_file_content_url: str,
+        user_context: OptionalUserContext = None,
+    ):
+        headers = {}
+
+        if self._is_api_url(download_file_content_url):
+            # pass the token as a header only when using the API
+            headers = self._get_request_headers(user_context)
+        try:
+            req = urllib.request.Request(download_file_content_url, headers=headers)
+            with urllib.request.urlopen(req, timeout=DEFAULT_SOCKET_TIMEOUT) as page:
+                f = open(file_path, "wb")
+                return stream_to_open_named_file(
+                    page, f.fileno(), file_path, source_encoding=get_charset_from_http_headers(page.headers)
+                )
+        except urllib.error.HTTPError as e:
+            # TODO: We can only download files from published datasets for now
+            if e.code in [401, 403, 404]:
+                raise NotFoundException(
+                    f"Cannot download file from URL '{download_file_content_url}'. Please make sure the dataset and/or file exists and it is public."
+                )
+            raise
+
+    def _get_datasets_from_response(self, response: dict) -> List[RemoteDirectory]:
+        rval: List[RemoteDirectory] = []
+        for dataset in response["items"]:
+            uri = self.to_plugin_uri(dataset_id=dataset["global_id"])
+            rval.append(
+                {
+                    "class": "Directory",
+                    "name": dataset.get("name") or "No title",
+                    "uri": uri,
+                    "path": self.plugin.to_relative_path(uri),
+                }
+            )
+        return rval
+
+    def _get_files_from_response(self, dataset_id: str, response: dict) -> List[RemoteFile]:
+        rval: List[RemoteFile] = []
+        for entry in response:
+            dataFile = entry.get("dataFile")
+            uri = self.to_plugin_uri(dataset_id, dataFile.get("persistentId"))
+            rval.append(
+                {
+                    "class": "File",
+                    "name": dataFile.get("filename"),
+                    "size": dataFile.get("filesize"),
+                    "ctime": dataFile.get("creationDate"),
+                    "uri": uri,
+                    "path": self.plugin.to_relative_path(uri),
+                }
+            )
+        return rval
+
+    def _get_response(
+        self,
+        user_context: OptionalUserContext,
+        request_url: str,
+        params: Optional[Dict[str, Any]] = None,
+        auth_required: bool = False,
+    ) -> dict:
+        headers = self._get_request_headers(user_context, auth_required)
+        response = requests.get(request_url, params=params, headers=headers)
+        self._ensure_response_has_expected_status_code(response, 200)
+        return response.json()
+
+    def _get_request_headers(self, user_context: OptionalUserContext, auth_required: bool = False):
+        token = self.plugin.get_authorization_token(user_context)
+        headers = {"X-Dataverse-Key": f"{token}"} if token else {}
+        if auth_required and token is None:
+            self._raise_auth_required()
+        return headers
+
+    def _ensure_response_has_expected_status_code(self, response, expected_status_code: int):
+        if response.status_code != expected_status_code:
+            if response.status_code == 403:
+                self._raise_auth_required()
+            error_message = self._get_response_error_message(response)
+            raise Exception(
+                f"Request to {response.url} failed with status code {response.status_code}: {error_message}"
+            )
+
+    def _raise_auth_required(self):
+        raise AuthenticationRequired(
+            f"Please provide a personal access token in your user's preferences for '{self.plugin.label}'"
+        )
+
+    def _get_response_error_message(self, response):
+        response_json = response.json()
+        error_message = response_json.get("message") if response.status_code == 400 else response.text
+        errors = response_json.get("errors", [])
+        for error in errors:
+            error_message += f"\n{json.dumps(error)}"
+        return error_message
+
+    def _get_user_email(self, user_context: OptionalUserContext = None) -> str:
+        return user_context.email if user_context and user_context.email else "enteryourmail@placeholder.com"
+
+    def _create_valid_alias(self, public_name: str, title: str) -> str:
+        return re.sub(
+            r"[^a-zA-Z0-9-_]", "", public_name.lower().replace(" ", "-") + "_" + title.lower().replace(" ", "-")
+        )
+
+    def _prepare_collection_data(
+        self,
+        title: str,
+        public_name: str,
+        user_context: OptionalUserContext = None,
+    ) -> str:
+        return json.dumps(
+            {
+                "name": title,
+                "alias": self._create_valid_alias(public_name, title),
+                "dataverseContacts": [
+                    {"contactEmail": self._get_user_email(user_context)},
+                ],
+            }
+        )
+
+    def _prepare_dataset_data(
+        self,
+        title: str,
+        public_name: str,
+        user_context: OptionalUserContext = None,
+    ) -> str:
+        """Prepares the dataset payload with all the metadata fields Dataverse requires."""
+        user_email = self._get_user_email(user_context)
+        author_name = public_name
+        dataset_data = {
+            "datasetVersion": {
+                "license": {
+                    "name": "CC0 1.0",
+                    "uri": "http://creativecommons.org/publicdomain/zero/1.0",
+                },
+                "metadataBlocks": {
+                    "citation": {
+                        "fields": [
+                            {
+                                "value": title,
+                                "typeClass": "primitive",
+                                "multiple": False,
+                                "typeName": "title",
+                            },
+                            {
+                                "value": [
+                                    {
+                                        "authorName": {
+                                            "value": author_name,
+                                            "typeClass": "primitive",
+                                            "multiple": False,
+                                            "typeName": "authorName",
+                                        }
+                                    }
+                                ],
+                                "typeClass": "compound",
+                                "multiple": True,
+                                "typeName": "author",
+                            },
+                            {
+                                "value": [
+                                    {
+                                        "datasetContactEmail": {
+                                            "typeClass": "primitive",
+                                            "multiple": False,
+                                            "typeName": "datasetContactEmail",
+                                            "value": user_email,
+                                        },
+                                        "datasetContactName": {
+                                            "typeClass": "primitive",
+                                            "multiple": False,
+                                            "typeName": "datasetContactName",
+                                            "value": author_name,
+                                        },
+                                    }
+                                ],
+                                "typeClass": "compound",
+                                "multiple": True,
+                                "typeName": "datasetContact",
+                            },
+                            {
+                                "value": [
+                                    {
+                                        "dsDescriptionValue": {
+                                            "value": "Exported history from Galaxy",
+                                            "multiple": False,
+                                            "typeClass": "primitive",
+                                            "typeName": "dsDescriptionValue",
+                                        }
+                                    }
+                                ],
+                                "typeClass": "compound",
+                                "multiple": True,
+                                "typeName": "dsDescription",
+                            },
+                            {
+                                "value": ["Medicine, Health and Life Sciences"],
+                                "typeClass": "controlledVocabulary",
+                                "multiple": True,
+                                "typeName": "subject",
+                            },
+                        ],
+                        "displayName": "Citation Metadata",
+                    }
+                },
+            }
+        }
+        return json.dumps(dataset_data)
+
+
+__all__ = ("DataverseRDMFilesSource",)
\ No newline at end of file
diff --git a/lib/galaxy/files/sources/invenio.py b/lib/galaxy/files/sources/invenio.py
index 146d63d0b641..b99e2ca019f8 100644
--- a/lib/galaxy/files/sources/invenio.py
+++ b/lib/galaxy/files/sources/invenio.py
@@ -34,6 +34,7 @@
     RDMFilesSource,
     RDMFilesSourceProperties,
     RDMRepositoryInteractor,
+    ContainerAndFileIdentifier,
 )
 from galaxy.util import (
     DEFAULT_SOCKET_TIMEOUT,
@@ -117,7 +118,10 @@ class InvenioRecord(TypedDict):
 
 
 class InvenioRDMFilesSource(RDMFilesSource):
-    """A files source for Invenio turn-key research data management repository."""
+    """A files source for the Invenio turn-key research data management repository.
+
+    In Invenio, a "record" represents what the RDM base classes refer to as a container.
+    """
 
     plugin_type = "inveniordm"
     supports_pagination = True
@@ -126,11 +130,12 @@ class InvenioRDMFilesSource(RDMFilesSource):
     def __init__(self, **kwd: Unpack[RDMFilesSourceProperties]):
         super().__init__(**kwd)
         self._scheme_regex = re.compile(rf"^{self.get_scheme()}?://{self.id}|^{DEFAULT_SCHEME}://{self.id}")
+        self.repository: InvenioRepositoryInteractor
 
     def get_scheme(self) -> str:
         return "invenio"
 
-    def score_url_match(self, url: str):
+    def score_url_match(self, url: str) -> int:
         if match := self._scheme_regex.match(url):
             return match.span()[1]
         else:
             return 0
@@ -145,6 +150,36 @@ def to_relative_path(self, url: str) -> str:
 
     def get_repository_interactor(self, repository_url: str) -> RDMRepositoryInteractor:
         return InvenioRepositoryInteractor(repository_url, self)
+
+    def parse_path(self, source_path: str, container_id_only: bool = False) -> ContainerAndFileIdentifier:
+        """Parses the given source path and returns the record_id and filename.
+
+        The source path must have the format '/<record_id>/<filename>'.
+        If container_id_only is True, the source path must have the format '/<record_id>' and an empty filename will be returned.
+        """
+
+        def get_error_msg(details: str) -> str:
+            return f"Invalid source path: '{source_path}'. Expected format: '{expected_format}'. {details}"
+
+        expected_format = "/<record_id>"
+        if not source_path.startswith("/"):
+            raise ValueError(get_error_msg("Must start with '/'."))
+        parts = source_path[1:].split("/", 2)
+        if container_id_only:
+            if len(parts) != 1:
+                raise ValueError(get_error_msg("Please provide the record_id only."))
+            return ContainerAndFileIdentifier(container_id=parts[0], file_identifier="")
+        expected_format = "/<record_id>/<file_name>"
+        if len(parts) < 2:
+            raise ValueError(get_error_msg("Please provide both the record_id and file_name."))
+        if len(parts) > 2:
+            # TODO: This causes downloads to crash if the filename contains a slash
+            raise ValueError(get_error_msg("Too many parts. Please provide the record_id and file_name only."))
+        record_id, file_name = parts
+        return ContainerAndFileIdentifier(container_id=record_id, file_identifier=file_name)
+
+    def get_container_id_from_path(self, source_path: str) -> str:
+        return self.parse_path(source_path, container_id_only=True).container_id
 
     def _list(
         self,
@@ -160,12 +195,12 @@ def _list(
         writeable = opts and opts.writeable or False
         is_root_path = path == "/"
         if is_root_path:
-            records, total_hits = self.repository.get_records(
+            records, total_hits = self.repository.get_file_containers(
                 writeable, user_context, limit=limit, offset=offset, query=query
             )
             return cast(List[AnyRemoteEntry], records), total_hits
-        record_id = self.get_record_id_from_path(path)
-        files = self.repository.get_files_in_record(record_id, writeable, user_context)
+        record_id = self.get_container_id_from_path(path)
+        files = self.repository.get_files_in_container(record_id, writeable, user_context)
         return cast(List[AnyRemoteEntry], files), len(files)
 
     def _create_entry(
@@ -175,7 +210,7 @@ def _create_entry(
         opts: Optional[FilesSourceOptions] = None,
     ) -> Entry:
         public_name = self.get_public_name(user_context)
-        record = self.repository.create_draft_record(entry_data["name"], public_name, user_context=user_context)
+        record = self.repository.create_draft_file_container(entry_data["name"], public_name, user_context=user_context)
         return {
             "uri": self.repository.to_plugin_uri(record["id"]),
             "name": record["title"],
@@ -191,9 +226,8 @@ def _realize_to(
     ):
         # TODO: user_context is always None here when called from a data fetch.
         # This prevents downloading files that require authentication even if the user provided a token.
         record_id, filename = self.parse_path(source_path)
-        self.repository.download_file_from_record(record_id, filename, native_path, user_context=user_context)
+        self.repository.download_file_from_container(record_id, filename, native_path, user_context=user_context)
 
     def _write_from(
         self,
@@ -203,10 +237,12 @@ def _write_from(
         opts: Optional[FilesSourceOptions] = None,
     ):
         record_id, filename = self.parse_path(target_path)
-        self.repository.upload_file_to_draft_record(record_id, filename, native_path, user_context=user_context)
+        self.repository.upload_file_to_draft_container(record_id, filename, native_path, user_context=user_context)
 
 
 class InvenioRepositoryInteractor(RDMRepositoryInteractor):
+    """In Invenio, a "record" represents what the RDM base classes refer to as a container."""
+
     @property
     def records_url(self) -> str:
         return f"{self.repository_url}/api/records"
@@ -218,7 +254,7 @@ def user_records_url(self) -> str:
     def to_plugin_uri(self, record_id: str, filename: Optional[str] = None) -> str:
         return f"{self.plugin.get_uri_root()}/{record_id}{f'/{filename}' if filename else ''}"
 
-    def get_records(
+    def get_file_containers(
         self,
         writeable: bool,
         user_context: OptionalUserContext = None,
@@ -227,6 +263,7 @@ def get_records(
         query: Optional[str] = None,
         sort_by: Optional[str] = None,
     ) -> Tuple[List[RemoteDirectory], int]:
+        """Returns the list of records in the repository and the total count of records."""
         params: Dict[str, Any] = {}
         request_url = self.records_url
         if writeable:
@@ -250,15 +287,15 @@ def _to_size_page(self, limit: Optional[int], offset: Optional[int]) -> Tuple[Op
         page = (offset or 0) // size + 1
         return size, page
 
-    def get_files_in_record(
-        self, record_id: str, writeable: bool, user_context: OptionalUserContext = None
+    def get_files_in_container(
+        self, container_id: str, writeable: bool, user_context: OptionalUserContext = None
     ) -> List[RemoteFile]:
         conditionally_draft = "/draft" if writeable else ""
-        request_url = f"{self.records_url}/{record_id}{conditionally_draft}/files"
+        request_url = f"{self.records_url}/{container_id}{conditionally_draft}/files"
         response_data = self._get_response(user_context, request_url)
-        return self._get_record_files_from_response(record_id, response_data)
+        return self._get_record_files_from_response(container_id, response_data)
 
-    def create_draft_record(
+    def create_draft_file_container(
         self, title: str, public_name: Optional[str] = None, user_context: OptionalUserContext = None
     ) -> RemoteDirectory:
         today = datetime.date.today().isoformat()
@@ -282,7 +319,7 @@ def create_draft_file_container(
         record["title"] = self._get_record_title(record)
         return record
 
-    def upload_file_to_draft_record(
+    def upload_file_to_draft_container(
         self,
         record_id: str,
         filename: str,
@@ -310,14 +347,14 @@ def upload_file_to_draft_container(
         response = requests.post(commit_file_upload_url, headers=headers)
         self._ensure_response_has_expected_status_code(response, 200)
 
-    def download_file_from_record(
+    def download_file_from_container(
         self,
-        record_id: str,
-        filename: str,
+        container_id: str,
+        file_identifier: str,
         file_path: str,
         user_context: OptionalUserContext = None,
     ):
-        download_file_content_url = self._get_download_file_url(record_id, filename, user_context)
+        download_file_content_url = self._get_download_file_url(container_id, file_identifier, user_context)
         headers = {}
         if self._is_api_url(download_file_content_url):
             # pass the token as a header only when using the API
@@ -333,7 +370,7 @@ def download_file_from_container(
             # TODO: We can only download files from published records for now
             if e.code in [401, 403, 404]:
                 raise Exception(
-                    f"Cannot download file '{filename}' from record '{record_id}'. Please make sure the record exists and it is public."
+                    f"Cannot download file '{file_identifier}' from record '{container_id}'. Please make sure the record exists and it is public."
                 )
 
     def _get_download_file_url(self, record_id: str, filename: str, user_context: OptionalUserContext = None):
diff --git a/lib/galaxy/tools/parameters/grouping.py b/lib/galaxy/tools/parameters/grouping.py
index 5a0e256fcbf2..123bf528b71e 100644
--- a/lib/galaxy/tools/parameters/grouping.py
+++ b/lib/galaxy/tools/parameters/grouping.py
@@ -53,6 +53,7 @@
         "drs",
         "invenio",
         "zenodo",
+        "dataverse",
     ]
 ]