From 8d49f12e9261b42265f3e0595b8493b39d0c3637 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Thu, 21 Nov 2024 12:18:40 +0100 Subject: [PATCH 01/64] chore: ignore vsc workspace file --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index c907531198ef..67bf620e983f 100644 --- a/.gitignore +++ b/.gitignore @@ -130,6 +130,7 @@ tool_test_output.json client/**/jsconfig.json vetur.config.js .pre-commit-config.yaml +galaxy.code-workspace # Chrom len files *.len From f0a21dc03a602f202fcdce129997cd1634433f4f Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Thu, 21 Nov 2024 12:20:06 +0100 Subject: [PATCH 02/64] feat: initial template for dataverse integration --- lib/galaxy/files/dataverse.py | 288 ++++++++++++++++++++++++++++++++++ 1 file changed, 288 insertions(+) create mode 100644 lib/galaxy/files/dataverse.py diff --git a/lib/galaxy/files/dataverse.py b/lib/galaxy/files/dataverse.py new file mode 100644 index 000000000000..2406bfee9081 --- /dev/null +++ b/lib/galaxy/files/dataverse.py @@ -0,0 +1,288 @@ +import datetime +import json +import re +import urllib.request +from typing import ( + Any, + cast, + Dict, + List, + Optional, + Tuple, +) +from urllib.parse import quote + +from typing_extensions import ( + Literal, + TypedDict, + Unpack, +) + +from galaxy.exceptions import AuthenticationRequired +from galaxy.files import OptionalUserContext +from galaxy.files.sources import ( + AnyRemoteEntry, + DEFAULT_PAGE_LIMIT, + DEFAULT_SCHEME, + Entry, + EntryData, + FilesSourceOptions, + RemoteDirectory, + RemoteFile, +) +from galaxy.files.sources._rdm import ( + RDMFilesSource, + RDMFilesSourceProperties, + RDMRepositoryInteractor, +) +from galaxy.util import ( + DEFAULT_SOCKET_TIMEOUT, + get_charset_from_http_headers, + requests, + stream_to_open_named_file, +) + +AccessStatus = Literal["public", "restricted"] + +class DataverseRDMFilesSource(RDMFilesSource): + """A files source for Dataverse turn-key research data management 
repository.""" + + plugin_type = "dataverserdm" + # TODO supports_pagination = True + # TODO supports_search = True + + def __init__(self, **kwd: Unpack[RDMFilesSourceProperties]): + super().__init__(**kwd) + self._scheme_regex = re.compile(rf"^{self.get_scheme()}?://{self.id}|^{DEFAULT_SCHEME}://{self.id}") + + def get_scheme(self) -> str: + return "dataverse" + + # TODO: Test this method + def score_url_match(self, url: str): + if match := self._scheme_regex.match(url): + return match.span()[1] + else: + return 0 + + # TODO: Test this method + def to_relative_path(self, url: str) -> str: + legacy_uri_root = f"{DEFAULT_SCHEME}://{self.id}" + if url.startswith(legacy_uri_root): + return url[len(legacy_uri_root) :] + else: + return super().to_relative_path(url) + + def get_repository_interactor(self, repository_url: str) -> RDMRepositoryInteractor: + return DataverseRepositoryInteractor(repository_url, self) + + def _list( + self, + path="/", + recursive=True, + user_context: OptionalUserContext = None, + opts: Optional[FilesSourceOptions] = None, + limit: Optional[int] = None, + offset: Optional[int] = None, + query: Optional[str] = None, + sort_by: Optional[str] = None, + ) -> Tuple[List[AnyRemoteEntry], int]: + # TODO: Implement this for Dataverse + pass + + def _create_entry( + self, + entry_data: EntryData, + user_context: OptionalUserContext = None, + opts: Optional[FilesSourceOptions] = None, + ) -> Entry: + # TODO: Implement this for Dataverse + pass + + # TODO: Test this method + def _realize_to( + self, + source_path: str, + native_path: str, + user_context: OptionalUserContext = None, + opts: Optional[FilesSourceOptions] = None, + ): + # TODO: user_context is always None here when called from a data fetch. + # This prevents downloading files that require authentication even if the user provided a token. 
+ + record_id, filename = self.parse_path(source_path) + self.repository.download_file_from_record(record_id, filename, native_path, user_context=user_context) + + # TODO: Test this method + def _write_from( + self, + target_path: str, + native_path: str, + user_context: OptionalUserContext = None, + opts: Optional[FilesSourceOptions] = None, + ): + record_id, filename = self.parse_path(target_path) + self.repository.upload_file_to_draft_record(record_id, filename, native_path, user_context=user_context) + +class DataverseRepositoryInteractor(RDMRepositoryInteractor): + # TODO: Implement this property for Dataverse? + # @property + # def records_url(self) -> str: + # return f"{self.repository_url}/api/records" + + # TODO: Implement this property for Dataverse? + # @property + # def user_records_url(self) -> str: + # return f"{self.repository_url}/api/user/records" + + # TODO: Test this method + def to_plugin_uri(self, record_id: str, filename: Optional[str] = None) -> str: + return f"{self.plugin.get_uri_root()}/{record_id}{f'/{filename}' if filename else ''}" + + def get_records( + self, + writeable: bool, + user_context: OptionalUserContext = None, + limit: Optional[int] = None, + offset: Optional[int] = None, + query: Optional[str] = None, + sort_by: Optional[str] = None, + ) -> Tuple[List[RemoteDirectory], int]: + # TODO: Implement this for Dataverse + pass + + def _to_size_page(self, limit: Optional[int], offset: Optional[int]) -> Tuple[Optional[int], Optional[int]]: + # TODO: Implement this for Dataverse + pass + + def get_files_in_record( + self, record_id: str, writeable: bool, user_context: OptionalUserContext = None + ) -> List[RemoteFile]: + # TODO: Implement this for Dataverse + pass + + def create_draft_record( + self, title: str, public_name: Optional[str] = None, user_context: OptionalUserContext = None + ) -> RemoteDirectory: + # TODO: Implement this for Dataverse + pass + + def upload_file_to_draft_record( + self, + record_id: str, + filename: str, 
+ file_path: str, + user_context: OptionalUserContext = None, + ): + # TODO: Implement this for Dataverse + pass + + def download_file_from_record( + self, + record_id: str, + filename: str, + file_path: str, + user_context: OptionalUserContext = None, + ): + # TODO: Implement this for Dataverse + pass + + def _get_download_file_url(self, record_id: str, filename: str, user_context: OptionalUserContext = None): + """Get the URL to download a file from a record. + + This method is used to download files from both published and draft records that are accessible by the user. + """ + # TODO: Implement this for Dataverse + pass + + # TODO: Test this method + def _is_api_url(self, url: str) -> bool: + return "/api/" in url + + # TODO: Test this method + def _to_draft_url(self, url: str) -> str: + return url.replace("/files/", "/draft/files/") + + def _can_download_from_api(self, file_details: dict) -> bool: + # TODO: Have a look at this problem + + # Only files stored locally seems to be fully supported by the API for now + # More info: https://inveniordm.docs.cern.ch/reference/file_storage/ + return file_details["storage_class"] == "L" + + def _is_draft_record(self, record_id: str, user_context: OptionalUserContext = None): + # TODO: Implement this for Dataverse + pass + + def _get_draft_record_url(self, record_id: str): + # TODO: Implement this for Dataverse + pass + + def _get_draft_record(self, record_id: str, user_context: OptionalUserContext = None): + # TODO: Implement this for Dataverse + pass + + def _get_records_from_response(self, response: dict) -> List[RemoteDirectory]: + # TODO: Implement this for Dataverse + pass + + # TODO: Implement this for Dataverse + # def _get_record_title(self, record: InvenioRecord) -> str: + # pass + + # TODO: Implement this for Dataverse + # def _get_record_files_from_response(self, record_id: str, response: dict) -> List[RemoteFile]: + # pass + + # TODO: Implement this for Dataverse + # def _get_creator_from_public_name(self, 
public_name: Optional[str] = None) -> Creator: + # pass + + # TODO: Test this method + def _get_response( + self, + user_context: OptionalUserContext, + request_url: str, + params: Optional[Dict[str, Any]] = None, + auth_required: bool = False, + ) -> dict: + headers = self._get_request_headers(user_context, auth_required) + response = requests.get(request_url, params=params, headers=headers) + self._ensure_response_has_expected_status_code(response, 200) + return response.json() + + # TODO: Test this method + def _get_request_headers(self, user_context: OptionalUserContext, auth_required: bool = False): + token = self.plugin.get_authorization_token(user_context) + headers = {"Authorization": f"Bearer {token}"} if token else {} + if auth_required and token is None: + self._raise_auth_required() + return headers + + # TODO: Test this method + def _ensure_response_has_expected_status_code(self, response, expected_status_code: int): + if response.status_code != expected_status_code: + if response.status_code == 403: + self._raise_auth_required() + error_message = self._get_response_error_message(response) + raise Exception( + f"Request to {response.url} failed with status code {response.status_code}: {error_message}" + ) + + # TODO: Test this method + def _raise_auth_required(self): + raise AuthenticationRequired( + f"Please provide a personal access token in your user's preferences for '{self.plugin.label}'" + ) + + # TODO: Test this method + def _get_response_error_message(self, response): + response_json = response.json() + error_message = response_json.get("message") if response.status_code == 400 else response.text + errors = response_json.get("errors", []) + for error in errors: + error_message += f"\n{json.dumps(error)}" + return error_message + + +__all__ = ("DataverseRDMFilesSource",) \ No newline at end of file From 1359cfeae8a65e381eb4e179c54fada47055b7de Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Thu, 21 Nov 2024 12:24:38 +0100 Subject: [PATCH 03/64] 
feat: prototype of fetching datasets --- lib/galaxy/files/{ => sources}/dataverse.py | 88 ++++++++++++++++----- 1 file changed, 68 insertions(+), 20 deletions(-) rename lib/galaxy/files/{ => sources}/dataverse.py (75%) diff --git a/lib/galaxy/files/dataverse.py b/lib/galaxy/files/sources/dataverse.py similarity index 75% rename from lib/galaxy/files/dataverse.py rename to lib/galaxy/files/sources/dataverse.py index 2406bfee9081..7d99447cc646 100644 --- a/lib/galaxy/files/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -42,14 +42,29 @@ stream_to_open_named_file, ) + +class DataverseDataset(TypedDict): + name: str + type: str + url: str + global_id: str + description: str + published_at: str + storageIdentifier: str + fileCount: int + versionState: str + createdAt: str + updatedAt: str + publication_date: str + AccessStatus = Literal["public", "restricted"] class DataverseRDMFilesSource(RDMFilesSource): """A files source for Dataverse turn-key research data management repository.""" - plugin_type = "dataverserdm" - # TODO supports_pagination = True - # TODO supports_search = True + plugin_type = "dataverse" + supports_pagination = True + supports_search = True def __init__(self, **kwd: Unpack[RDMFilesSourceProperties]): super().__init__(**kwd) @@ -87,8 +102,17 @@ def _list( query: Optional[str] = None, sort_by: Optional[str] = None, ) -> Tuple[List[AnyRemoteEntry], int]: - # TODO: Implement this for Dataverse - pass + '''In Dataverse a "dataset" is equivalent to a "record". 
This method lists the datasets in the repository.''' + writeable = opts and opts.writeable or False + is_root_path = path == "/" + if is_root_path: + records, total_hits = self.repository.get_records( + writeable, user_context, limit=limit, offset=offset, query=query + ) + return cast(List[AnyRemoteEntry], records), total_hits + record_id = self.get_record_id_from_path(path) + files = self.repository.get_files_in_record(record_id, writeable, user_context) + return cast(List[AnyRemoteEntry], files), len(files) def _create_entry( self, @@ -125,10 +149,9 @@ def _write_from( self.repository.upload_file_to_draft_record(record_id, filename, native_path, user_context=user_context) class DataverseRepositoryInteractor(RDMRepositoryInteractor): - # TODO: Implement this property for Dataverse? - # @property - # def records_url(self) -> str: - # return f"{self.repository_url}/api/records" + @property + def search_url(self) -> str: + return f"{self.repository_url}/api/search" # TODO: Implement this property for Dataverse? # @property @@ -148,12 +171,23 @@ def get_records( query: Optional[str] = None, sort_by: Optional[str] = None, ) -> Tuple[List[RemoteDirectory], int]: - # TODO: Implement this for Dataverse - pass - - def _to_size_page(self, limit: Optional[int], offset: Optional[int]) -> Tuple[Optional[int], Optional[int]]: - # TODO: Implement this for Dataverse - pass + '''In Dataverse a "dataset" is equivalent to a "record". This method lists the datasets in the repository.''' + # https://demo.dataverse.org/api/search?q=*&type=dataset&per_page=25&page=1&start=0 + request_url = self.search_url + params: Dict[str, Any] = {} + params["type"] = "dataset" + # if writeable: + # TODO: Do we need this for dataverse? + # Only draft records owned by the user can be written to. 
+ # params["is_published"] = "false" + # request_url = self.user_records_url + params["per_page"] = limit or DEFAULT_PAGE_LIMIT + params["start"] = offset + params["q"] = query or "*" + params["sort"] = sort_by or "date" # can be either "name" or "date" + response_data = self._get_response(user_context, request_url, params=params) + total_hits = response_data["data"]["total_count"] + return self._get_records_from_response(response_data["data"]), total_hits def get_files_in_record( self, record_id: str, writeable: bool, user_context: OptionalUserContext = None @@ -223,12 +257,26 @@ def _get_draft_record(self, record_id: str, user_context: OptionalUserContext = pass def _get_records_from_response(self, response: dict) -> List[RemoteDirectory]: - # TODO: Implement this for Dataverse - pass + '''In Dataverse a "dataset" is equivalent to a "record". This method gets the datasets in the repository.''' + datasets = response["items"] + rval: List[RemoteDirectory] = [] + for dataset in datasets: + uri = self.to_plugin_uri(record_id=dataset["global_id"]) + path = self.plugin.to_relative_path(uri) + name = self._get_record_title(dataset) + rval.append( + { + "class": "Directory", + "name": name, + "uri": uri, + "path": path, + } + ) + return rval - # TODO: Implement this for Dataverse - # def _get_record_title(self, record: InvenioRecord) -> str: - # pass + def _get_record_title(self, record: DataverseDataset) -> str: + title = record.get("name") + return title or "No title" # TODO: Implement this for Dataverse # def _get_record_files_from_response(self, record_id: str, response: dict) -> List[RemoteFile]: From 99073e880e7f662a72aa62b376badaf90b55720d Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Thu, 21 Nov 2024 12:25:49 +0100 Subject: [PATCH 04/64] feat: adding dataverse keys where zenodo and invendio was added --- client/src/utils/upload-payload.js | 1 + lib/galaxy/tools/parameters/grouping.py | 1 + 2 files changed, 2 insertions(+) diff --git 
a/client/src/utils/upload-payload.js b/client/src/utils/upload-payload.js index 2fbca26c06da..4a841c0e1ffb 100644 --- a/client/src/utils/upload-payload.js +++ b/client/src/utils/upload-payload.js @@ -12,6 +12,7 @@ export const URI_PREFIXES = [ "drs://", "invenio://", "zenodo://", + "dataverse://", ]; export function isUrl(content) { diff --git a/lib/galaxy/tools/parameters/grouping.py b/lib/galaxy/tools/parameters/grouping.py index 5a0e256fcbf2..123bf528b71e 100644 --- a/lib/galaxy/tools/parameters/grouping.py +++ b/lib/galaxy/tools/parameters/grouping.py @@ -53,6 +53,7 @@ "drs", "invenio", "zenodo", + "dataverse", ] ] From 06e44247c78622aab36320d3e7b3207c05791549 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 4 Dec 2024 18:31:58 +0100 Subject: [PATCH 05/64] feat: refactor with more abstract naming in base class and migrate functions from rdm base class --- lib/galaxy/files/sources/_rdm.py | 50 +++++++++-------------------- lib/galaxy/files/sources/invenio.py | 46 +++++++++++++++++++++----- 2 files changed, 54 insertions(+), 42 deletions(-) diff --git a/lib/galaxy/files/sources/_rdm.py b/lib/galaxy/files/sources/_rdm.py index 1848cf57cb24..f77a17911495 100644 --- a/lib/galaxy/files/sources/_rdm.py +++ b/lib/galaxy/files/sources/_rdm.py @@ -25,9 +25,10 @@ class RDMFilesSourceProperties(FilesSourceProperties): public_name: str -class RecordFilename(NamedTuple): - record_id: str - filename: str +class ContainerAndFileIdentifier(NamedTuple): + """The file_identifier could be a filename or a file_id.""" + container_id: str + file_identifier: str class RDMRepositoryInteractor: @@ -109,14 +110,14 @@ def upload_file_to_draft_record( """ raise NotImplementedError() - def download_file_from_record( + def download_file_from_container( self, - record_id: str, - filename: str, + container_id: str, + file_identifier: str, file_path: str, user_context: OptionalUserContext = None, ) -> None: - """Downloads a file with the provided filename from the record with the given 
record_id. + """Downloads a file with the provided filename from the container with the given container_id. The file will be downloaded to the file system at the given file_path. The user_context might be required to authenticate the user in the repository if the @@ -164,35 +165,16 @@ def get_repository_interactor(self, repository_url: str) -> RDMRepositoryInterac This must be implemented by subclasses.""" raise NotImplementedError() - def parse_path(self, source_path: str, record_id_only: bool = False) -> RecordFilename: - """Parses the given source path and returns the record_id and filename. + def parse_path(self, source_path: str, container_id_only: bool = False) -> ContainerAndFileIdentifier: + """Parses the given source path and returns the container_id and filename. + + If container_id_only is True, an empty filename will be returned. - The source path must have the format '//'. - If record_id_only is True, the source path must have the format '/' and an - empty filename will be returned. - """ + This must be implemented by subclasses.""" + raise NotImplementedError() - def get_error_msg(details: str) -> str: - return f"Invalid source path: '{source_path}'. Expected format: '{expected_format}'. {details}" - - expected_format = "/" - if not source_path.startswith("/"): - raise ValueError(get_error_msg("Must start with '/'.")) - parts = source_path[1:].split("/", 2) - if record_id_only: - if len(parts) != 1: - raise ValueError(get_error_msg("Please provide the record_id only.")) - return RecordFilename(record_id=parts[0], filename="") - expected_format = "//" - if len(parts) < 2: - raise ValueError(get_error_msg("Please provide both the record_id and file_name.")) - if len(parts) > 2: - raise ValueError(get_error_msg("Too many parts. 
Please provide the record_id and file_name only.")) - record_id, file_name = parts - return RecordFilename(record_id=record_id, filename=file_name) - - def get_record_id_from_path(self, source_path: str) -> str: - return self.parse_path(source_path, record_id_only=True).record_id + def get_container_id_from_path(self, source_path: str) -> str: + raise NotImplementedError() def _serialization_props(self, user_context: OptionalUserContext = None): effective_props = {} diff --git a/lib/galaxy/files/sources/invenio.py b/lib/galaxy/files/sources/invenio.py index 146d63d0b641..d58f2284a8a6 100644 --- a/lib/galaxy/files/sources/invenio.py +++ b/lib/galaxy/files/sources/invenio.py @@ -34,6 +34,7 @@ RDMFilesSource, RDMFilesSourceProperties, RDMRepositoryInteractor, + ContainerAndFileIdentifier, ) from galaxy.util import ( DEFAULT_SOCKET_TIMEOUT, @@ -145,6 +146,36 @@ def to_relative_path(self, url: str) -> str: def get_repository_interactor(self, repository_url: str) -> RDMRepositoryInteractor: return InvenioRepositoryInteractor(repository_url, self) + + def parse_path(self, source_path: str, container_id_only: bool = False) -> ContainerAndFileIdentifier: + """Parses the given source path and returns the record_id and filename. + + The source path must have the format '//'. + If container_id_only is True, the source path must have the format '/' and and an empty filename will be returned. + """ + + def get_error_msg(details: str) -> str: + return f"Invalid source path: '{source_path}'. Expected format: '{expected_format}'. 
{details}" + + expected_format = "/" + if not source_path.startswith("/"): + raise ValueError(get_error_msg("Must start with '/'.")) + parts = source_path[1:].split("/", 2) + if container_id_only: + if len(parts) != 1: + raise ValueError(get_error_msg("Please provide the record_id only.")) + return ContainerAndFileIdentifier(container_id=parts[0], file_identifier="") + expected_format = "//" + if len(parts) < 2: + raise ValueError(get_error_msg("Please provide both the record_id and file_name.")) + if len(parts) > 2: + # TODO: This causes downloads to crash if the filename contains a slash + raise ValueError(get_error_msg("Too many parts. Please provide the record_id and file_name only.")) + record_id, file_name = parts + return ContainerAndFileIdentifier(container_id=record_id, file_identifier=file_name) + + def get_container_id_from_path(self, source_path: str) -> str: + return self.parse_path(source_path, container_id_only=True).container_id def _list( self, @@ -164,7 +195,7 @@ def _list( writeable, user_context, limit=limit, offset=offset, query=query ) return cast(List[AnyRemoteEntry], records), total_hits - record_id = self.get_record_id_from_path(path) + record_id = self.get_container_id_from_path(path) files = self.repository.get_files_in_record(record_id, writeable, user_context) return cast(List[AnyRemoteEntry], files), len(files) @@ -191,9 +222,8 @@ def _realize_to( ): # TODO: user_context is always None here when called from a data fetch. # This prevents downloading files that require authentication even if the user provided a token. 
- record_id, filename = self.parse_path(source_path) - self.repository.download_file_from_record(record_id, filename, native_path, user_context=user_context) + self.repository.download_file_from_container(record_id, filename, native_path, user_context=user_context) def _write_from( self, @@ -310,14 +340,14 @@ def upload_file_to_draft_record( response = requests.post(commit_file_upload_url, headers=headers) self._ensure_response_has_expected_status_code(response, 200) - def download_file_from_record( + def download_file_from_container( self, - record_id: str, - filename: str, + container_id: str, + file_identifier: str, file_path: str, user_context: OptionalUserContext = None, ): - download_file_content_url = self._get_download_file_url(record_id, filename, user_context) + download_file_content_url = self._get_download_file_url(container_id, file_identifier, user_context) headers = {} if self._is_api_url(download_file_content_url): # pass the token as a header only when using the API @@ -333,7 +363,7 @@ def download_file_from_record( # TODO: We can only download files from published records for now if e.code in [401, 403, 404]: raise Exception( - f"Cannot download file '{filename}' from record '{record_id}'. Please make sure the record exists and it is public." + f"Cannot download file '{file_identifier}' from record '{container_id}'. Please make sure the record exists and it is public." 
) def _get_download_file_url(self, record_id: str, filename: str, user_context: OptionalUserContext = None): From b260661edf15e44b03fca1ec4b1d7e6afeee1c91 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 4 Dec 2024 18:33:33 +0100 Subject: [PATCH 06/64] feat: download remote files from dataverse (prototype) --- lib/galaxy/files/sources/dataverse.py | 182 +++++++++++++++++++++----- 1 file changed, 148 insertions(+), 34 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 7d99447cc646..34ff15d9e2dc 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -34,6 +34,7 @@ RDMFilesSource, RDMFilesSourceProperties, RDMRepositoryInteractor, + ContainerAndFileIdentifier, ) from galaxy.util import ( DEFAULT_SOCKET_TIMEOUT, @@ -90,6 +91,45 @@ def to_relative_path(self, url: str) -> str: def get_repository_interactor(self, repository_url: str) -> RDMRepositoryInteractor: return DataverseRepositoryInteractor(repository_url, self) + + def parse_path(self, source_path: str, container_id_only: bool = False) -> ContainerAndFileIdentifier: + """Parses the given source path and returns the dataset_id(=dataverse file container id) and the file_id. + + The source path must have the format '//'. + + Example dataset_id: + doi:10.70122/FK2/DIG2DG + + Example file_id: + doi:10.70122/FK2/DIG2DG/AVNCLL + + If container_id_only is True, the source path must have the format '/' and an empty file_id will be returned. + """ + + def get_error_msg(details: str) -> str: + return f"Invalid source path: '{source_path}'. Expected format: '{expected_format}'. 
{details}" + + expected_format = "/" + if not source_path.startswith("/"): + raise ValueError(get_error_msg("Must start with '/'.")) + parts = source_path[1:].split("/", 4) + dataset_id = "/".join(parts[0:3]) + if container_id_only: + if len(parts) != 3: + raise ValueError(get_error_msg("Please provide the dataset_id only.")) + # concatenate the first 3 parts to get the dataset_id + dataset_id = "/".join(parts[0:3]) + return ContainerAndFileIdentifier(dataset_id=parts[0:3], file_identifier="") + expected_format = "//" + if len(parts) < 4: + raise ValueError(get_error_msg("Please provide both the dataset_id and file_id.")) + if len(parts) > 4: + raise ValueError(get_error_msg("Too many parts. Please provide the dataset_id and file_id only.")) + file_id = dataset_id + "/" + parts[3] + return ContainerAndFileIdentifier(container_id=dataset_id, file_identifier=file_id) + + def get_container_id_from_path(self, source_path: str) -> str: + return self.parse_path(source_path, container_id_only=True).container_id def _list( self, @@ -110,7 +150,7 @@ def _list( writeable, user_context, limit=limit, offset=offset, query=query ) return cast(List[AnyRemoteEntry], records), total_hits - record_id = self.get_record_id_from_path(path) + record_id = self._get_dataset_id_from_path(path) files = self.repository.get_files_in_record(record_id, writeable, user_context) return cast(List[AnyRemoteEntry], files), len(files) @@ -123,7 +163,6 @@ def _create_entry( # TODO: Implement this for Dataverse pass - # TODO: Test this method def _realize_to( self, source_path: str, @@ -131,11 +170,8 @@ def _realize_to( user_context: OptionalUserContext = None, opts: Optional[FilesSourceOptions] = None, ): - # TODO: user_context is always None here when called from a data fetch. - # This prevents downloading files that require authentication even if the user provided a token. 
- - record_id, filename = self.parse_path(source_path) - self.repository.download_file_from_record(record_id, filename, native_path, user_context=user_context) + record_id, file_id = self.parse_path(source_path) + self.repository.download_file_from_container(record_id, file_id, native_path, user_context=user_context) # TODO: Test this method def _write_from( @@ -145,23 +181,35 @@ def _write_from( user_context: OptionalUserContext = None, opts: Optional[FilesSourceOptions] = None, ): - record_id, filename = self.parse_path(target_path) - self.repository.upload_file_to_draft_record(record_id, filename, native_path, user_context=user_context) + record_id, file_id = self.parse_path(target_path) + self.repository.upload_file_to_draft_record(record_id, file_id, native_path, user_context=user_context) + + def _get_dataset_id_from_path(self, path: str) -> str: + # /doi:10.70122/FK2/DIG2DG => doi:10.70122/FK2/DIG2DG + return path.lstrip("/") class DataverseRepositoryInteractor(RDMRepositoryInteractor): + @property + def api_base_url(self) -> str: + return f"{self.repository_url}/api" + @property def search_url(self) -> str: - return f"{self.repository_url}/api/search" + return f"{self.api_base_url}/search" + + @property + def user_datasets_url(self) -> str: + return f"{self.repository_url}/api/user/records" + + def file_access_url(self, file_id: str) -> str: + return f"{self.api_base_url}/access/datafile/:persistentId?persistentId={file_id}" - # TODO: Implement this property for Dataverse? 
- # @property - # def user_records_url(self) -> str: - # return f"{self.repository_url}/api/user/records" + def files_of_dataset_url(self, dataset_id: str, dataset_version: str = 1.0) -> str: + return f"{self.api_base_url}/datasets/:persistentId/versions/{dataset_version}/files?persistentId={dataset_id}" - # TODO: Test this method def to_plugin_uri(self, record_id: str, filename: Optional[str] = None) -> str: - return f"{self.plugin.get_uri_root()}/{record_id}{f'/{filename}' if filename else ''}" - + return f"{self.plugin.get_uri_root()}/{f'{filename}' if filename else f'{record_id}'}" + def get_records( self, writeable: bool, @@ -171,7 +219,7 @@ def get_records( query: Optional[str] = None, sort_by: Optional[str] = None, ) -> Tuple[List[RemoteDirectory], int]: - '''In Dataverse a "dataset" is equivalent to a "record". This method lists the datasets in the repository.''' + '''In Dataverse a "dataset" is equivalent to a "record" in invenio. This method lists the dataverse datasets in the repository.''' # https://demo.dataverse.org/api/search?q=*&type=dataset&per_page=25&page=1&start=0 request_url = self.search_url params: Dict[str, Any] = {} @@ -192,8 +240,14 @@ def get_records( def get_files_in_record( self, record_id: str, writeable: bool, user_context: OptionalUserContext = None ) -> List[RemoteFile]: - # TODO: Implement this for Dataverse - pass + '''In Dataverse a "file" is a equivalent to "record" in invenio. This method lists the files in a dataverse dataset.''' + # TODO: Handle drafts? 
+ # conditionally_draft = "/draft" if writeable else "" + # request_url = f"{self.records_url}/{record_id}{conditionally_draft}/files" + request_url = self.files_of_dataset_url(dataset_id=record_id) + response_data = self._get_response(user_context, request_url) + total_hits = response_data["totalCount"] + return self._get_files_from_response(record_id, response_data["data"]) def create_draft_record( self, title: str, public_name: Optional[str] = None, user_context: OptionalUserContext = None @@ -211,23 +265,59 @@ def upload_file_to_draft_record( # TODO: Implement this for Dataverse pass - def download_file_from_record( + def download_file_from_container( self, - record_id: str, - filename: str, + container_id: str, + file_identifier: str, file_path: str, user_context: OptionalUserContext = None, ): - # TODO: Implement this for Dataverse - pass + download_file_content_url = self._get_download_file_url(container_id, file_identifier, user_context) + headers = {} + + # TODO: User auth + # if self._is_api_url(download_file_content_url): + # pass the token as a header only when using the API + # headers = self._get_request_headers(user_context) + try: + req = urllib.request.Request(download_file_content_url, headers=headers) + with urllib.request.urlopen(req, timeout=DEFAULT_SOCKET_TIMEOUT) as page: + f = open(file_path, "wb") + return stream_to_open_named_file( + page, f.fileno(), file_path, source_encoding=get_charset_from_http_headers(page.headers) + ) + except urllib.error.HTTPError as e: + # TODO: We can only download files from published records for now + if e.code in [401, 403, 404]: + raise Exception( + f"Cannot download file '{file_identifier}' from record '{container_id}'. Please make sure the record exists and it is public." + ) + + def _get_download_file_url(self, container_id: str, file_id: str, user_context: OptionalUserContext = None): + """Get the URL to download a file from a dataset(=dataverse file container). 
+ + This method is used to download files from both published and draft datasets that are accessible by the user. + """ + # TODO: Implement draft feature for Dataverse + # is_draft_record = self._is_draft_record(container_id, user_context) - def _get_download_file_url(self, record_id: str, filename: str, user_context: OptionalUserContext = None): - """Get the URL to download a file from a record. + download_file_content_url = self.file_access_url(file_id=file_id) + + # https://demo.dataverse.org/api/access/datafile/:persistentId?persistentId=doi:10.70122/FK2/DIG2DG/AVNCLL + # TODO: Implement draft feature for Dataverse + # if is_draft_record: + # file_details_url = self._to_draft_url(file_details_url) + # download_file_content_url = self._to_draft_url(download_file_content_url) + + # file_details = self._get_response(user_context, file_details_url) + # TODO: This is a temporary workaround from invenio for the fact that the "content" API + # does not support downloading files from S3 or other remote storage classes. + # We might need something like this as well for dataverse + # if not self._can_download_from_api(file_details): + # More info: https://inveniordm.docs.cern.ch/reference/file_storage/#remote-files-r + # download_file_content_url = f"{file_details_url.replace('/api', '')}?download=1" - This method is used to download files from both published and draft records that are accessible by the user. 
- """ - # TODO: Implement this for Dataverse - pass + return download_file_content_url # TODO: Test this method def _is_api_url(self, url: str) -> bool: @@ -278,14 +368,38 @@ def _get_record_title(self, record: DataverseDataset) -> str: title = record.get("name") return title or "No title" - # TODO: Implement this for Dataverse - # def _get_record_files_from_response(self, record_id: str, response: dict) -> List[RemoteFile]: - # pass + def _get_files_from_response(self, record_id: str, response: dict) -> List[RemoteFile]: + # TODO: Implement this for Dataverse + + # this is used in invenio, do we need it for dataverse? + # files_enabled = response.get("enabled", False) + # if not files_enabled: + # return [] + + rval: List[RemoteFile] = [] + for entry in response: + dataFile = entry.get("dataFile") + filename = dataFile.get("filename") + persistendId = dataFile.get("persistentId") + uri = self.to_plugin_uri(record_id=record_id, filename=persistendId) + path = self.plugin.to_relative_path(uri) + rval.append( + { + "class": "File", + "name": filename, + "size": dataFile.get("filesize"), + "ctime": dataFile.get("creationDate"), + "uri": uri, + "path": path, + } + ) + return rval # TODO: Implement this for Dataverse # def _get_creator_from_public_name(self, public_name: Optional[str] = None) -> Creator: # pass + # TODO: Test this method def _get_response( self, From 571dc07df0c50c6286f940845838a5c12c40b235 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Fri, 6 Dec 2024 12:02:44 +0100 Subject: [PATCH 07/64] chore: renaming from "record" to "container" for coherent terminology --- lib/galaxy/files/sources/_rdm.py | 48 ++++++++++++++++------------- lib/galaxy/files/sources/invenio.py | 27 ++++++++-------- 2 files changed, 41 insertions(+), 34 deletions(-) diff --git a/lib/galaxy/files/sources/_rdm.py b/lib/galaxy/files/sources/_rdm.py index f77a17911495..e2e041bc1b93 100644 --- a/lib/galaxy/files/sources/_rdm.py +++ b/lib/galaxy/files/sources/_rdm.py @@ -36,6 +36,10 @@ 
class RDMRepositoryInteractor: This class is not intended to be used directly, but rather to be subclassed by file sources that interact with RDM repositories. + + Different RDM repositories use different terminology. Also they use the same term for different things. + To prevent confusion, we use the term "container" in the base repository. + This is an abstract term for the entity that contains multiple files, i.e. what is a "Collection" in galaxy. """ def __init__(self, repository_url: str, plugin: "RDMFilesSource"): @@ -55,13 +59,13 @@ def repository_url(self) -> str: """ return self._repository_url - def to_plugin_uri(self, record_id: str, filename: Optional[str] = None) -> str: - """Creates a valid plugin URI to reference the given record_id. + def to_plugin_uri(self, container_id: str, filename: Optional[str] = None) -> str: + """Creates a valid plugin URI to reference the given container_id. - If a filename is provided, the URI will reference the specific file in the record.""" + If a filename is provided, the URI will reference the specific file in the container.""" raise NotImplementedError() - def get_records( + def get_file_containers( self, writeable: bool, user_context: OptionalUserContext = None, @@ -70,41 +74,43 @@ def get_records( query: Optional[str] = None, sort_by: Optional[str] = None, ) -> Tuple[List[RemoteDirectory], int]: - """Returns the list of records in the repository and the total count of records. + """Returns the list of file containers in the repository and the total count containers. - If writeable is True, only records that the user can write to will be returned. + If writeable is True, only containers that the user can write to will be returned. The user_context might be required to authenticate the user in the repository. 
""" raise NotImplementedError() - def get_files_in_record( - self, record_id: str, writeable: bool, user_context: OptionalUserContext = None + def get_files_in_container( + self, container_id: str, writeable: bool, user_context: OptionalUserContext = None ) -> List[RemoteFile]: - """Returns the list of files contained in the given record. + """Returns the list of files of a file container. - If writeable is True, we are signaling that the user intends to write to the record. + If writeable is True, we are signaling that the user intends to write to the container. """ raise NotImplementedError() - def create_draft_record( + def create_draft_container( + self, title: str, public_name: Optional[str] = None, user_context: OptionalUserContext = None ): - """Creates a draft record (directory) in the repository with basic metadata. + """Creates a draft container (directory) in the repository with basic metadata. - The metadata is usually just the title of the record and the user that created it. + The metadata is usually just the title of the container and the user that created it. Some plugins might also provide additional metadata defaults in the user settings.""" raise NotImplementedError() - def upload_file_to_draft_record( + def upload_file_to_draft_container( self, - record_id: str, + container_id: str, filename: str, file_path: str, user_context: OptionalUserContext = None, ) -> None: - """Uploads a file with the provided filename (from file_path) to a draft record with the given record_id. + """Uploads a file with the provided filename (from file_path) to a draft container with the given container_id. + + The draft container must have been created in advance with the `create_draft_container` method. - The draft record must have been created in advance with the `create_draft_record` method. The file must exist in the file system at the given file_path. The user_context might be required to authenticate the user in the repository. 
""" @@ -133,13 +139,11 @@ class RDMFilesSource(BaseFilesSource): by file sources that interact with RDM repositories. A RDM file source is similar to a regular file source, but instead of tree of - files and directories, it provides a (one level) list of records (representing directories) + files and directories, it provides a (one level) list of containers (representing directories) that can contain only files (no subdirectories). - In addition, RDM file sources might need to create a new record (directory) in advance in the - repository, and then upload a file to it. This is done by calling the `create_entry` - method. - + In addition, RDM file sources might need to create a new container (directory) in advance in the + repository, and then upload a file to it. This is done by calling the `_create_entry` method. """ plugin_kind = PluginKind.rdm diff --git a/lib/galaxy/files/sources/invenio.py b/lib/galaxy/files/sources/invenio.py index d58f2284a8a6..369e05ecfdbf 100644 --- a/lib/galaxy/files/sources/invenio.py +++ b/lib/galaxy/files/sources/invenio.py @@ -131,7 +131,7 @@ def __init__(self, **kwd: Unpack[RDMFilesSourceProperties]): def get_scheme(self) -> str: return "invenio" - def score_url_match(self, url: str): + def score_url_match(self, url: str) -> int: if match := self._scheme_regex.match(url): return match.span()[1] else: @@ -191,12 +191,12 @@ def _list( writeable = opts and opts.writeable or False is_root_path = path == "/" if is_root_path: - records, total_hits = self.repository.get_records( + records, total_hits = self.repository.get_file_containers( writeable, user_context, limit=limit, offset=offset, query=query ) return cast(List[AnyRemoteEntry], records), total_hits record_id = self.get_container_id_from_path(path) - files = self.repository.get_files_in_record(record_id, writeable, user_context) + files = self.repository.get_files_in_container(record_id, writeable, user_context) return cast(List[AnyRemoteEntry], files), len(files) def 
_create_entry( @@ -206,7 +206,7 @@ def _create_entry( opts: Optional[FilesSourceOptions] = None, ) -> Entry: public_name = self.get_public_name(user_context) - record = self.repository.create_draft_record(entry_data["name"], public_name, user_context=user_context) + record = self.repository.create_draft_container(entry_data["name"], public_name, user_context=user_context) return { "uri": self.repository.to_plugin_uri(record["id"]), "name": record["title"], @@ -233,7 +233,7 @@ def _write_from( opts: Optional[FilesSourceOptions] = None, ): record_id, filename = self.parse_path(target_path) - self.repository.upload_file_to_draft_record(record_id, filename, native_path, user_context=user_context) + self.repository.upload_file_to_draft_container(record_id, filename, native_path, user_context=user_context) class InvenioRepositoryInteractor(RDMRepositoryInteractor): @@ -248,7 +248,7 @@ def user_records_url(self) -> str: def to_plugin_uri(self, record_id: str, filename: Optional[str] = None) -> str: return f"{self.plugin.get_uri_root()}/{record_id}{f'/{filename}' if filename else ''}" - def get_records( + def get_file_containers( self, writeable: bool, user_context: OptionalUserContext = None, @@ -257,6 +257,9 @@ def get_records( query: Optional[str] = None, sort_by: Optional[str] = None, ) -> Tuple[List[RemoteDirectory], int]: + """Gets the records in the repository and returns the total count of records. + An Invenio "Record" is a Galaxy "Collection". 
+ """ params: Dict[str, Any] = {} request_url = self.records_url if writeable: @@ -280,15 +283,15 @@ def _to_size_page(self, limit: Optional[int], offset: Optional[int]) -> Tuple[Op page = (offset or 0) // size + 1 return size, page - def get_files_in_record( - self, record_id: str, writeable: bool, user_context: OptionalUserContext = None + def get_files_in_container( + self, container_id: str, writeable: bool, user_context: OptionalUserContext = None ) -> List[RemoteFile]: conditionally_draft = "/draft" if writeable else "" - request_url = f"{self.records_url}/{record_id}{conditionally_draft}/files" + request_url = f"{self.records_url}/{container_id}{conditionally_draft}/files" response_data = self._get_response(user_context, request_url) - return self._get_record_files_from_response(record_id, response_data) + return self._get_record_files_from_response(container_id, response_data) - def create_draft_record( + def create_draft_container( self, title: str, public_name: Optional[str] = None, user_context: OptionalUserContext = None ) -> RemoteDirectory: today = datetime.date.today().isoformat() @@ -312,7 +315,7 @@ def create_draft_record( record["title"] = self._get_record_title(record) return record - def upload_file_to_draft_record( + def upload_file_to_draft_container( self, record_id: str, filename: str, From 65f95047041be0be7fe9d7850f85bb796fb5a445 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Fri, 6 Dec 2024 12:19:46 +0100 Subject: [PATCH 08/64] chore: docstring to clarify what is a container in invenio --- lib/galaxy/files/sources/invenio.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/galaxy/files/sources/invenio.py b/lib/galaxy/files/sources/invenio.py index 369e05ecfdbf..85b9dd6ee094 100644 --- a/lib/galaxy/files/sources/invenio.py +++ b/lib/galaxy/files/sources/invenio.py @@ -257,9 +257,7 @@ def get_file_containers( query: Optional[str] = None, sort_by: Optional[str] = None, ) -> Tuple[List[RemoteDirectory], int]: - """Gets 
the records in the repository and returns the total count of records. - An Invenio "Record" is a Galaxy "Collection". - """ + """Gets the records in the repository and returns the total count of records.""" params: Dict[str, Any] = {} request_url = self.records_url if writeable: From 60afbf0d99ce8e6571487961047c07c558f53911 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Fri, 6 Dec 2024 12:21:11 +0100 Subject: [PATCH 09/64] chore: remove reference to galaxy Collection --- lib/galaxy/files/sources/_rdm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/files/sources/_rdm.py b/lib/galaxy/files/sources/_rdm.py index e2e041bc1b93..87ef411b325f 100644 --- a/lib/galaxy/files/sources/_rdm.py +++ b/lib/galaxy/files/sources/_rdm.py @@ -39,7 +39,7 @@ class RDMRepositoryInteractor: Different RDM repositories use different terminology. Also they use the same term for different things. To prevent confusion, we use the term "container" in the base repository. - This is an abstract term for the entity that contains multiple files, i.e. what is a "Collection" in galaxy. + This is an abstract term for the entity that contains multiple files. """ def __init__(self, repository_url: str, plugin: "RDMFilesSource"): From a01c848189e3d5f8fa473281b2d8dc0de531727d Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Fri, 6 Dec 2024 12:21:46 +0100 Subject: [PATCH 10/64] chore: change order of file source and repository interactor --- lib/galaxy/files/sources/_rdm.py | 134 +++++++++++++++---------------- 1 file changed, 67 insertions(+), 67 deletions(-) diff --git a/lib/galaxy/files/sources/_rdm.py b/lib/galaxy/files/sources/_rdm.py index 87ef411b325f..9d13a21007b7 100644 --- a/lib/galaxy/files/sources/_rdm.py +++ b/lib/galaxy/files/sources/_rdm.py @@ -31,6 +31,72 @@ class ContainerAndFileIdentifier(NamedTuple): file_identifier: str +class RDMFilesSource(BaseFilesSource): + """Base class for Research Data Management (RDM) file sources. 
+ + This class is not intended to be used directly, but rather to be subclassed + by file sources that interact with RDM repositories. + + A RDM file source is similar to a regular file source, but instead of tree of + files and directories, it provides a (one level) list of containers (representing directories) + that can contain only files (no subdirectories). + + In addition, RDM file sources might need to create a new container (directory) in advance in the + repository, and then upload a file to it. This is done by calling the `_create_entry` method. + """ + + plugin_kind = PluginKind.rdm + + def __init__(self, **kwd: Unpack[RDMFilesSourceProperties]): + props = self._parse_common_config_opts(kwd) + self.url = props.get("url") + if not self.url: + raise Exception("URL for RDM repository must be provided in configuration") + self._props = props + self._repository_interactor = self.get_repository_interactor(self.url) + + @property + def repository(self) -> RDMRepositoryInteractor: + return self._repository_interactor + + def get_url(self) -> Optional[str]: + return self.url + + def get_repository_interactor(self, repository_url: str) -> RDMRepositoryInteractor: + """Returns an interactor compatible with the given repository URL. + + This must be implemented by subclasses.""" + raise NotImplementedError() + + def parse_path(self, source_path: str, container_id_only: bool = False) -> ContainerAndFileIdentifier: + """Parses the given source path and returns the container_id and filename. + + If container_id_only is True, an empty filename will be returned. 
+ + This must be implemented by subclasses.""" + raise NotImplementedError() + + def get_container_id_from_path(self, source_path: str) -> str: + raise NotImplementedError() + + def _serialization_props(self, user_context: OptionalUserContext = None): + effective_props = {} + for key, val in self._props.items(): + effective_props[key] = self._evaluate_prop(val, user_context=user_context) + return effective_props + + def get_authorization_token(self, user_context: OptionalUserContext) -> Optional[str]: + token = None + if user_context: + effective_props = self._serialization_props(user_context) + token = effective_props.get("token") + return token + + def get_public_name(self, user_context: OptionalUserContext) -> Optional[str]: + effective_props = self._serialization_props(user_context) + return effective_props.get("public_name") + + class RDMRepositoryInteractor: """Base class for interacting with an external RDM repository. @@ -129,70 +195,4 @@ def download_file_from_container( The user_context might be required to authenticate the user in the repository if the file is not publicly available. """ - raise NotImplementedError() - - -class RDMFilesSource(BaseFilesSource): - """Base class for Research Data Management (RDM) file sources. - - This class is not intended to be used directly, but rather to be subclassed - by file sources that interact with RDM repositories. - - A RDM file source is similar to a regular file source, but instead of tree of - files and directories, it provides a (one level) list of containers (representing directories) - that can contain only files (no subdirectories). - - In addition, RDM file sources might need to create a new container (directory) in advance in the - repository, and then upload a file to it. This is done by calling the `_create_entry` method. 
- """ - - plugin_kind = PluginKind.rdm - - def __init__(self, **kwd: Unpack[RDMFilesSourceProperties]): - props = self._parse_common_config_opts(kwd) - self.url = props.get("url") - if not self.url: - raise Exception("URL for RDM repository must be provided in configuration") - self._props = props - self._repository_interactor = self.get_repository_interactor(self.url) - - @property - def repository(self) -> RDMRepositoryInteractor: - return self._repository_interactor - - def get_url(self) -> Optional[str]: - return self.url - - def get_repository_interactor(self, repository_url: str) -> RDMRepositoryInteractor: - """Returns an interactor compatible with the given repository URL. - - This must be implemented by subclasses.""" - raise NotImplementedError() - - def parse_path(self, source_path: str, container_id_only: bool = False) -> ContainerAndFileIdentifier: - """Parses the given source path and returns the container_id and filename. - - If container_id_only is True, an empty filename will be returned. 
- - This must be implemented by subclasses.""" - raise NotImplementedError() - - def get_container_id_from_path(self, source_path: str) -> str: - raise NotImplementedError() - - def _serialization_props(self, user_context: OptionalUserContext = None): - effective_props = {} - for key, val in self._props.items(): - effective_props[key] = self._evaluate_prop(val, user_context=user_context) - return effective_props - - def get_authorization_token(self, user_context: OptionalUserContext) -> Optional[str]: - token = None - if user_context: - effective_props = self._serialization_props(user_context) - token = effective_props.get("token") - return token - - def get_public_name(self, user_context: OptionalUserContext) -> Optional[str]: - effective_props = self._serialization_props(user_context) - return effective_props.get("public_name") + raise NotImplementedError() \ No newline at end of file From 63105a5c3137e58c0ff980d813e8a3d1b5bc7ca0 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Fri, 6 Dec 2024 12:23:05 +0100 Subject: [PATCH 11/64] Revert "chore: change order of file source and repository interactor" This reverts commit c5ec249c93b74e41cccdc5d133e957b3f1b1e6f3. --- lib/galaxy/files/sources/_rdm.py | 134 +++++++++++++++---------------- 1 file changed, 67 insertions(+), 67 deletions(-) diff --git a/lib/galaxy/files/sources/_rdm.py b/lib/galaxy/files/sources/_rdm.py index 9d13a21007b7..87ef411b325f 100644 --- a/lib/galaxy/files/sources/_rdm.py +++ b/lib/galaxy/files/sources/_rdm.py @@ -31,72 +31,6 @@ class ContainerAndFileIdentifier(NamedTuple): file_identifier: str -class RDMFilesSource(BaseFilesSource): - """Base class for Research Data Management (RDM) file sources. - - This class is not intended to be used directly, but rather to be subclassed - by file sources that interact with RDM repositories. 
- - A RDM file source is similar to a regular file source, but instead of tree of - files and directories, it provides a (one level) list of containers (representing directories) - that can contain only files (no subdirectories). - - In addition, RDM file sources might need to create a new container (directory) in advance in the - repository, and then upload a file to it. This is done by calling the `_create_entry` method. - """ - - plugin_kind = PluginKind.rdm - - def __init__(self, **kwd: Unpack[RDMFilesSourceProperties]): - props = self._parse_common_config_opts(kwd) - self.url = props.get("url") - if not self.url: - raise Exception("URL for RDM repository must be provided in configuration") - self._props = props - self._repository_interactor = self.get_repository_interactor(self.url) - - @property - def repository(self) -> RDMRepositoryInteractor: - return self._repository_interactor - - def get_url(self) -> Optional[str]: - return self.url - - def get_repository_interactor(self, repository_url: str) -> RDMRepositoryInteractor: - """Returns an interactor compatible with the given repository URL. - - This must be implemented by subclasses.""" - raise NotImplementedError() - - def parse_path(self, source_path: str, container_id_only: bool = False) -> ContainerAndFileIdentifier: - """Parses the given source path and returns the container_id and filename. - - If container_id_only is True, an empty filename will be returned. 
- - This must be implemented by subclasses.""" - raise NotImplementedError() - - def get_container_id_from_path(self, source_path: str) -> str: - raise NotImplementedError() - - def _serialization_props(self, user_context: OptionalUserContext = None): - effective_props = {} - for key, val in self._props.items(): - effective_props[key] = self._evaluate_prop(val, user_context=user_context) - return effective_props - - def get_authorization_token(self, user_context: OptionalUserContext) -> Optional[str]: - token = None - if user_context: - effective_props = self._serialization_props(user_context) - token = effective_props.get("token") - return token - - def get_public_name(self, user_context: OptionalUserContext) -> Optional[str]: - effective_props = self._serialization_props(user_context) - return effective_props.get("public_name") - - class RDMRepositoryInteractor: """Base class for interacting with an external RDM repository. @@ -195,4 +129,70 @@ def download_file_from_container( The user_context might be required to authenticate the user in the repository if the file is not publicly available. """ - raise NotImplementedError() \ No newline at end of file + raise NotImplementedError() + + +class RDMFilesSource(BaseFilesSource): + """Base class for Research Data Management (RDM) file sources. + + This class is not intended to be used directly, but rather to be subclassed + by file sources that interact with RDM repositories. + + A RDM file source is similar to a regular file source, but instead of tree of + files and directories, it provides a (one level) list of containers (representing directories) + that can contain only files (no subdirectories). + + In addition, RDM file sources might need to create a new container (directory) in advance in the + repository, and then upload a file to it. This is done by calling the `_create_entry` method. 
+ """ + + plugin_kind = PluginKind.rdm + + def __init__(self, **kwd: Unpack[RDMFilesSourceProperties]): + props = self._parse_common_config_opts(kwd) + self.url = props.get("url") + if not self.url: + raise Exception("URL for RDM repository must be provided in configuration") + self._props = props + self._repository_interactor = self.get_repository_interactor(self.url) + + @property + def repository(self) -> RDMRepositoryInteractor: + return self._repository_interactor + + def get_url(self) -> Optional[str]: + return self.url + + def get_repository_interactor(self, repository_url: str) -> RDMRepositoryInteractor: + """Returns an interactor compatible with the given repository URL. + + This must be implemented by subclasses.""" + raise NotImplementedError() + + def parse_path(self, source_path: str, container_id_only: bool = False) -> ContainerAndFileIdentifier: + """Parses the given source path and returns the container_id and filename. + + If container_id_only is True, an empty filename will be returned. 
+ + This must be implemented by subclasses.""" + raise NotImplementedError() + + def get_container_id_from_path(self, source_path: str) -> str: + raise NotImplementedError() + + def _serialization_props(self, user_context: OptionalUserContext = None): + effective_props = {} + for key, val in self._props.items(): + effective_props[key] = self._evaluate_prop(val, user_context=user_context) + return effective_props + + def get_authorization_token(self, user_context: OptionalUserContext) -> Optional[str]: + token = None + if user_context: + effective_props = self._serialization_props(user_context) + token = effective_props.get("token") + return token + + def get_public_name(self, user_context: OptionalUserContext) -> Optional[str]: + effective_props = self._serialization_props(user_context) + return effective_props.get("public_name") From 93393e72c26eceb417b6725646da6727a6a9cf11 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Fri, 6 Dec 2024 12:25:25 +0100 Subject: [PATCH 12/64] chore: change order of repository interactor and file source in invenio --- lib/galaxy/files/sources/invenio.py | 238 ++++++++++++++-------------- 1 file changed, 119 insertions(+), 119 deletions(-) diff --git a/lib/galaxy/files/sources/invenio.py b/lib/galaxy/files/sources/invenio.py index 85b9dd6ee094..e67e114145a6 100644 --- a/lib/galaxy/files/sources/invenio.py +++ b/lib/galaxy/files/sources/invenio.py @@ -117,125 +117,6 @@ class InvenioRecord(TypedDict): links: RecordLinks -class InvenioRDMFilesSource(RDMFilesSource): - """A files source for Invenio turn-key research data management repository.""" - - plugin_type = "inveniordm" - supports_pagination = True - supports_search = True - - def __init__(self, **kwd: Unpack[RDMFilesSourceProperties]): - super().__init__(**kwd) - self._scheme_regex = re.compile(rf"^{self.get_scheme()}?://{self.id}|^{DEFAULT_SCHEME}://{self.id}") - - def get_scheme(self) -> str: - return "invenio" - - def score_url_match(self, url: str) -> int: - if match := 
self._scheme_regex.match(url): - return match.span()[1] - else: - return 0 - - def to_relative_path(self, url: str) -> str: - legacy_uri_root = f"{DEFAULT_SCHEME}://{self.id}" - if url.startswith(legacy_uri_root): - return url[len(legacy_uri_root) :] - else: - return super().to_relative_path(url) - - def get_repository_interactor(self, repository_url: str) -> RDMRepositoryInteractor: - return InvenioRepositoryInteractor(repository_url, self) - - def parse_path(self, source_path: str, container_id_only: bool = False) -> ContainerAndFileIdentifier: - """Parses the given source path and returns the record_id and filename. - - The source path must have the format '//'. - If container_id_only is True, the source path must have the format '/' and and an empty filename will be returned. - """ - - def get_error_msg(details: str) -> str: - return f"Invalid source path: '{source_path}'. Expected format: '{expected_format}'. {details}" - - expected_format = "/" - if not source_path.startswith("/"): - raise ValueError(get_error_msg("Must start with '/'.")) - parts = source_path[1:].split("/", 2) - if container_id_only: - if len(parts) != 1: - raise ValueError(get_error_msg("Please provide the record_id only.")) - return ContainerAndFileIdentifier(container_id=parts[0], file_identifier="") - expected_format = "//" - if len(parts) < 2: - raise ValueError(get_error_msg("Please provide both the record_id and file_name.")) - if len(parts) > 2: - # TODO: This causes downloads to crash if the filename contains a slash - raise ValueError(get_error_msg("Too many parts. 
Please provide the record_id and file_name only.")) - record_id, file_name = parts - return ContainerAndFileIdentifier(container_id=record_id, file_identifier=file_name) - - def get_container_id_from_path(self, source_path: str) -> str: - return self.parse_path(source_path, container_id_only=True).container_id - - def _list( - self, - path="/", - recursive=True, - user_context: OptionalUserContext = None, - opts: Optional[FilesSourceOptions] = None, - limit: Optional[int] = None, - offset: Optional[int] = None, - query: Optional[str] = None, - sort_by: Optional[str] = None, - ) -> Tuple[List[AnyRemoteEntry], int]: - writeable = opts and opts.writeable or False - is_root_path = path == "/" - if is_root_path: - records, total_hits = self.repository.get_file_containers( - writeable, user_context, limit=limit, offset=offset, query=query - ) - return cast(List[AnyRemoteEntry], records), total_hits - record_id = self.get_container_id_from_path(path) - files = self.repository.get_files_in_container(record_id, writeable, user_context) - return cast(List[AnyRemoteEntry], files), len(files) - - def _create_entry( - self, - entry_data: EntryData, - user_context: OptionalUserContext = None, - opts: Optional[FilesSourceOptions] = None, - ) -> Entry: - public_name = self.get_public_name(user_context) - record = self.repository.create_draft_container(entry_data["name"], public_name, user_context=user_context) - return { - "uri": self.repository.to_plugin_uri(record["id"]), - "name": record["title"], - "external_link": record["links"]["self_html"], - } - - def _realize_to( - self, - source_path: str, - native_path: str, - user_context: OptionalUserContext = None, - opts: Optional[FilesSourceOptions] = None, - ): - # TODO: user_context is always None here when called from a data fetch. - # This prevents downloading files that require authentication even if the user provided a token. 
- record_id, filename = self.parse_path(source_path) - self.repository.download_file_from_container(record_id, filename, native_path, user_context=user_context) - - def _write_from( - self, - target_path: str, - native_path: str, - user_context: OptionalUserContext = None, - opts: Optional[FilesSourceOptions] = None, - ): - record_id, filename = self.parse_path(target_path) - self.repository.upload_file_to_draft_container(record_id, filename, native_path, user_context=user_context) - - class InvenioRepositoryInteractor(RDMRepositoryInteractor): @property def records_url(self) -> str: @@ -517,6 +398,125 @@ def _get_response_error_message(self, response): for error in errors: error_message += f"\n{json.dumps(error)}" return error_message + + +class InvenioRDMFilesSource(RDMFilesSource): + """A files source for Invenio turn-key research data management repository.""" + + plugin_type = "inveniordm" + supports_pagination = True + supports_search = True + + def __init__(self, **kwd: Unpack[RDMFilesSourceProperties]): + super().__init__(**kwd) + self._scheme_regex = re.compile(rf"^{self.get_scheme()}?://{self.id}|^{DEFAULT_SCHEME}://{self.id}") + + def get_scheme(self) -> str: + return "invenio" + + def score_url_match(self, url: str) -> int: + if match := self._scheme_regex.match(url): + return match.span()[1] + else: + return 0 + + def to_relative_path(self, url: str) -> str: + legacy_uri_root = f"{DEFAULT_SCHEME}://{self.id}" + if url.startswith(legacy_uri_root): + return url[len(legacy_uri_root) :] + else: + return super().to_relative_path(url) + + def get_repository_interactor(self, repository_url: str) -> RDMRepositoryInteractor: + return InvenioRepositoryInteractor(repository_url, self) + + def parse_path(self, source_path: str, container_id_only: bool = False) -> ContainerAndFileIdentifier: + """Parses the given source path and returns the record_id and filename. + + The source path must have the format '//'. 
+ If container_id_only is True, the source path must have the format '/' and and an empty filename will be returned. + """ + + def get_error_msg(details: str) -> str: + return f"Invalid source path: '{source_path}'. Expected format: '{expected_format}'. {details}" + + expected_format = "/" + if not source_path.startswith("/"): + raise ValueError(get_error_msg("Must start with '/'.")) + parts = source_path[1:].split("/", 2) + if container_id_only: + if len(parts) != 1: + raise ValueError(get_error_msg("Please provide the record_id only.")) + return ContainerAndFileIdentifier(container_id=parts[0], file_identifier="") + expected_format = "//" + if len(parts) < 2: + raise ValueError(get_error_msg("Please provide both the record_id and file_name.")) + if len(parts) > 2: + # TODO: This causes downloads to crash if the filename contains a slash + raise ValueError(get_error_msg("Too many parts. Please provide the record_id and file_name only.")) + record_id, file_name = parts + return ContainerAndFileIdentifier(container_id=record_id, file_identifier=file_name) + + def get_container_id_from_path(self, source_path: str) -> str: + return self.parse_path(source_path, container_id_only=True).container_id + + def _list( + self, + path="/", + recursive=True, + user_context: OptionalUserContext = None, + opts: Optional[FilesSourceOptions] = None, + limit: Optional[int] = None, + offset: Optional[int] = None, + query: Optional[str] = None, + sort_by: Optional[str] = None, + ) -> Tuple[List[AnyRemoteEntry], int]: + writeable = opts and opts.writeable or False + is_root_path = path == "/" + if is_root_path: + records, total_hits = self.repository.get_file_containers( + writeable, user_context, limit=limit, offset=offset, query=query + ) + return cast(List[AnyRemoteEntry], records), total_hits + record_id = self.get_container_id_from_path(path) + files = self.repository.get_files_in_container(record_id, writeable, user_context) + return cast(List[AnyRemoteEntry], files), len(files) 
+ + def _create_entry( + self, + entry_data: EntryData, + user_context: OptionalUserContext = None, + opts: Optional[FilesSourceOptions] = None, + ) -> Entry: + public_name = self.get_public_name(user_context) + record = self.repository.create_draft_container(entry_data["name"], public_name, user_context=user_context) + return { + "uri": self.repository.to_plugin_uri(record["id"]), + "name": record["title"], + "external_link": record["links"]["self_html"], + } + + def _realize_to( + self, + source_path: str, + native_path: str, + user_context: OptionalUserContext = None, + opts: Optional[FilesSourceOptions] = None, + ): + # TODO: user_context is always None here when called from a data fetch. + # This prevents downloading files that require authentication even if the user provided a token. + record_id, filename = self.parse_path(source_path) + self.repository.download_file_from_container(record_id, filename, native_path, user_context=user_context) + + def _write_from( + self, + target_path: str, + native_path: str, + user_context: OptionalUserContext = None, + opts: Optional[FilesSourceOptions] = None, + ): + record_id, filename = self.parse_path(target_path) + self.repository.upload_file_to_draft_container(record_id, filename, native_path, user_context=user_context) __all__ = ("InvenioRDMFilesSource",) From 975c329e7e45a2520c48d1f89c666e24746f043f Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Fri, 6 Dec 2024 12:32:55 +0100 Subject: [PATCH 13/64] chore: explain container in invenio docstrings --- lib/galaxy/files/sources/invenio.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/galaxy/files/sources/invenio.py b/lib/galaxy/files/sources/invenio.py index e67e114145a6..016f9ed97d4c 100644 --- a/lib/galaxy/files/sources/invenio.py +++ b/lib/galaxy/files/sources/invenio.py @@ -118,6 +118,7 @@ class InvenioRecord(TypedDict): class InvenioRepositoryInteractor(RDMRepositoryInteractor): + """In Invenio a "Record" represents what we refer to as 
container in the rdm base class""" @property def records_url(self) -> str: return f"{self.repository_url}/api/records" @@ -401,7 +402,10 @@ def _get_response_error_message(self, response): class InvenioRDMFilesSource(RDMFilesSource): - """A files source for Invenio turn-key research data management repository.""" + """A files source for Invenio turn-key research data management repository. + + In Invenio a "Record" represents what we refer to as container in the rdm base class" + """ plugin_type = "inveniordm" supports_pagination = True From 50894b5c86885c71e1f3c4aad4cd79a91b1f5c72 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Fri, 6 Dec 2024 12:34:43 +0100 Subject: [PATCH 14/64] Revert "chore: explain container in invenio docstrings" This reverts commit a3018e37642ccef7b86d8dc6d572aa666578012d. --- lib/galaxy/files/sources/invenio.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/lib/galaxy/files/sources/invenio.py b/lib/galaxy/files/sources/invenio.py index 016f9ed97d4c..e67e114145a6 100644 --- a/lib/galaxy/files/sources/invenio.py +++ b/lib/galaxy/files/sources/invenio.py @@ -118,7 +118,6 @@ class InvenioRecord(TypedDict): class InvenioRepositoryInteractor(RDMRepositoryInteractor): - """In Invenio a "Record" represents what we refer to as container in the rdm base class""" @property def records_url(self) -> str: return f"{self.repository_url}/api/records" @@ -402,10 +401,7 @@ def _get_response_error_message(self, response): class InvenioRDMFilesSource(RDMFilesSource): - """A files source for Invenio turn-key research data management repository. 
- - In Invenio a "Record" represents what we refer to as container in the rdm base class" - """ + """A files source for Invenio turn-key research data management repository.""" plugin_type = "inveniordm" supports_pagination = True From 47f0dbe28d6333dafde53a2a57431ab26ed506de Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Fri, 6 Dec 2024 12:34:52 +0100 Subject: [PATCH 15/64] Revert "chore: change order of repository interactor and file source in invenio" This reverts commit 5d009700dab18f50366a5288616a1664a3ef55ba. --- lib/galaxy/files/sources/invenio.py | 238 ++++++++++++++-------------- 1 file changed, 119 insertions(+), 119 deletions(-) diff --git a/lib/galaxy/files/sources/invenio.py b/lib/galaxy/files/sources/invenio.py index e67e114145a6..85b9dd6ee094 100644 --- a/lib/galaxy/files/sources/invenio.py +++ b/lib/galaxy/files/sources/invenio.py @@ -117,6 +117,125 @@ class InvenioRecord(TypedDict): links: RecordLinks +class InvenioRDMFilesSource(RDMFilesSource): + """A files source for Invenio turn-key research data management repository.""" + + plugin_type = "inveniordm" + supports_pagination = True + supports_search = True + + def __init__(self, **kwd: Unpack[RDMFilesSourceProperties]): + super().__init__(**kwd) + self._scheme_regex = re.compile(rf"^{self.get_scheme()}?://{self.id}|^{DEFAULT_SCHEME}://{self.id}") + + def get_scheme(self) -> str: + return "invenio" + + def score_url_match(self, url: str) -> int: + if match := self._scheme_regex.match(url): + return match.span()[1] + else: + return 0 + + def to_relative_path(self, url: str) -> str: + legacy_uri_root = f"{DEFAULT_SCHEME}://{self.id}" + if url.startswith(legacy_uri_root): + return url[len(legacy_uri_root) :] + else: + return super().to_relative_path(url) + + def get_repository_interactor(self, repository_url: str) -> RDMRepositoryInteractor: + return InvenioRepositoryInteractor(repository_url, self) + + def parse_path(self, source_path: str, container_id_only: bool = False) -> 
ContainerAndFileIdentifier: + """Parses the given source path and returns the record_id and filename. + + The source path must have the format '//'. + If container_id_only is True, the source path must have the format '/' and and an empty filename will be returned. + """ + + def get_error_msg(details: str) -> str: + return f"Invalid source path: '{source_path}'. Expected format: '{expected_format}'. {details}" + + expected_format = "/" + if not source_path.startswith("/"): + raise ValueError(get_error_msg("Must start with '/'.")) + parts = source_path[1:].split("/", 2) + if container_id_only: + if len(parts) != 1: + raise ValueError(get_error_msg("Please provide the record_id only.")) + return ContainerAndFileIdentifier(container_id=parts[0], file_identifier="") + expected_format = "//" + if len(parts) < 2: + raise ValueError(get_error_msg("Please provide both the record_id and file_name.")) + if len(parts) > 2: + # TODO: This causes downloads to crash if the filename contains a slash + raise ValueError(get_error_msg("Too many parts. 
Please provide the record_id and file_name only.")) + record_id, file_name = parts + return ContainerAndFileIdentifier(container_id=record_id, file_identifier=file_name) + + def get_container_id_from_path(self, source_path: str) -> str: + return self.parse_path(source_path, container_id_only=True).container_id + + def _list( + self, + path="/", + recursive=True, + user_context: OptionalUserContext = None, + opts: Optional[FilesSourceOptions] = None, + limit: Optional[int] = None, + offset: Optional[int] = None, + query: Optional[str] = None, + sort_by: Optional[str] = None, + ) -> Tuple[List[AnyRemoteEntry], int]: + writeable = opts and opts.writeable or False + is_root_path = path == "/" + if is_root_path: + records, total_hits = self.repository.get_file_containers( + writeable, user_context, limit=limit, offset=offset, query=query + ) + return cast(List[AnyRemoteEntry], records), total_hits + record_id = self.get_container_id_from_path(path) + files = self.repository.get_files_in_container(record_id, writeable, user_context) + return cast(List[AnyRemoteEntry], files), len(files) + + def _create_entry( + self, + entry_data: EntryData, + user_context: OptionalUserContext = None, + opts: Optional[FilesSourceOptions] = None, + ) -> Entry: + public_name = self.get_public_name(user_context) + record = self.repository.create_draft_container(entry_data["name"], public_name, user_context=user_context) + return { + "uri": self.repository.to_plugin_uri(record["id"]), + "name": record["title"], + "external_link": record["links"]["self_html"], + } + + def _realize_to( + self, + source_path: str, + native_path: str, + user_context: OptionalUserContext = None, + opts: Optional[FilesSourceOptions] = None, + ): + # TODO: user_context is always None here when called from a data fetch. + # This prevents downloading files that require authentication even if the user provided a token. 
+ record_id, filename = self.parse_path(source_path) + self.repository.download_file_from_container(record_id, filename, native_path, user_context=user_context) + + def _write_from( + self, + target_path: str, + native_path: str, + user_context: OptionalUserContext = None, + opts: Optional[FilesSourceOptions] = None, + ): + record_id, filename = self.parse_path(target_path) + self.repository.upload_file_to_draft_container(record_id, filename, native_path, user_context=user_context) + + class InvenioRepositoryInteractor(RDMRepositoryInteractor): @property def records_url(self) -> str: @@ -398,125 +517,6 @@ def _get_response_error_message(self, response): for error in errors: error_message += f"\n{json.dumps(error)}" return error_message - - -class InvenioRDMFilesSource(RDMFilesSource): - """A files source for Invenio turn-key research data management repository.""" - - plugin_type = "inveniordm" - supports_pagination = True - supports_search = True - - def __init__(self, **kwd: Unpack[RDMFilesSourceProperties]): - super().__init__(**kwd) - self._scheme_regex = re.compile(rf"^{self.get_scheme()}?://{self.id}|^{DEFAULT_SCHEME}://{self.id}") - - def get_scheme(self) -> str: - return "invenio" - - def score_url_match(self, url: str) -> int: - if match := self._scheme_regex.match(url): - return match.span()[1] - else: - return 0 - - def to_relative_path(self, url: str) -> str: - legacy_uri_root = f"{DEFAULT_SCHEME}://{self.id}" - if url.startswith(legacy_uri_root): - return url[len(legacy_uri_root) :] - else: - return super().to_relative_path(url) - - def get_repository_interactor(self, repository_url: str) -> RDMRepositoryInteractor: - return InvenioRepositoryInteractor(repository_url, self) - - def parse_path(self, source_path: str, container_id_only: bool = False) -> ContainerAndFileIdentifier: - """Parses the given source path and returns the record_id and filename. - - The source path must have the format '//'. 
- If container_id_only is True, the source path must have the format '/' and and an empty filename will be returned. - """ - - def get_error_msg(details: str) -> str: - return f"Invalid source path: '{source_path}'. Expected format: '{expected_format}'. {details}" - - expected_format = "/" - if not source_path.startswith("/"): - raise ValueError(get_error_msg("Must start with '/'.")) - parts = source_path[1:].split("/", 2) - if container_id_only: - if len(parts) != 1: - raise ValueError(get_error_msg("Please provide the record_id only.")) - return ContainerAndFileIdentifier(container_id=parts[0], file_identifier="") - expected_format = "//" - if len(parts) < 2: - raise ValueError(get_error_msg("Please provide both the record_id and file_name.")) - if len(parts) > 2: - # TODO: This causes downloads to crash if the filename contains a slash - raise ValueError(get_error_msg("Too many parts. Please provide the record_id and file_name only.")) - record_id, file_name = parts - return ContainerAndFileIdentifier(container_id=record_id, file_identifier=file_name) - - def get_container_id_from_path(self, source_path: str) -> str: - return self.parse_path(source_path, container_id_only=True).container_id - - def _list( - self, - path="/", - recursive=True, - user_context: OptionalUserContext = None, - opts: Optional[FilesSourceOptions] = None, - limit: Optional[int] = None, - offset: Optional[int] = None, - query: Optional[str] = None, - sort_by: Optional[str] = None, - ) -> Tuple[List[AnyRemoteEntry], int]: - writeable = opts and opts.writeable or False - is_root_path = path == "/" - if is_root_path: - records, total_hits = self.repository.get_file_containers( - writeable, user_context, limit=limit, offset=offset, query=query - ) - return cast(List[AnyRemoteEntry], records), total_hits - record_id = self.get_container_id_from_path(path) - files = self.repository.get_files_in_container(record_id, writeable, user_context) - return cast(List[AnyRemoteEntry], files), len(files) 
- - def _create_entry( - self, - entry_data: EntryData, - user_context: OptionalUserContext = None, - opts: Optional[FilesSourceOptions] = None, - ) -> Entry: - public_name = self.get_public_name(user_context) - record = self.repository.create_draft_container(entry_data["name"], public_name, user_context=user_context) - return { - "uri": self.repository.to_plugin_uri(record["id"]), - "name": record["title"], - "external_link": record["links"]["self_html"], - } - - def _realize_to( - self, - source_path: str, - native_path: str, - user_context: OptionalUserContext = None, - opts: Optional[FilesSourceOptions] = None, - ): - # TODO: user_context is always None here when called from a data fetch. - # This prevents downloading files that require authentication even if the user provided a token. - record_id, filename = self.parse_path(source_path) - self.repository.download_file_from_container(record_id, filename, native_path, user_context=user_context) - - def _write_from( - self, - target_path: str, - native_path: str, - user_context: OptionalUserContext = None, - opts: Optional[FilesSourceOptions] = None, - ): - record_id, filename = self.parse_path(target_path) - self.repository.upload_file_to_draft_container(record_id, filename, native_path, user_context=user_context) __all__ = ("InvenioRDMFilesSource",) From bc9d6715bc0c3b9bf25938d2992b11aea4cb3e98 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Fri, 6 Dec 2024 12:36:41 +0100 Subject: [PATCH 16/64] chore: explain container in invenio docstrings --- lib/galaxy/files/sources/invenio.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lib/galaxy/files/sources/invenio.py b/lib/galaxy/files/sources/invenio.py index 85b9dd6ee094..8e85377c4dc7 100644 --- a/lib/galaxy/files/sources/invenio.py +++ b/lib/galaxy/files/sources/invenio.py @@ -118,7 +118,10 @@ class InvenioRecord(TypedDict): class InvenioRDMFilesSource(RDMFilesSource): - """A files source for Invenio turn-key research data management 
repository.""" + """A files source for Invenio turn-key research data management repository. + + In Invenio a "Record" represents what we refer to as container in the rdm base class + """ plugin_type = "inveniordm" supports_pagination = True @@ -237,6 +240,8 @@ def _write_from( class InvenioRepositoryInteractor(RDMRepositoryInteractor): + """In Invenio a "Record" represents what we refer to as container in the rdm base class""" + @property def records_url(self) -> str: return f"{self.repository_url}/api/records" From 6473711297aead6a7724e77c9c28ca100f2a9412 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Fri, 6 Dec 2024 12:44:07 +0100 Subject: [PATCH 17/64] chore: dataset refactoring and renaming to container term --- lib/galaxy/files/sources/dataverse.py | 110 +++++++++++++------------- 1 file changed, 56 insertions(+), 54 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 34ff15d9e2dc..e3dc4810aa09 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -61,7 +61,10 @@ class DataverseDataset(TypedDict): AccessStatus = Literal["public", "restricted"] class DataverseRDMFilesSource(RDMFilesSource): - """A files source for Dataverse turn-key research data management repository.""" + """A files source for Dataverse turn-key research data management repository. 
+ + In Dataverse a "Dataset" represents what we refer to as container in the rdm base class + """ plugin_type = "dataverse" supports_pagination = True @@ -74,14 +77,14 @@ def __init__(self, **kwd: Unpack[RDMFilesSourceProperties]): def get_scheme(self) -> str: return "dataverse" - # TODO: Test this method - def score_url_match(self, url: str): - if match := self._scheme_regex.match(url): - return match.span()[1] - else: - return 0 - - # TODO: Test this method + # TODO: Maybe we dont need this + # def score_url_match(self, url: str) -> + # if match := self._scheme_regex.match(url): + # return match.span()[1] + # else: + # return 0 + + # TODO: Test this method (maybe we dont need it) def to_relative_path(self, url: str) -> str: legacy_uri_root = f"{DEFAULT_SCHEME}://{self.id}" if url.startswith(legacy_uri_root): @@ -93,19 +96,17 @@ def get_repository_interactor(self, repository_url: str) -> RDMRepositoryInterac return DataverseRepositoryInteractor(repository_url, self) def parse_path(self, source_path: str, container_id_only: bool = False) -> ContainerAndFileIdentifier: - """Parses the given source path and returns the dataset_id(=dataverse file container id) and the file_id. + """Parses the given source path and returns the dataset_id and the file_id. The source path must have the format '//'. + If dataset_id_only is True, the source path must have the format '/' and an empty file_id will be returned. Example dataset_id: doi:10.70122/FK2/DIG2DG Example file_id: doi:10.70122/FK2/DIG2DG/AVNCLL - - If container_id_only is True, the source path must have the format '/' and an empty file_id will be returned. """ - def get_error_msg(details: str) -> str: return f"Invalid source path: '{source_path}'. Expected format: '{expected_format}'. 
{details}" @@ -117,7 +118,6 @@ def get_error_msg(details: str) -> str: if container_id_only: if len(parts) != 3: raise ValueError(get_error_msg("Please provide the dataset_id only.")) - # concatenate the first 3 parts to get the dataset_id dataset_id = "/".join(parts[0:3]) return ContainerAndFileIdentifier(dataset_id=parts[0:3], file_identifier="") expected_format = "//" @@ -142,16 +142,16 @@ def _list( query: Optional[str] = None, sort_by: Optional[str] = None, ) -> Tuple[List[AnyRemoteEntry], int]: - '''In Dataverse a "dataset" is equivalent to a "record". This method lists the datasets in the repository.''' + """This method lists the files in the Dataverse Dataset.""" writeable = opts and opts.writeable or False is_root_path = path == "/" if is_root_path: - records, total_hits = self.repository.get_records( + datasets, total_hits = self.repository.get_file_containers( writeable, user_context, limit=limit, offset=offset, query=query ) - return cast(List[AnyRemoteEntry], records), total_hits - record_id = self._get_dataset_id_from_path(path) - files = self.repository.get_files_in_record(record_id, writeable, user_context) + return cast(List[AnyRemoteEntry], datasets), total_hits + dataset_id = self._get_dataset_id_from_path(path) + files = self.repository.get_files_in_container(dataset_id, writeable, user_context) return cast(List[AnyRemoteEntry], files), len(files) def _create_entry( @@ -170,8 +170,8 @@ def _realize_to( user_context: OptionalUserContext = None, opts: Optional[FilesSourceOptions] = None, ): - record_id, file_id = self.parse_path(source_path) - self.repository.download_file_from_container(record_id, file_id, native_path, user_context=user_context) + dataset_id, file_id = self.parse_path(source_path) + self.repository.download_file_from_container(dataset_id, file_id, native_path, user_context=user_context) # TODO: Test this method def _write_from( @@ -181,14 +181,16 @@ def _write_from( user_context: OptionalUserContext = None, opts: 
Optional[FilesSourceOptions] = None, ): - record_id, file_id = self.parse_path(target_path) - self.repository.upload_file_to_draft_record(record_id, file_id, native_path, user_context=user_context) + dataset_id, file_id = self.parse_path(target_path) + self.repository.upload_file_to_draft_dataset(dataset_id, file_id, native_path, user_context=user_context) def _get_dataset_id_from_path(self, path: str) -> str: # /doi:10.70122/FK2/DIG2DG => doi:10.70122/FK2/DIG2DG return path.lstrip("/") class DataverseRepositoryInteractor(RDMRepositoryInteractor): + """In Dataverse a "Dataset" represents what we refer to as container in the rdm base class""" + @property def api_base_url(self) -> str: return f"{self.repository_url}/api" @@ -199,7 +201,9 @@ def search_url(self) -> str: @property def user_datasets_url(self) -> str: - return f"{self.repository_url}/api/user/records" + # TODO fix + # return f"{self.repository_url}/api/user/records" + pass def file_access_url(self, file_id: str) -> str: return f"{self.api_base_url}/access/datafile/:persistentId?persistentId={file_id}" @@ -207,10 +211,10 @@ def file_access_url(self, file_id: str) -> str: def files_of_dataset_url(self, dataset_id: str, dataset_version: str = 1.0) -> str: return f"{self.api_base_url}/datasets/:persistentId/versions/{dataset_version}/files?persistentId={dataset_id}" - def to_plugin_uri(self, record_id: str, filename: Optional[str] = None) -> str: - return f"{self.plugin.get_uri_root()}/{f'{filename}' if filename else f'{record_id}'}" + def to_plugin_uri(self, dataset_id: str, file_identifier: Optional[str] = None) -> str: + return f"{self.plugin.get_uri_root()}/{f'{file_identifier}' if file_identifier else f'{dataset_id}'}" - def get_records( + def get_file_containers( self, writeable: bool, user_context: OptionalUserContext = None, @@ -219,8 +223,7 @@ def get_records( query: Optional[str] = None, sort_by: Optional[str] = None, ) -> Tuple[List[RemoteDirectory], int]: - '''In Dataverse a "dataset" is 
equivalent to a "record" in invenio. This method lists the dataverse datasets in the repository.''' - # https://demo.dataverse.org/api/search?q=*&type=dataset&per_page=25&page=1&start=0 + """Lists the Dataverse datasets in the repository.""" request_url = self.search_url params: Dict[str, Any] = {} params["type"] = "dataset" @@ -235,29 +238,29 @@ def get_records( params["sort"] = sort_by or "date" # can be either "name" or "date" response_data = self._get_response(user_context, request_url, params=params) total_hits = response_data["data"]["total_count"] - return self._get_records_from_response(response_data["data"]), total_hits + return self._get_datasets_from_response(response_data["data"]), total_hits - def get_files_in_record( - self, record_id: str, writeable: bool, user_context: OptionalUserContext = None + def get_files_in_container( + self, dataset_id: str, writeable: bool, user_context: OptionalUserContext = None ) -> List[RemoteFile]: - '''In Dataverse a "file" is a equivalent to "record" in invenio. This method lists the files in a dataverse dataset.''' + """This method lists the files in a dataverse dataset.""" # TODO: Handle drafts? 
# conditionally_draft = "/draft" if writeable else "" - # request_url = f"{self.records_url}/{record_id}{conditionally_draft}/files" - request_url = self.files_of_dataset_url(dataset_id=record_id) + # request_url = f"{self.records_url}/{dataset_id}{conditionally_draft}/files" + request_url = self.files_of_dataset_url(dataset_id=dataset_id) response_data = self._get_response(user_context, request_url) total_hits = response_data["totalCount"] - return self._get_files_from_response(record_id, response_data["data"]) + return self._get_files_from_response(dataset_id, response_data["data"]) - def create_draft_record( + def create_draft_container( self, title: str, public_name: Optional[str] = None, user_context: OptionalUserContext = None ) -> RemoteDirectory: # TODO: Implement this for Dataverse pass - def upload_file_to_draft_record( + def upload_file_to_draft_container( self, - record_id: str, + dataset_id: str, filename: str, file_path: str, user_context: OptionalUserContext = None, @@ -278,7 +281,7 @@ def download_file_from_container( # TODO: User auth # if self._is_api_url(download_file_content_url): # pass the token as a header only when using the API - # headers = self._get_request_headers(user_context) + # headers = self._get_request_headers(user_context) try: req = urllib.request.Request(download_file_content_url, headers=headers) with urllib.request.urlopen(req, timeout=DEFAULT_SOCKET_TIMEOUT) as page: @@ -287,10 +290,10 @@ def download_file_from_container( page, f.fileno(), file_path, source_encoding=get_charset_from_http_headers(page.headers) ) except urllib.error.HTTPError as e: - # TODO: We can only download files from published records for now + # TODO: We can only download files from published datasets for now if e.code in [401, 403, 404]: raise Exception( - f"Cannot download file '{file_identifier}' from record '{container_id}'. Please make sure the record exists and it is public." 
+ f"Cannot download file '{file_identifier}' from dataset '{container_id}'. Please make sure the dataset exists and it is public." ) def _get_download_file_url(self, container_id: str, file_id: str, user_context: OptionalUserContext = None): @@ -334,26 +337,25 @@ def _can_download_from_api(self, file_details: dict) -> bool: # More info: https://inveniordm.docs.cern.ch/reference/file_storage/ return file_details["storage_class"] == "L" - def _is_draft_record(self, record_id: str, user_context: OptionalUserContext = None): + def _is_draft_dataset(self, dataset_id: str, user_context: OptionalUserContext = None): # TODO: Implement this for Dataverse pass - def _get_draft_record_url(self, record_id: str): + def _get_draft_dataset_url(self, dataset_id: str): # TODO: Implement this for Dataverse pass - def _get_draft_record(self, record_id: str, user_context: OptionalUserContext = None): + def _get_draft_dataset(self, dataset_id: str, user_context: OptionalUserContext = None): # TODO: Implement this for Dataverse pass - def _get_records_from_response(self, response: dict) -> List[RemoteDirectory]: - '''In Dataverse a "dataset" is equivalent to a "record". 
This method gets the datasets in the repository.''' + def _get_datasets_from_response(self, response: dict) -> List[RemoteDirectory]: datasets = response["items"] rval: List[RemoteDirectory] = [] for dataset in datasets: - uri = self.to_plugin_uri(record_id=dataset["global_id"]) + uri = self.to_plugin_uri(dataset_id=dataset["global_id"]) path = self.plugin.to_relative_path(uri) - name = self._get_record_title(dataset) + name = self._get_dataset_title(dataset) rval.append( { "class": "Directory", @@ -364,13 +366,13 @@ def _get_records_from_response(self, response: dict) -> List[RemoteDirectory]: ) return rval - def _get_record_title(self, record: DataverseDataset) -> str: - title = record.get("name") + def _get_dataset_title(self, dataset: DataverseDataset) -> str: + title = dataset.get("name") return title or "No title" - def _get_files_from_response(self, record_id: str, response: dict) -> List[RemoteFile]: - # TODO: Implement this for Dataverse + def _get_files_from_response(self, dataset_id: str, response: dict) -> List[RemoteFile]: + # TODO Do we need this for Dataverse? # this is used in invenio, do we need it for dataverse? 
# files_enabled = response.get("enabled", False) # if not files_enabled: @@ -381,7 +383,7 @@ def _get_files_from_response(self, record_id: str, response: dict) -> List[Remot dataFile = entry.get("dataFile") filename = dataFile.get("filename") persistendId = dataFile.get("persistentId") - uri = self.to_plugin_uri(record_id=record_id, filename=persistendId) + uri = self.to_plugin_uri(dataset_id=dataset_id, file_identifier=persistendId) path = self.plugin.to_relative_path(uri) rval.append( { From 7c229271816af51ba1503ac44bc2c33068ae82da Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Sat, 7 Dec 2024 12:52:30 +0100 Subject: [PATCH 18/64] feat: only load drafts if writeable is true --- lib/galaxy/files/sources/dataverse.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index e3dc4810aa09..b065fc3822b4 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -73,6 +73,7 @@ class DataverseRDMFilesSource(RDMFilesSource): def __init__(self, **kwd: Unpack[RDMFilesSourceProperties]): super().__init__(**kwd) self._scheme_regex = re.compile(rf"^{self.get_scheme()}?://{self.id}|^{DEFAULT_SCHEME}://{self.id}") + self.repository: DataverseRepositoryInteractor def get_scheme(self) -> str: return "dataverse" @@ -227,11 +228,10 @@ def get_file_containers( request_url = self.search_url params: Dict[str, Any] = {} params["type"] = "dataset" - # if writeable: - # TODO: Do we need this for dataverse? + if writeable: # Only draft records owned by the user can be written to. 
- # params["is_published"] = "false" - # request_url = self.user_records_url + params["fq"] = "publicationStatus:Draft" + request_url = self.user_records_url params["per_page"] = limit or DEFAULT_PAGE_LIMIT params["start"] = offset params["q"] = query or "*" @@ -418,7 +418,7 @@ def _get_response( # TODO: Test this method def _get_request_headers(self, user_context: OptionalUserContext, auth_required: bool = False): token = self.plugin.get_authorization_token(user_context) - headers = {"Authorization": f"Bearer {token}"} if token else {} + headers = {"X-Dataverse-key": f"{token}"} if token else {} if auth_required and token is None: self._raise_auth_required() return headers From 73c72d1c79f23228ffddcfaa36eabc4e984cdef6 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Sat, 7 Dec 2024 12:55:07 +0100 Subject: [PATCH 19/64] chore: clarification regarding dataset drafts --- lib/galaxy/files/sources/dataverse.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index b065fc3822b4..bbda24067f8c 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -229,7 +229,9 @@ def get_file_containers( params: Dict[str, Any] = {} params["type"] = "dataset" if writeable: - # Only draft records owned by the user can be written to. + # Only draft datasets can be written to. + # This is not tested and assumes that drafts are never public, + # i.e. 
we automatically only get the drafts from our user params["fq"] = "publicationStatus:Draft" request_url = self.user_records_url params["per_page"] = limit or DEFAULT_PAGE_LIMIT From 4be61091b88b13e6c90d420f4faf9f3c0ea8dbe9 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Sat, 7 Dec 2024 13:15:00 +0100 Subject: [PATCH 20/64] feat: load latest version of files from datasets (this automatically loads drafts if that is the latest version) --- lib/galaxy/files/sources/dataverse.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index bbda24067f8c..7aa268ce8682 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -209,7 +209,7 @@ def user_datasets_url(self) -> str: def file_access_url(self, file_id: str) -> str: return f"{self.api_base_url}/access/datafile/:persistentId?persistentId={file_id}" - def files_of_dataset_url(self, dataset_id: str, dataset_version: str = 1.0) -> str: + def files_of_dataset_url(self, dataset_id: str, dataset_version: str = ':latest') -> str: return f"{self.api_base_url}/datasets/:persistentId/versions/{dataset_version}/files?persistentId={dataset_id}" def to_plugin_uri(self, dataset_id: str, file_identifier: Optional[str] = None) -> str: @@ -233,7 +233,6 @@ def get_file_containers( # This is not tested and assumes that drafts are never public, # i.e. 
we automatically only get the drafts from our user params["fq"] = "publicationStatus:Draft" - request_url = self.user_records_url params["per_page"] = limit or DEFAULT_PAGE_LIMIT params["start"] = offset params["q"] = query or "*" From f557b133bf6a32606ac6637f81e3037d50a44eb2 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Sat, 7 Dec 2024 13:58:21 +0100 Subject: [PATCH 21/64] feat: download files from draft (doesn't work yet due to missing user context - same problem as in invenio) --- lib/galaxy/files/sources/dataverse.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 7aa268ce8682..2407a68aadee 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -171,6 +171,8 @@ def _realize_to( user_context: OptionalUserContext = None, opts: Optional[FilesSourceOptions] = None, ): + # TODO: user_context is always None here when called from a data fetch. (same problem as in invenio.py) + # This prevents downloading files that require authentication even if the user provided a token. 
dataset_id, file_id = self.parse_path(source_path) self.repository.download_file_from_container(dataset_id, file_id, native_path, user_context=user_context) @@ -200,12 +202,6 @@ def api_base_url(self) -> str: def search_url(self) -> str: return f"{self.api_base_url}/search" - @property - def user_datasets_url(self) -> str: - # TODO fix - # return f"{self.repository_url}/api/user/records" - pass - def file_access_url(self, file_id: str) -> str: return f"{self.api_base_url}/access/datafile/:persistentId?persistentId={file_id}" @@ -236,7 +232,7 @@ def get_file_containers( params["per_page"] = limit or DEFAULT_PAGE_LIMIT params["start"] = offset params["q"] = query or "*" - params["sort"] = sort_by or "date" # can be either "name" or "date" + params["sort"] = sort_by or "date" # can be either "name" or "date" response_data = self._get_response(user_context, request_url, params=params) total_hits = response_data["data"]["total_count"] return self._get_datasets_from_response(response_data["data"]), total_hits @@ -279,10 +275,9 @@ def download_file_from_container( download_file_content_url = self._get_download_file_url(container_id, file_identifier, user_context) headers = {} - # TODO: User auth - # if self._is_api_url(download_file_content_url): + if self._is_api_url(download_file_content_url): # pass the token as a header only when using the API - # headers = self._get_request_headers(user_context) + headers = self._get_request_headers(user_context) try: req = urllib.request.Request(download_file_content_url, headers=headers) with urllib.request.urlopen(req, timeout=DEFAULT_SOCKET_TIMEOUT) as page: @@ -323,7 +318,6 @@ def _get_download_file_url(self, container_id: str, file_id: str, user_context: return download_file_content_url - # TODO: Test this method def _is_api_url(self, url: str) -> bool: return "/api/" in url @@ -419,7 +413,7 @@ def _get_response( # TODO: Test this method def _get_request_headers(self, user_context: OptionalUserContext, auth_required: bool = 
False): token = self.plugin.get_authorization_token(user_context) - headers = {"X-Dataverse-key": f"{token}"} if token else {} + headers = {"X-Dataverse-Key": f"{token}"} if token else {} if auth_required and token is None: self._raise_auth_required() return headers From 4a37a21bbf78ea946371df890e1713cd60d261d9 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Sat, 7 Dec 2024 15:16:20 +0100 Subject: [PATCH 22/64] feat: add config samples for dataverse and dataverse_sandbox --- .../sample/file_sources_conf.yml.sample | 18 +++++++++++++ .../user_preferences_extra_conf.yml.sample | 26 +++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/lib/galaxy/config/sample/file_sources_conf.yml.sample b/lib/galaxy/config/sample/file_sources_conf.yml.sample index 0bc5e9e10aed..74bc157f6773 100644 --- a/lib/galaxy/config/sample/file_sources_conf.yml.sample +++ b/lib/galaxy/config/sample/file_sources_conf.yml.sample @@ -229,6 +229,24 @@ public_name: ${user.preferences['zenodo_sandbox|public_name']} writable: true +- type: dataverse + id: dataverse + doc: Dataverse is an open-source data repository platform designed for sharing, preserving, and managing research data, offering tools for data citation, exploration, and collaboration. + label: Dataverse + url: https://dataverse.org + token: ${user.user_vault.read_secret('preferences/dataverse/token')} + public_name: ${user.preferences['dataverse|public_name']} + writable: true + +- type: dataverse + id: dataverse_sandbox + doc: This is the sandbox instance of Dataverse. It is used for testing purposes only, content is NOT preserved. DOIs created in this instance are not real and will not resolve. 
+ label: Dataverse Sandbox (use only for testing purposes) + url: https://demo.dataverse.org + token: ${user.user_vault.read_secret('preferences/dataverse_sandbox/token')} + public_name: ${user.preferences['dataverse_sandbox|public_name']} + writable: true + # Note for developers: you can easily set up a minimal, dockerized Onedata environment # using the so-called "demo-mode": https://onedata.org/#/home/documentation/topic/stable/demo-mode - type: onedata diff --git a/lib/galaxy/config/sample/user_preferences_extra_conf.yml.sample b/lib/galaxy/config/sample/user_preferences_extra_conf.yml.sample index 9f38f40bba1d..fb73fe16c2ea 100644 --- a/lib/galaxy/config/sample/user_preferences_extra_conf.yml.sample +++ b/lib/galaxy/config/sample/user_preferences_extra_conf.yml.sample @@ -135,6 +135,32 @@ preferences: label: Creator name to associate with new records (formatted as "Last name, First name"). If left blank "Anonymous Galaxy User" will be used. You can always change this by editing your record directly. type: text required: False + + dataverse: + description: Your Dataverse Integration Settings + inputs: + - name: token + label: API Token used to create draft records and to upload files. You can manage your tokens at https://YOUR_INSTANCE/dataverseuser.xhtml?selectTab=apiTokenTab (Replace YOUR_INSTANCE with your Dataverse instance URL) + type: secret + # store: vault # Requires setting up vault_config_file in your galaxy.yml + required: False + - name: public_name + label: Creator name to associate with new datasets (formatted as "Last name, First name"). If left blank "Anonymous Galaxy User" will be used. You can always change this by editing your dataset directly. + type: text + required: False + + dataverse_sandbox: + description: Your Dataverse Integration Settings (TESTING ONLY) + inputs: + - name: token + label: API Token used to create draft records and to upload files. 
You can manage your tokens at https://demo.dataverse.org/dataverseuser.xhtml?selectTab=apiTokenTab (Replace demo.dataverse.org with your Dataverse instance URL) + type: secret + # store: vault # Requires setting up vault_config_file in your galaxy.yml + required: False + - name: public_name + label: Creator name to associate with new datasets (formatted as "Last name, First name"). If left blank "Anonymous Galaxy User" will be used. You can always change this by editing your dataset directly. + type: text + required: False # Used in file_sources_conf.yml onedata: From 1b18e90f6fd4e3d321dc4ac3dff4bbe5a130f031 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Sat, 7 Dec 2024 15:21:17 +0100 Subject: [PATCH 23/64] chore: cleanup after download drafts feature, remove apparently not needed functions --- lib/galaxy/files/sources/dataverse.py | 32 ++------------------------- 1 file changed, 2 insertions(+), 30 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 2407a68aadee..94e54cb3a83c 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -78,7 +78,7 @@ def __init__(self, **kwd: Unpack[RDMFilesSourceProperties]): def get_scheme(self) -> str: return "dataverse" - # TODO: Maybe we dont need this + # TODO: Maybe we dont need this? # def score_url_match(self, url: str) -> # if match := self._scheme_regex.match(url): # return match.span()[1] @@ -241,9 +241,6 @@ def get_files_in_container( self, dataset_id: str, writeable: bool, user_context: OptionalUserContext = None ) -> List[RemoteFile]: """This method lists the files in a dataverse dataset.""" - # TODO: Handle drafts?
- # conditionally_draft = "/draft" if writeable else "" - # request_url = f"{self.records_url}/{dataset_id}{conditionally_draft}/files" request_url = self.files_of_dataset_url(dataset_id=dataset_id) response_data = self._get_response(user_context, request_url) total_hits = response_data["totalCount"] @@ -297,17 +294,8 @@ def _get_download_file_url(self, container_id: str, file_id: str, user_context: This method is used to download files from both published and draft datasets that are accessible by the user. """ - # TODO: Implement draft feature for Dataverse - # is_draft_record = self._is_draft_record(container_id, user_context) - download_file_content_url = self.file_access_url(file_id=file_id) - - # https://demo.dataverse.org/api/access/datafile/:persistentId?persistentId=doi:10.70122/FK2/DIG2DG/AVNCLL - # TODO: Implement draft feature for Dataverse - # if is_draft_record: - # file_details_url = self._to_draft_url(file_details_url) - # download_file_content_url = self._to_draft_url(download_file_content_url) - + # file_details = self._get_response(user_context, file_details_url) # TODO: This is a temporary workaround from invenio for the fact that the "content" API # does not support downloading files from S3 or other remote storage classes. 
@@ -321,10 +309,6 @@ def _get_download_file_url(self, container_id: str, file_id: str, user_context: def _is_api_url(self, url: str) -> bool: return "/api/" in url - # TODO: Test this method - def _to_draft_url(self, url: str) -> str: - return url.replace("/files/", "/draft/files/") - def _can_download_from_api(self, file_details: dict) -> bool: # TODO: Have a look at this problem @@ -332,18 +316,6 @@ def _can_download_from_api(self, file_details: dict) -> bool: # More info: https://inveniordm.docs.cern.ch/reference/file_storage/ return file_details["storage_class"] == "L" - def _is_draft_dataset(self, dataset_id: str, user_context: OptionalUserContext = None): - # TODO: Implement this for Dataverse - pass - - def _get_draft_dataset_url(self, dataset_id: str): - # TODO: Implement this for Dataverse - pass - - def _get_draft_dataset(self, dataset_id: str, user_context: OptionalUserContext = None): - # TODO: Implement this for Dataverse - pass - def _get_datasets_from_response(self, response: dict) -> List[RemoteDirectory]: datasets = response["items"] rval: List[RemoteDirectory] = [] From 29ee871d0d7597a14d63d74c28736ad4696f06a0 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Sat, 7 Dec 2024 16:02:41 +0100 Subject: [PATCH 24/64] chore: clearer naming for file container method --- lib/galaxy/files/sources/_rdm.py | 6 +++--- lib/galaxy/files/sources/invenio.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/galaxy/files/sources/_rdm.py b/lib/galaxy/files/sources/_rdm.py index 87ef411b325f..4644e7de6656 100644 --- a/lib/galaxy/files/sources/_rdm.py +++ b/lib/galaxy/files/sources/_rdm.py @@ -90,11 +90,11 @@ def get_files_in_container( """ raise NotImplementedError() - def create_draft_container( + def create_draft_file_container( self, title: str, public_name: Optional[str] = None, user_context: OptionalUserContext = None ): - """Creates a draft container (directory) in the repository with basic metadata. 
+ """Creates a draft file container in the repository with basic metadata. The metadata is usually just the title of the container and the user that created it. Some plugins might also provide additional metadata defaults in the user settings.""" @@ -109,7 +109,7 @@ def upload_file_to_draft_container( ) -> None: """Uploads a file with the provided filename (from file_path) to a draft container with the given container_id. - The draft container must have been created in advance with the `create_draft_container` method. + The draft container must have been created in advance with the `create_draft_file_container` method. The file must exist in the file system at the given file_path. The user_context might be required to authenticate the user in the repository. diff --git a/lib/galaxy/files/sources/invenio.py b/lib/galaxy/files/sources/invenio.py index 8e85377c4dc7..bd86872422c7 100644 --- a/lib/galaxy/files/sources/invenio.py +++ b/lib/galaxy/files/sources/invenio.py @@ -209,7 +209,7 @@ def _create_entry( opts: Optional[FilesSourceOptions] = None, ) -> Entry: public_name = self.get_public_name(user_context) - record = self.repository.create_draft_container(entry_data["name"], public_name, user_context=user_context) + record = self.repository.create_draft_file_container(entry_data["name"], public_name, user_context=user_context) return { "uri": self.repository.to_plugin_uri(record["id"]), "name": record["title"], @@ -294,7 +294,7 @@ def get_files_in_container( response_data = self._get_response(user_context, request_url) return self._get_record_files_from_response(container_id, response_data) - def create_draft_container( + def create_draft_file_container( self, title: str, public_name: Optional[str] = None, user_context: OptionalUserContext = None ) -> RemoteDirectory: today = datetime.date.today().isoformat() From 77f0dde948d9e19c8a5d50974cf8d3e190702f5b Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Sat, 7 Dec 2024 16:03:26 +0100 Subject: [PATCH 25/64] feat: api 
versioning for long term stability --- lib/galaxy/files/sources/dataverse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 94e54cb3a83c..4873ff7975d2 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -196,7 +196,7 @@ class DataverseRepositoryInteractor(RDMRepositoryInteractor): @property def api_base_url(self) -> str: - return f"{self.repository_url}/api" + return f"{self.repository_url}/api/v1" @property def search_url(self) -> str: From 6d2081c8d98cdc07240f075847a73d6c024587c1 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Sat, 7 Dec 2024 16:03:53 +0100 Subject: [PATCH 26/64] chore: add repository type for invenio filesource class --- lib/galaxy/files/sources/invenio.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/galaxy/files/sources/invenio.py b/lib/galaxy/files/sources/invenio.py index bd86872422c7..b99e2ca019f8 100644 --- a/lib/galaxy/files/sources/invenio.py +++ b/lib/galaxy/files/sources/invenio.py @@ -130,6 +130,7 @@ class InvenioRDMFilesSource(RDMFilesSource): def __init__(self, **kwd: Unpack[RDMFilesSourceProperties]): super().__init__(**kwd) self._scheme_regex = re.compile(rf"^{self.get_scheme()}?://{self.id}|^{DEFAULT_SCHEME}://{self.id}") + self.repository: InvenioRepositoryInteractor def get_scheme(self) -> str: return "invenio" From 0220c74cebe33bc1d8545f4ce804ad6691d2e5c6 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Sat, 7 Dec 2024 17:30:16 +0100 Subject: [PATCH 27/64] chore: remove todos for tested methods --- lib/galaxy/files/sources/dataverse.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 4873ff7975d2..24c67c35127d 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -368,8 +368,6 @@ def _get_files_from_response(self, dataset_id: str, response: 
dict) -> List[Remo # def _get_creator_from_public_name(self, public_name: Optional[str] = None) -> Creator: # pass - - # TODO: Test this method def _get_response( self, user_context: OptionalUserContext, @@ -382,7 +380,6 @@ def _get_response( self._ensure_response_has_expected_status_code(response, 200) return response.json() - # TODO: Test this method def _get_request_headers(self, user_context: OptionalUserContext, auth_required: bool = False): token = self.plugin.get_authorization_token(user_context) headers = {"X-Dataverse-Key": f"{token}"} if token else {} @@ -390,7 +387,6 @@ def _get_request_headers(self, user_context: OptionalUserContext, auth_required: self._raise_auth_required() return headers - # TODO: Test this method def _ensure_response_has_expected_status_code(self, response, expected_status_code: int): if response.status_code != expected_status_code: if response.status_code == 403: From fd921a45361ee1bca2ffeba01fb8374fb7a3ab46 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Sat, 7 Dec 2024 17:44:46 +0100 Subject: [PATCH 28/64] feat: export history to existing dataset --- lib/galaxy/files/sources/dataverse.py | 34 +++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 24c67c35127d..0ac54bbe2a39 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -162,6 +162,13 @@ def _create_entry( opts: Optional[FilesSourceOptions] = None, ) -> Entry: # TODO: Implement this for Dataverse + # public_name = self.get_public_name(user_context) + # dataset = self.repository.create_draft_file_container(entry_data.name, public_name, user_context) + # return { + # "uri": self.to_plugin_uri(dataset["global_id"]), + # "name": dataset["name"], + # "external_link": 'test', + # } pass def _realize_to( @@ -185,7 +192,7 @@ def _write_from( opts: Optional[FilesSourceOptions] = None, ): dataset_id, file_id = 
self.parse_path(target_path) - self.repository.upload_file_to_draft_dataset(dataset_id, file_id, native_path, user_context=user_context) + self.repository.upload_file_to_draft_container(dataset_id, file_id, native_path, user_context=user_context) def _get_dataset_id_from_path(self, path: str) -> str: # /doi:10.70122/FK2/DIG2DG => doi:10.70122/FK2/DIG2DG @@ -207,6 +214,9 @@ def file_access_url(self, file_id: str) -> str: def files_of_dataset_url(self, dataset_id: str, dataset_version: str = ':latest') -> str: return f"{self.api_base_url}/datasets/:persistentId/versions/{dataset_version}/files?persistentId={dataset_id}" + + def add_files_to_dataset_url(self, dataset_id: str) -> str: + return f"{self.api_base_url}/datasets/:persistentId/add?persistentId={dataset_id}" def to_plugin_uri(self, dataset_id: str, file_identifier: Optional[str] = None) -> str: return f"{self.plugin.get_uri_root()}/{f'{file_identifier}' if file_identifier else f'{dataset_id}'}" @@ -246,10 +256,10 @@ def get_files_in_container( total_hits = response_data["totalCount"] return self._get_files_from_response(dataset_id, response_data["data"]) - def create_draft_container( + def create_draft_file_container( self, title: str, public_name: Optional[str] = None, user_context: OptionalUserContext = None ) -> RemoteDirectory: - # TODO: Implement this for Dataverse + # TODO Implement for Dataverse, see invenio pass def upload_file_to_draft_container( @@ -259,8 +269,22 @@ def upload_file_to_draft_container( file_path: str, user_context: OptionalUserContext = None, ): - # TODO: Implement this for Dataverse - pass + headers = self._get_request_headers(user_context, auth_required=True) + + with open(file_path, "rb") as file: + files = {'file': (filename, file)} + # -------------------------------------------------- + # Using a "jsonData" parameter, add optional description + file tags + # -------------------------------------------------- + # params = dict(description='Blue skies!', + # categories=['Lily', 
'Rosemary', 'Jack of Hearts']) + # params_as_json_string = json.dumps(params) + payload = dict() + add_files_url = self.add_files_to_dataset_url(dataset_id) + response = requests.post(add_files_url, data=payload, files=files, headers=headers) + print(response.json()) + print(response.status_code) + self._ensure_response_has_expected_status_code(response, 200) def download_file_from_container( self, From 20cbd1ad2242ba806cf79fc04c5fbce292f1c5af Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Thu, 12 Dec 2024 14:59:26 +0100 Subject: [PATCH 29/64] feat: reimport of archived datasets from dataverse --- lib/galaxy/files/sources/dataverse.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 0ac54bbe2a39..7a48d4be7bcd 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -181,7 +181,12 @@ def _realize_to( # TODO: user_context is always None here when called from a data fetch. (same problem as in invenio.py) # This prevents downloading files that require authentication even if the user provided a token. 
dataset_id, file_id = self.parse_path(source_path) - self.repository.download_file_from_container(dataset_id, file_id, native_path, user_context=user_context) + if "rocrate.zip" in file_id: + # If file path contains "rocrate.zip", we need to change the URL, so we use the dataverse API URL to download a dataset as a zip file + file_id = re.sub(r"/[^/]*\.rocrate\.zip$", "", file_id) + self.repository._download_dataset_as_zip(dataset_id, native_path, user_context) + else: + self.repository.download_file_from_container(dataset_id, file_id, native_path, user_context=user_context) # TODO: Test this method def _write_from( @@ -294,6 +299,18 @@ def download_file_from_container( user_context: OptionalUserContext = None, ): download_file_content_url = self._get_download_file_url(container_id, file_identifier, user_context) + self._download_file(file_path, download_file_content_url, user_context) + + def _download_dataset_as_zip(self, dataset_id: str, file_path: str, user_context: OptionalUserContext = None): + download_file_content_url = f"{self.api_base_url}/access/dataset/:persistentId/?persistentId={dataset_id}" + self._download_file(file_path, download_file_content_url, user_context) + + def _download_file( + self, + file_path: str, + download_file_content_url: str, + user_context: OptionalUserContext = None, + ): headers = {} if self._is_api_url(download_file_content_url): @@ -310,7 +327,7 @@ def download_file_from_container( # TODO: We can only download files from published datasets for now if e.code in [401, 403, 404]: raise Exception( - f"Cannot download file '{file_identifier}' from dataset '{container_id}'. Please make sure the dataset exists and it is public." + f"Cannot download file from URL '{file_path}'. Please make sure the dataset and/or file exists and it is public." 
) def _get_download_file_url(self, container_id: str, file_id: str, user_context: OptionalUserContext = None): From ad65b27e907f3880eea7091d5e3d33ac197bbd6e Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Fri, 13 Dec 2024 14:36:26 +0100 Subject: [PATCH 30/64] chore: line breaks --- lib/galaxy/files/sources/dataverse.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 7a48d4be7bcd..6cf2590ebbfc 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -301,7 +301,12 @@ def download_file_from_container( download_file_content_url = self._get_download_file_url(container_id, file_identifier, user_context) self._download_file(file_path, download_file_content_url, user_context) - def _download_dataset_as_zip(self, dataset_id: str, file_path: str, user_context: OptionalUserContext = None): + def _download_dataset_as_zip( + self, + dataset_id: str, + file_path: str, + user_context: OptionalUserContext = None + ): download_file_content_url = f"{self.api_base_url}/access/dataset/:persistentId/?persistentId={dataset_id}" self._download_file(file_path, download_file_content_url, user_context) From 647f1f5110e698b634e331b4c7dda095edd1d03c Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Fri, 13 Dec 2024 14:37:08 +0100 Subject: [PATCH 31/64] feat: more reliable way to reimport archives --- lib/galaxy/files/sources/dataverse.py | 29 ++++++++++++++++++++------- lib/galaxy/schema/schema.py | 5 ++++- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 6cf2590ebbfc..03359765edac 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -43,6 +43,13 @@ stream_to_open_named_file, ) +from galaxy.schema.schema import ( + ModelStoreFormat, +) + +class NotFoundException(Exception): + def __init__(self, message): + 
super().__init__(message) class DataverseDataset(TypedDict): name: str @@ -180,13 +187,20 @@ def _realize_to( ): # TODO: user_context is always None here when called from a data fetch. (same problem as in invenio.py) # This prevents downloading files that require authentication even if the user provided a token. + dataset_id, file_id = self.parse_path(source_path) - if "rocrate.zip" in file_id: - # If file path contains "rocrate.zip", we need to change the URL, so we use the dataverse API URL to download a dataset as a zip file - file_id = re.sub(r"/[^/]*\.rocrate\.zip$", "", file_id) - self.repository._download_dataset_as_zip(dataset_id, native_path, user_context) - else: + try: self.repository.download_file_from_container(dataset_id, file_id, native_path, user_context=user_context) + except NotFoundException as e: + filename = file_id.split("/")[-1] + is_archive = any(format in filename for format in ModelStoreFormat.available_formats()) + if is_archive: + # Workaround explanation: + # When we archive our history to dataverse, the zip sent from Galaxy to dataverse is extracted automatically. + # Only the contents are stored, not the zip itself. + # So, if a zip is not called, we suppose we are trying to reimport an archived history + # and make an API call ti Dataverse to download the dataset as a zip. 
+ self.repository._download_dataset_as_zip(dataset_id, native_path, user_context) # TODO: Test this method def _write_from( @@ -215,7 +229,8 @@ def search_url(self) -> str: return f"{self.api_base_url}/search" def file_access_url(self, file_id: str) -> str: - return f"{self.api_base_url}/access/datafile/:persistentId?persistentId={file_id}" + encoded_file_id = quote(encoded_file_id, safe="") + return f"{self.api_base_url}/access/datafile/:persistentId?persistentId={encoded_file_id}" def files_of_dataset_url(self, dataset_id: str, dataset_version: str = ':latest') -> str: return f"{self.api_base_url}/datasets/:persistentId/versions/{dataset_version}/files?persistentId={dataset_id}" @@ -331,7 +346,7 @@ def _download_file( except urllib.error.HTTPError as e: # TODO: We can only download files from published datasets for now if e.code in [401, 403, 404]: - raise Exception( + raise NotFoundException( f"Cannot download file from URL '{file_path}'. Please make sure the dataset and/or file exists and it is public." 
) diff --git a/lib/galaxy/schema/schema.py b/lib/galaxy/schema/schema.py index e33053fed0d0..a57e0438455b 100644 --- a/lib/galaxy/schema/schema.py +++ b/lib/galaxy/schema/schema.py @@ -1703,7 +1703,10 @@ def is_compressed(cls, value: "ModelStoreFormat"): @classmethod def is_bag(cls, value: "ModelStoreFormat"): return value in [cls.BAG_DOT_TAR, cls.BAG_DOT_TGZ, cls.BAG_DOT_ZIP] - + + @classmethod + def available_formats(cls): + return [item.value for item in cls] class StoreContentSource(Model): store_content_uri: Optional[str] = None From 5698ed5f286948538917ab1a0be3cddafcb89267 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Fri, 13 Dec 2024 14:39:59 +0100 Subject: [PATCH 32/64] chore: typo --- lib/galaxy/files/sources/dataverse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 03359765edac..34be538c1250 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -198,7 +198,7 @@ def _realize_to( # Workaround explanation: # When we archive our history to dataverse, the zip sent from Galaxy to dataverse is extracted automatically. # Only the contents are stored, not the zip itself. - # So, if a zip is not called, we suppose we are trying to reimport an archived history + # So, if a zip is not found, we suppose we are trying to reimport an archived history # and make an API call ti Dataverse to download the dataset as a zip. 
self.repository._download_dataset_as_zip(dataset_id, native_path, user_context) From 234c27ddb395ed9b02e6fe2f045520d1ea2a5961 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Fri, 13 Dec 2024 15:13:17 +0100 Subject: [PATCH 33/64] chore: typo --- lib/galaxy/files/sources/dataverse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 34be538c1250..858577d42cdb 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -199,7 +199,7 @@ def _realize_to( # When we archive our history to dataverse, the zip sent from Galaxy to dataverse is extracted automatically. # Only the contents are stored, not the zip itself. # So, if a zip is not found, we suppose we are trying to reimport an archived history - # and make an API call ti Dataverse to download the dataset as a zip. + # and make an API call to Dataverse to download the dataset as a zip. self.repository._download_dataset_as_zip(dataset_id, native_path, user_context) # TODO: Test this method From 559afcf602d47d96e2727b3579b679fdf73481a1 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Mon, 16 Dec 2024 09:58:44 +0100 Subject: [PATCH 34/64] fix: only recognize .zip files for dataset import workaround (.tar files aren't extracted automatically) --- lib/galaxy/files/sources/dataverse.py | 20 ++++++++++---------- lib/galaxy/schema/schema.py | 5 +---- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 858577d42cdb..6bf7e5f91438 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -43,13 +43,6 @@ stream_to_open_named_file, ) -from galaxy.schema.schema import ( - ModelStoreFormat, -) - -class NotFoundException(Exception): - def __init__(self, message): - super().__init__(message) class DataverseDataset(TypedDict): name: str @@ -193,8 +186,8 @@ def 
_realize_to( self.repository.download_file_from_container(dataset_id, file_id, native_path, user_context=user_context) except NotFoundException as e: filename = file_id.split("/")[-1] - is_archive = any(format in filename for format in ModelStoreFormat.available_formats()) - if is_archive: + is_zip_file = self._is_zip_archive(filename) + if is_zip_file: # Workaround explanation: # When we archive our history to dataverse, the zip sent from Galaxy to dataverse is extracted automatically. # Only the contents are stored, not the zip itself. @@ -202,6 +195,9 @@ def _realize_to( # and make an API call to Dataverse to download the dataset as a zip. self.repository._download_dataset_as_zip(dataset_id, native_path, user_context) + def _is_zip_archive(self, file_name: str) -> bool: + return file_name.endswith(".zip") + # TODO: Test this method def _write_from( self, @@ -301,7 +297,11 @@ def upload_file_to_draft_container( # params_as_json_string = json.dumps(params) payload = dict() add_files_url = self.add_files_to_dataset_url(dataset_id) - response = requests.post(add_files_url, data=payload, files=files, headers=headers) + response = requests.post( + add_files_url, + data=payload, + files=files, + headers=headers) print(response.json()) print(response.status_code) self._ensure_response_has_expected_status_code(response, 200) diff --git a/lib/galaxy/schema/schema.py b/lib/galaxy/schema/schema.py index a57e0438455b..e33053fed0d0 100644 --- a/lib/galaxy/schema/schema.py +++ b/lib/galaxy/schema/schema.py @@ -1703,10 +1703,7 @@ def is_compressed(cls, value: "ModelStoreFormat"): @classmethod def is_bag(cls, value: "ModelStoreFormat"): return value in [cls.BAG_DOT_TAR, cls.BAG_DOT_TGZ, cls.BAG_DOT_ZIP] - - @classmethod - def available_formats(cls): - return [item.value for item in cls] + class StoreContentSource(Model): store_content_uri: Optional[str] = None From 48fa830b684bf0c62684fdd602ce46432b688365 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Mon, 16 Dec 2024 09:59:53 
+0100 Subject: [PATCH 35/64] chore: add NotFoundException --- lib/galaxy/files/sources/dataverse.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 6bf7e5f91438..1130a769e8fe 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -43,6 +43,9 @@ stream_to_open_named_file, ) +class NotFoundException(Exception): + def __init__(self, message): + super().__init__(message) class DataverseDataset(TypedDict): name: str From f789332419ba2fbc183bd4daa75ed1d2baaadc60 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Mon, 16 Dec 2024 12:16:58 +0100 Subject: [PATCH 36/64] chore: remove print statements --- lib/galaxy/files/sources/dataverse.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 1130a769e8fe..67cc6593513d 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -305,8 +305,6 @@ def upload_file_to_draft_container( data=payload, files=files, headers=headers) - print(response.json()) - print(response.status_code) self._ensure_response_has_expected_status_code(response, 200) def download_file_from_container( From 21dc77d1f5c6f3d5ee6fdf2f8b9a14fd85c6150f Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Mon, 16 Dec 2024 12:18:12 +0100 Subject: [PATCH 37/64] chore: remove TODOs --- lib/galaxy/files/sources/dataverse.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 67cc6593513d..1b3b97aed821 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -88,7 +88,6 @@ def get_scheme(self) -> str: # else: # return 0 - # TODO: Test this method (maybe we dont need it) def to_relative_path(self, url: str) -> str: legacy_uri_root = f"{DEFAULT_SCHEME}://{self.id}" if url.startswith(legacy_uri_root): @@ 
-201,7 +200,6 @@ def _realize_to( def _is_zip_archive(self, file_name: str) -> bool: return file_name.endswith(".zip") - # TODO: Test this method def _write_from( self, target_path: str, From 5898d45a665e9e09f7d1340d8b7ddda995fa485a Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Mon, 16 Dec 2024 14:25:56 +0100 Subject: [PATCH 38/64] chore: score_url_match function --- lib/galaxy/files/sources/dataverse.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 1b3b97aed821..7f67f61e0424 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -1,4 +1,3 @@ -import datetime import json import re import urllib.request @@ -82,11 +81,11 @@ def get_scheme(self) -> str: return "dataverse" # TODO: Maybe we dont need this? - # def score_url_match(self, url: str) -> - # if match := self._scheme_regex.match(url): - # return match.span()[1] - # else: - # return 0 + def score_url_match(self, url: str) -> int: + if match := self._scheme_regex.match(url): + return match.span()[1] + else: + return 0 def to_relative_path(self, url: str) -> str: legacy_uri_root = f"{DEFAULT_SCHEME}://{self.id}" From b6a4f36e49efd3028d2d8d2cfc5ebd9f1d354005 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Mon, 16 Dec 2024 14:48:39 +0100 Subject: [PATCH 39/64] chore: remove invenio specific feature --- lib/galaxy/files/sources/dataverse.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 7f67f61e0424..a7003fcb9826 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -80,7 +80,6 @@ def __init__(self, **kwd: Unpack[RDMFilesSourceProperties]): def get_scheme(self) -> str: return "dataverse" - # TODO: Maybe we dont need this? 
def score_url_match(self, url: str) -> int: if match := self._scheme_regex.match(url): return match.span()[1] @@ -397,13 +396,6 @@ def _get_dataset_title(self, dataset: DataverseDataset) -> str: return title or "No title" def _get_files_from_response(self, dataset_id: str, response: dict) -> List[RemoteFile]: - - # TODO Do we need this for Dataverse? - # this is used in invenio, do we need it for dataverse? - # files_enabled = response.get("enabled", False) - # if not files_enabled: - # return [] - rval: List[RemoteFile] = [] for entry in response: dataFile = entry.get("dataFile") From 4e21abdb116b4bb3119853a7eb5959e6f083f5d9 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Tue, 17 Dec 2024 10:06:56 +0100 Subject: [PATCH 40/64] chore: remove unused metadata parameter for file upload --- lib/galaxy/files/sources/dataverse.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index a7003fcb9826..eda1fbbd8c19 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -287,18 +287,9 @@ def upload_file_to_draft_container( headers = self._get_request_headers(user_context, auth_required=True) with open(file_path, "rb") as file: - files = {'file': (filename, file)} - # -------------------------------------------------- - # Using a "jsonData" parameter, add optional description + file tags - # -------------------------------------------------- - # params = dict(description='Blue skies!', - # categories=['Lily', 'Rosemary', 'Jack of Hearts']) - # params_as_json_string = json.dumps(params) - payload = dict() add_files_url = self.add_files_to_dataset_url(dataset_id) response = requests.post( add_files_url, - data=payload, files=files, headers=headers) self._ensure_response_has_expected_status_code(response, 200) From 128f8ab79e1a3951b60bde17cbe4892a6f9dc0bc Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Tue, 17 Dec 2024 10:19:20 +0100 Subject: [PATCH 
41/64] fix: add files again --- lib/galaxy/files/sources/dataverse.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index eda1fbbd8c19..52f8c3939e4b 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -287,6 +287,7 @@ def upload_file_to_draft_container( headers = self._get_request_headers(user_context, auth_required=True) with open(file_path, "rb") as file: + files = {'file': (filename, file)} add_files_url = self.add_files_to_dataset_url(dataset_id) response = requests.post( add_files_url, From e22b2dff86a2d0e88f1e5cf911b28a50cce09072 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Tue, 17 Dec 2024 12:31:46 +0100 Subject: [PATCH 42/64] feat: export history to new dataverse dataset --- lib/galaxy/files/sources/dataverse.py | 173 +++++++++++++++++++++++--- 1 file changed, 159 insertions(+), 14 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 52f8c3939e4b..20b4133f6236 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -161,15 +161,13 @@ def _create_entry( user_context: OptionalUserContext = None, opts: Optional[FilesSourceOptions] = None, ) -> Entry: - # TODO: Implement this for Dataverse - # public_name = self.get_public_name(user_context) - # dataset = self.repository.create_draft_file_container(entry_data.name, public_name, user_context) - # return { - # "uri": self.to_plugin_uri(dataset["global_id"]), - # "name": dataset["name"], - # "external_link": 'test', - # } - pass + public_name = self.get_public_name(user_context) or "Anonymous Galaxy User" + dataset = self.repository.create_draft_file_container(entry_data["name"], public_name, user_context) + return { + "uri": self.repository.to_plugin_uri(dataset.get("persistentId")), + "name": dataset.get("name") or "No title", + "external_link": 
self.repository.public_dataset_url(dataset.get("persistentId")), + } def _realize_to( self, @@ -209,7 +207,7 @@ def _write_from( self.repository.upload_file_to_draft_container(dataset_id, file_id, native_path, user_context=user_context) def _get_dataset_id_from_path(self, path: str) -> str: - # /doi:10.70122/FK2/DIG2DG => doi:10.70122/FK2/DIG2DG + """e.g. /doi:10.70122/FK2/DIG2DG => doi:10.70122/FK2/DIG2DG""" return path.lstrip("/") class DataverseRepositoryInteractor(RDMRepositoryInteractor): @@ -230,6 +228,15 @@ def file_access_url(self, file_id: str) -> str: def files_of_dataset_url(self, dataset_id: str, dataset_version: str = ':latest') -> str: return f"{self.api_base_url}/datasets/:persistentId/versions/{dataset_version}/files?persistentId={dataset_id}" + def create_collection_url(self, parent_alias: str) -> str: + return f"{self.api_base_url}/dataverses/{parent_alias}" + + def create_dataset_url(self, parent_alias: str) -> str: + return f"{self.api_base_url}/dataverses/{parent_alias}/datasets" + + def public_dataset_url(self, dataset_id: str) -> str: + return f"{self.repository_url}/dataset.xhtml?persistentId={dataset_id}" + def add_files_to_dataset_url(self, dataset_id: str) -> str: return f"{self.api_base_url}/datasets/:persistentId/add?persistentId={dataset_id}" @@ -274,8 +281,34 @@ def get_files_in_container( def create_draft_file_container( self, title: str, public_name: Optional[str] = None, user_context: OptionalUserContext = None ) -> RemoteDirectory: - # TODO Implement for Dataverse, see invenio - pass + """Creates a draft Dataset in the repository. Dataverse Datasets are contained in Collections. Collections can be contained in Collections. 
+ We create a Collection inside the root Collection and then a Dataset inside that Collection.""" + collection_alias = self.create_valid_alias(public_name, title) + collection_payload = self._prepare_collection_data(title, collection_alias, user_context) + collection = self._create_collection(":root", collection_payload, user_context) + if collection and collection.get("data"): + collection_alias = collection.get("data").get("alias") + else: + raise Exception("Could not create collection in Dataverse or response has not expected format.") + dataset_payload = self._prepare_dataset_data(title, public_name, user_context) + dataset = self._create_dataset(collection_alias, dataset_payload, user_context) + if dataset and dataset.get("data"): + dataset["data"]["name"] = title + return dataset["data"] + else: + raise Exception("Could not create dataset in Dataverse or response has not expected format.") + + def _create_collection(self, parent_alias: str, collection_payload: dict, user_context: OptionalUserContext = None) -> dict: + headers = self._get_request_headers(user_context, auth_required=True) + response = requests.post(self.create_collection_url(parent_alias), data=collection_payload, headers=headers) + self._ensure_response_has_expected_status_code(response, 201) + return response.json() + + def _create_dataset(self, parent_alias: str, dataset_payload: dict, user_context: OptionalUserContext = None) -> dict: + headers = self._get_request_headers(user_context, auth_required=True) + response = requests.post(self.create_dataset_url(parent_alias), data=dataset_payload, headers=headers) + self._ensure_response_has_expected_status_code(response, 201) + return response.json() def upload_file_to_draft_container( self, @@ -439,13 +472,11 @@ def _ensure_response_has_expected_status_code(self, response, expected_status_co f"Request to {response.url} failed with status code {response.status_code}: {error_message}" ) - # TODO: Test this method def _raise_auth_required(self): 
raise AuthenticationRequired( f"Please provide a personal access token in your user's preferences for '{self.plugin.label}'" ) - # TODO: Test this method def _get_response_error_message(self, response): response_json = response.json() error_message = response_json.get("message") if response.status_code == 400 else response.text @@ -453,6 +484,120 @@ def _get_response_error_message(self, response): for error in errors: error_message += f"\n{json.dumps(error)}" return error_message + + def _get_user_email(self, user_context: OptionalUserContext = None) -> str: + return user_context.email if user_context and user_context.email else "enteryourmail@placeholder.com" + + def create_valid_alias(self, public_name: str, title: str) -> str: + return re.sub(r"[^a-zA-Z0-9-_]", "", public_name.lower().replace(" ", "-") + "_" + title.lower().replace(" ", "-")) + + def _prepare_collection_data( + self, + title: str, + collection_alias: Optional[str] = None, + user_context: OptionalUserContext = None, + ) -> str: + user_email = self._get_user_email(user_context) + return json.dumps({ + "name": title, + "alias": collection_alias, + "dataverseContacts": [ + { + "contactEmail": user_email + }, + ], + }) + + def _prepare_dataset_data( + self, + title: str, + public_name: str, + user_context: OptionalUserContext = None, + ) -> str: + """Prepares the dataset data with all required metadata fields.""" + user_email = self._get_user_email(user_context) + author_name = public_name + dataset_data = { + "datasetVersion": { + "license": { + "name": "CC0 1.0", + "uri": "http://creativecommons.org/publicdomain/zero/1.0" + }, + "metadataBlocks": { + "citation": { + "fields": [ + { + "value": title, + "typeClass": "primitive", + "multiple": False, + "typeName": "title" + }, + { + "value": [ + { + "authorName": { + "value": author_name, + "typeClass": "primitive", + "multiple": False, + "typeName": "authorName" + } + } + ], + "typeClass": "compound", + "multiple": True, + "typeName": "author" + }, + 
{ + "value": [ + { + "datasetContactEmail": { + "typeClass": "primitive", + "multiple": False, + "typeName": "datasetContactEmail", + "value": user_email, + }, + "datasetContactName": { + "typeClass": "primitive", + "multiple": False, + "typeName": "datasetContactName", + "value": author_name, + } + } + ], + "typeClass": "compound", + "multiple": True, + "typeName": "datasetContact" + }, + { + "value": [ + { + "dsDescriptionValue": { + "value": "Exported history from Galaxy", + "multiple": False, + "typeClass": "primitive", + "typeName": "dsDescriptionValue" + } + } + ], + "typeClass": "compound", + "multiple": True, + "typeName": "dsDescription" + }, + { + "value": [ + "Medicine, Health and Life Sciences" + ], + "typeClass": "controlledVocabulary", + "multiple": True, + "typeName": "subject" + } + ], + "displayName": "Citation Metadata" + } + } + } + } + return json.dumps(dataset_data) __all__ = ("DataverseRDMFilesSource",) \ No newline at end of file From 08cd27f97699e63a8cd863d5ea4a4efd0cc05d8c Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Tue, 17 Dec 2024 12:40:00 +0100 Subject: [PATCH 43/64] fix: remove todos, file_access_url --- lib/galaxy/files/sources/dataverse.py | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 20b4133f6236..56fbe6c9b16b 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -222,7 +222,7 @@ def search_url(self) -> str: return f"{self.api_base_url}/search" def file_access_url(self, file_id: str) -> str: - encoded_file_id = quote(encoded_file_id, safe="") + encoded_file_id = quote(file_id, safe="") return f"{self.api_base_url}/access/datafile/:persistentId?persistentId={encoded_file_id}" def files_of_dataset_url(self, dataset_id: str, dataset_version: str = ':latest') -> str: @@ -335,7 +335,7 @@ def download_file_from_container( file_path: str, user_context: 
OptionalUserContext = None, ): - download_file_content_url = self._get_download_file_url(container_id, file_identifier, user_context) + download_file_content_url = self.file_access_url(file_identifier) self._download_file(file_path, download_file_content_url, user_context) def _download_dataset_as_zip( @@ -378,27 +378,11 @@ def _get_download_file_url(self, container_id: str, file_id: str, user_context: This method is used to download files from both published and draft datasets that are accessible by the user. """ download_file_content_url = self.file_access_url(file_id=file_id) - - # file_details = self._get_response(user_context, file_details_url) - # TODO: This is a temporary workaround from invenio for the fact that the "content" API - # does not support downloading files from S3 or other remote storage classes. - # We might need something like this as well for dataverse - # if not self._can_download_from_api(file_details): - # More info: https://inveniordm.docs.cern.ch/reference/file_storage/#remote-files-r - # download_file_content_url = f"{file_details_url.replace('/api', '')}?download=1" - return download_file_content_url def _is_api_url(self, url: str) -> bool: return "/api/" in url - def _can_download_from_api(self, file_details: dict) -> bool: - # TODO: Have a look at this problem - - # Only files stored locally seems to be fully supported by the API for now - # More info: https://inveniordm.docs.cern.ch/reference/file_storage/ - return file_details["storage_class"] == "L" - def _get_datasets_from_response(self, response: dict) -> List[RemoteDirectory]: datasets = response["items"] rval: List[RemoteDirectory] = [] From 173c445287c710442408a5642bf1b579ba667668 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Tue, 17 Dec 2024 12:40:34 +0100 Subject: [PATCH 44/64] chore: remove get creator method --- lib/galaxy/files/sources/dataverse.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py 
b/lib/galaxy/files/sources/dataverse.py index 56fbe6c9b16b..d2bc4ff3ea45 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -424,10 +424,6 @@ def _get_files_from_response(self, dataset_id: str, response: dict) -> List[Remo ) return rval - # TODO: Implement this for Dataverse - # def _get_creator_from_public_name(self, public_name: Optional[str] = None) -> Creator: - # pass - def _get_response( self, user_context: OptionalUserContext, From 15c9488e1b56a33165e48fe14affe19e32868d70 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 18 Dec 2024 10:39:25 +0100 Subject: [PATCH 45/64] chore: remove reference to directories in rdm base class --- lib/galaxy/files/sources/_rdm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/galaxy/files/sources/_rdm.py b/lib/galaxy/files/sources/_rdm.py index 4644e7de6656..99681bf76e7f 100644 --- a/lib/galaxy/files/sources/_rdm.py +++ b/lib/galaxy/files/sources/_rdm.py @@ -139,10 +139,10 @@ class RDMFilesSource(BaseFilesSource): by file sources that interact with RDM repositories. A RDM file source is similar to a regular file source, but instead of tree of - files and directories, it provides a (one level) list of containers (representing directories) + files and directories, it provides a (one level) list of containers that can contain only files (no subdirectories). - In addition, RDM file sources might need to create a new container (directory) in advance in the + In addition, RDM file sources might need to create a new container in advance in the repository, and then upload a file to it. This is done by calling the `_create_entry` method. 
""" From 144fec035f404e58d2c01df3f923b22d264e017d Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 18 Dec 2024 10:56:12 +0100 Subject: [PATCH 46/64] chore: refactor parse path function --- lib/galaxy/files/sources/dataverse.py | 30 ++++++++++++--------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index d2bc4ff3ea45..bdb8d22e7cb0 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -97,9 +97,9 @@ def get_repository_interactor(self, repository_url: str) -> RDMRepositoryInterac return DataverseRepositoryInteractor(repository_url, self) def parse_path(self, source_path: str, container_id_only: bool = False) -> ContainerAndFileIdentifier: - """Parses the given source path and returns the dataset_id and the file_id. + """Parses the given source path and returns the dataset_id and/or the file_id. - The source path must have the format '//'. + The source path must either have the format '/' or '/' where is a subset of . If dataset_id_only is True, the source path must have the format '/' and an empty file_id will be returned. Example dataset_id: @@ -108,24 +108,20 @@ def parse_path(self, source_path: str, container_id_only: bool = False) -> Conta Example file_id: doi:10.70122/FK2/DIG2DG/AVNCLL """ - def get_error_msg(details: str) -> str: - return f"Invalid source path: '{source_path}'. Expected format: '{expected_format}'. {details}" - - expected_format = "/" if not source_path.startswith("/"): - raise ValueError(get_error_msg("Must start with '/'.")) - parts = source_path[1:].split("/", 4) - dataset_id = "/".join(parts[0:3]) + raise ValueError(f"Invalid source path: '{source_path}'. 
Must start with '/'.") + + parts = source_path[1:].split("/", 3) + dataset_id = "/".join(parts[:3]) + if container_id_only: if len(parts) != 3: - raise ValueError(get_error_msg("Please provide the dataset_id only.")) - dataset_id = "/".join(parts[0:3]) - return ContainerAndFileIdentifier(dataset_id=parts[0:3], file_identifier="") - expected_format = "//" - if len(parts) < 4: - raise ValueError(get_error_msg("Please provide both the dataset_id and file_id.")) - if len(parts) > 4: - raise ValueError(get_error_msg("Too many parts. Please provide the dataset_id and file_id only.")) + raise ValueError(f"Invalid source path: '{source_path}'. Expected format: '/'.") + return ContainerAndFileIdentifier(container_id=dataset_id, file_identifier="") + + if len(parts) != 4: + raise ValueError(f"Invalid source path: '{source_path}'. Expected format: '/'.") + file_id = dataset_id + "/" + parts[3] return ContainerAndFileIdentifier(container_id=dataset_id, file_identifier=file_id) From 8442ceee95bbe3948ef9c650434b36c2980d9524 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 18 Dec 2024 10:56:34 +0100 Subject: [PATCH 47/64] chore: reordering imports --- lib/galaxy/files/sources/dataverse.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index bdb8d22e7cb0..443bd6c21b1b 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -1,6 +1,8 @@ import json import re import urllib.request +from urllib.parse import quote + from typing import ( Any, cast, @@ -9,7 +11,6 @@ Optional, Tuple, ) -from urllib.parse import quote from typing_extensions import ( Literal, From f4fdf5da92584942ab3b6b657285db44cbf57d6d Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 18 Dec 2024 10:59:32 +0100 Subject: [PATCH 48/64] chore: remove duplicated function --- lib/galaxy/files/sources/dataverse.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git 
a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 443bd6c21b1b..9901203a6261 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -148,7 +148,7 @@ def _list( writeable, user_context, limit=limit, offset=offset, query=query ) return cast(List[AnyRemoteEntry], datasets), total_hits - dataset_id = self._get_dataset_id_from_path(path) + dataset_id = self.get_container_id_from_path(path) files = self.repository.get_files_in_container(dataset_id, writeable, user_context) return cast(List[AnyRemoteEntry], files), len(files) @@ -202,10 +202,6 @@ def _write_from( ): dataset_id, file_id = self.parse_path(target_path) self.repository.upload_file_to_draft_container(dataset_id, file_id, native_path, user_context=user_context) - - def _get_dataset_id_from_path(self, path: str) -> str: - """e.g. /doi:10.70122/FK2/DIG2DG => doi:10.70122/FK2/DIG2DG""" - return path.lstrip("/") class DataverseRepositoryInteractor(RDMRepositoryInteractor): """In Dataverse a "Dataset" represents what we refer to as container in the rdm base class""" From 5480e8f0a47e7f9d68cdc10e9cc51e7511cc8bdc Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 18 Dec 2024 11:13:31 +0100 Subject: [PATCH 49/64] chore: add TODO for tar.gz files --- lib/galaxy/files/sources/dataverse.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 9901203a6261..56e3a72334dd 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -310,9 +310,11 @@ def upload_file_to_draft_container( file_path: str, user_context: OptionalUserContext = None, ): + """Uploads a file to a draft dataset in the repository.""" headers = self._get_request_headers(user_context, auth_required=True) with open(file_path, "rb") as file: + # TODO: For some reason tar.gz files are not uploaded successfully to Dataverse. 
files = {'file': (filename, file)} add_files_url = self.add_files_to_dataset_url(dataset_id) response = requests.post( From 3d0ec46d9090cd217029a1ceddb1c41bd6458c98 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 18 Dec 2024 11:14:50 +0100 Subject: [PATCH 50/64] chore: add dataset download url --- lib/galaxy/files/sources/dataverse.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 56e3a72334dd..68414daf0088 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -227,11 +227,14 @@ def create_collection_url(self, parent_alias: str) -> str: def create_dataset_url(self, parent_alias: str) -> str: return f"{self.api_base_url}/dataverses/{parent_alias}/datasets" - def public_dataset_url(self, dataset_id: str) -> str: - return f"{self.repository_url}/dataset.xhtml?persistentId={dataset_id}" + def download_dataset_as_zip_url(self, dataset_id: str) -> str: + return f"{self.api_base_url}/access/dataset/:persistentId/?persistentId={dataset_id}" def add_files_to_dataset_url(self, dataset_id: str) -> str: return f"{self.api_base_url}/datasets/:persistentId/add?persistentId={dataset_id}" + + def public_dataset_url(self, dataset_id: str) -> str: + return f"{self.repository_url}/dataset.xhtml?persistentId={dataset_id}" def to_plugin_uri(self, dataset_id: str, file_identifier: Optional[str] = None) -> str: return f"{self.plugin.get_uri_root()}/{f'{file_identifier}' if file_identifier else f'{dataset_id}'}" @@ -339,8 +342,8 @@ def _download_dataset_as_zip( file_path: str, user_context: OptionalUserContext = None ): - download_file_content_url = f"{self.api_base_url}/access/dataset/:persistentId/?persistentId={dataset_id}" - self._download_file(file_path, download_file_content_url, user_context) + download_dataset_url = self.download_dataset_as_zip_url(dataset_id) + self._download_file(file_path, download_dataset_url, 
user_context) def _download_file( self, From ed8bd7873a900099487dd9b79635f279e012b69b Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 18 Dec 2024 11:15:38 +0100 Subject: [PATCH 51/64] chore: private get alias function --- lib/galaxy/files/sources/dataverse.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 68414daf0088..d7a1a500ae38 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -466,16 +466,17 @@ def _get_response_error_message(self, response): def _get_user_email(self, user_context: OptionalUserContext = None) -> str: return user_context.email if user_context and user_context.email else "enteryourmail@placeholder.com" - def create_valid_alias(self, public_name: str, title: str) -> str: + def _create_valid_alias(self, public_name: str, title: str) -> str: return re.sub(r"[^a-zA-Z0-9-_]", "", public_name.lower().replace(" ", "-") + "_" + title.lower().replace(" ", "-")) def _prepare_collection_data( self, title: str, - collection_alias: Optional[str] = None, + public_name: str, user_context: OptionalUserContext = None, ) -> str: user_email = self._get_user_email(user_context) + collection_alias = self._create_valid_alias(public_name, title) return json.dumps({ "name": title, "alias": collection_alias, From 441254f13bc3c74148b8bacc7a7d6ca8a593138d Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 18 Dec 2024 11:16:28 +0100 Subject: [PATCH 52/64] chore: payload as str parameter instead of dict --- lib/galaxy/files/sources/dataverse.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index d7a1a500ae38..044e105b04e1 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -290,17 +290,17 @@ def create_draft_file_container( dataset = 
self._create_dataset(collection_alias, dataset_payload, user_context) if dataset and dataset.get("data"): dataset["data"]["name"] = title - return dataset["data"] + return dataset.get("data") else: raise Exception("Could not create dataset in Dataverse or response has not expected format.") - def _create_collection(self, parent_alias: str, collection_payload: dict, user_context: OptionalUserContext = None) -> dict: + def _create_collection(self, parent_alias: str, collection_payload: str, user_context: OptionalUserContext = None) -> dict: headers = self._get_request_headers(user_context, auth_required=True) response = requests.post(self.create_collection_url(parent_alias), data=collection_payload, headers=headers) self._ensure_response_has_expected_status_code(response, 201) return response.json() - def _create_dataset(self, parent_alias: str, dataset_payload: dict, user_context: OptionalUserContext = None) -> dict: + def _create_dataset(self, parent_alias: str, dataset_payload: str, user_context: OptionalUserContext = None) -> dict: headers = self._get_request_headers(user_context, auth_required=True) response = requests.post(self.create_dataset_url(parent_alias), data=dataset_payload, headers=headers) self._ensure_response_has_expected_status_code(response, 201) From 06a976fc4fc31cd65850f3656101b954f362b782 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 18 Dec 2024 11:16:58 +0100 Subject: [PATCH 53/64] chore: public_name not optional, create alias in payload preparation --- lib/galaxy/files/sources/dataverse.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 044e105b04e1..445cc08c6a58 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -275,12 +275,12 @@ def get_files_in_container( return self._get_files_from_response(dataset_id, response_data["data"]) def create_draft_file_container( - self, title: 
str, public_name: Optional[str] = None, user_context: OptionalUserContext = None + self, title: str, public_name: str, user_context: OptionalUserContext = None ) -> RemoteDirectory: - """Creates a draft Dataset in the repository. Dataverse Datasets are contained in Collections. Collections can be contained in Collections. - We create a Collection inside the root Collection and then a Dataset inside that Collection.""" - collection_alias = self.create_valid_alias(public_name, title) - collection_payload = self._prepare_collection_data(title, collection_alias, user_context) + """Creates a draft dataset in the repository. Dataverse datasets are contained in collections. Collections can be contained in collections. + We create a collection inside the root collection and then a dataset inside that collection. + """ + collection_payload = self._prepare_collection_data(title, public_name, user_context) collection = self._create_collection(":root", collection_payload, user_context) if collection and collection.get("data"): collection_alias = collection.get("data").get("alias") From 9c3a632ee991c4486790cfeca1996d49a4523521 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 18 Dec 2024 11:17:38 +0100 Subject: [PATCH 54/64] chore: docstrings --- lib/galaxy/files/sources/dataverse.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 445cc08c6a58..49b17c185580 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -61,14 +61,12 @@ class DataverseDataset(TypedDict): updatedAt: str publication_date: str -AccessStatus = Literal["public", "restricted"] class DataverseRDMFilesSource(RDMFilesSource): """A files source for Dataverse turn-key research data management repository. 
- In Dataverse a "Dataset" represents what we refer to as container in the rdm base class + In Dataverse a "dataset" represents what we refer to as container in the rdm base class """ - plugin_type = "dataverse" supports_pagination = True supports_search = True @@ -140,7 +138,7 @@ def _list( query: Optional[str] = None, sort_by: Optional[str] = None, ) -> Tuple[List[AnyRemoteEntry], int]: - """This method lists the files in the Dataverse Dataset.""" + """This method lists the datasets or files from dataverse.""" writeable = opts and opts.writeable or False is_root_path = path == "/" if is_root_path: @@ -158,8 +156,9 @@ def _create_entry( user_context: OptionalUserContext = None, opts: Optional[FilesSourceOptions] = None, ) -> Entry: + """Creates a draft dataset in the repository.""" public_name = self.get_public_name(user_context) or "Anonymous Galaxy User" - dataset = self.repository.create_draft_file_container(entry_data["name"], public_name, user_context) + dataset = self.repository.create_draft_file_container(entry_data.get("name"), public_name, user_context) return { "uri": self.repository.to_plugin_uri(dataset.get("persistentId")), "name": dataset.get("name") or "No title", From 0398a76657f461433640f909756a794be9b716f9 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 18 Dec 2024 11:18:40 +0100 Subject: [PATCH 55/64] chore: remove unusued get file url function --- lib/galaxy/files/sources/dataverse.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 49b17c185580..ad6ceaddca24 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -369,14 +369,6 @@ def _download_file( f"Cannot download file from URL '{file_path}'. Please make sure the dataset and/or file exists and it is public." 
) - def _get_download_file_url(self, container_id: str, file_id: str, user_context: OptionalUserContext = None): - """Get the URL to download a file from a dataset(=dataverse file container). - - This method is used to download files from both published and draft datasets that are accessible by the user. - """ - download_file_content_url = self.file_access_url(file_id=file_id) - return download_file_content_url - def _is_api_url(self, url: str) -> bool: return "/api/" in url From a73f6797c2da2bb6140edb4ec1f6e3ad09295e37 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 18 Dec 2024 11:19:39 +0100 Subject: [PATCH 56/64] chore: reorder is api url function --- lib/galaxy/files/sources/dataverse.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index ad6ceaddca24..b71440f65a00 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -237,6 +237,9 @@ def public_dataset_url(self, dataset_id: str) -> str: def to_plugin_uri(self, dataset_id: str, file_identifier: Optional[str] = None) -> str: return f"{self.plugin.get_uri_root()}/{f'{file_identifier}' if file_identifier else f'{dataset_id}'}" + + def _is_api_url(self, url: str) -> bool: + return "/api/" in url def get_file_containers( self, @@ -369,9 +372,6 @@ def _download_file( f"Cannot download file from URL '{file_path}'. Please make sure the dataset and/or file exists and it is public." 
) - def _is_api_url(self, url: str) -> bool: - return "/api/" in url - def _get_datasets_from_response(self, response: dict) -> List[RemoteDirectory]: datasets = response["items"] rval: List[RemoteDirectory] = [] From ddecf15badef7953edd5fb6dbbefa65d568d1ee3 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 18 Dec 2024 11:25:54 +0100 Subject: [PATCH 57/64] chore: remove unneeded function --- lib/galaxy/files/sources/dataverse.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index b71440f65a00..d8944e46c260 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -382,17 +382,13 @@ def _get_datasets_from_response(self, response: dict) -> List[RemoteDirectory]: rval.append( { "class": "Directory", - "name": name, + "name": dataset.get("name") or "No title", "uri": uri, "path": path, } ) return rval - def _get_dataset_title(self, dataset: DataverseDataset) -> str: - title = dataset.get("name") - return title or "No title" - def _get_files_from_response(self, dataset_id: str, response: dict) -> List[RemoteFile]: rval: List[RemoteFile] = [] for entry in response: From 1e4135f10b27016d8edee576417566ceb5809821 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 18 Dec 2024 11:27:22 +0100 Subject: [PATCH 58/64] chore: simplify get datasets from response --- lib/galaxy/files/sources/dataverse.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index d8944e46c260..58c937f4e7c8 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -373,18 +373,15 @@ def _download_file( ) def _get_datasets_from_response(self, response: dict) -> List[RemoteDirectory]: - datasets = response["items"] rval: List[RemoteDirectory] = [] - for dataset in datasets: + for dataset in response["items"]: uri = 
self.to_plugin_uri(dataset_id=dataset["global_id"]) - path = self.plugin.to_relative_path(uri) - name = self._get_dataset_title(dataset) rval.append( { "class": "Directory", "name": dataset.get("name") or "No title", "uri": uri, - "path": path, + "path": self.plugin.to_relative_path(uri), } ) return rval From d778dfdf74525a79d3c9dbbd0bf81fa9cc41d14a Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 18 Dec 2024 11:28:53 +0100 Subject: [PATCH 59/64] chore: simplify get_files_from_response --- lib/galaxy/files/sources/dataverse.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 58c937f4e7c8..688afe78f27a 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -390,18 +390,15 @@ def _get_files_from_response(self, dataset_id: str, response: dict) -> List[Remo rval: List[RemoteFile] = [] for entry in response: dataFile = entry.get("dataFile") - filename = dataFile.get("filename") - persistendId = dataFile.get("persistentId") - uri = self.to_plugin_uri(dataset_id=dataset_id, file_identifier=persistendId) - path = self.plugin.to_relative_path(uri) + uri = self.to_plugin_uri(dataset_id, dataFile.get("persistentId")) rval.append( { "class": "File", - "name": filename, + "name": dataFile.get("filename"), "size": dataFile.get("filesize"), "ctime": dataFile.get("creationDate"), "uri": uri, - "path": path, + "path": self.plugin.to_relative_path(uri), } ) return rval From e44c2d31dc49398de7972439015ee9eec44674e1 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 18 Dec 2024 11:31:34 +0100 Subject: [PATCH 60/64] chore: simplify collection creation --- lib/galaxy/files/sources/dataverse.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 688afe78f27a..0f8ed0927542 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ 
b/lib/galaxy/files/sources/dataverse.py @@ -456,14 +456,12 @@ def _prepare_collection_data( public_name: str, user_context: OptionalUserContext = None, ) -> str: - user_email = self._get_user_email(user_context) - collection_alias = self._create_valid_alias(public_name, title) return json.dumps({ "name": title, - "alias": collection_alias, + "alias": self._create_valid_alias(public_name, title), "dataverseContacts": [ { - "contactEmail": user_email + "contactEmail": self._get_user_email(user_context) }, ], }) From 885c4f32d60e37901774c7600609f4da5a930960 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 18 Dec 2024 11:44:40 +0100 Subject: [PATCH 61/64] chore: add docstring to realize_to and write_from --- lib/galaxy/files/sources/dataverse.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 0f8ed0927542..85b7788fd210 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -172,6 +172,7 @@ def _realize_to( user_context: OptionalUserContext = None, opts: Optional[FilesSourceOptions] = None, ): + """Used when download files from dataverse.""" # TODO: user_context is always None here when called from a data fetch. (same problem as in invenio.py) # This prevents downloading files that require authentication even if the user provided a token. 
@@ -199,6 +200,7 @@ def _write_from( user_context: OptionalUserContext = None, opts: Optional[FilesSourceOptions] = None, ): + """Used when uploading files to dataverse.""" dataset_id, file_id = self.parse_path(target_path) self.repository.upload_file_to_draft_container(dataset_id, file_id, native_path, user_context=user_context) From 4153952594a6889e4232e3cd3141817d46255797 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 18 Dec 2024 11:52:57 +0100 Subject: [PATCH 62/64] feat: improve search to only search for title --- lib/galaxy/files/sources/dataverse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index 85b7788fd210..e13e261f68e8 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -263,7 +263,7 @@ def get_file_containers( params["fq"] = "publicationStatus:Draft" params["per_page"] = limit or DEFAULT_PAGE_LIMIT params["start"] = offset - params["q"] = query or "*" + params["q"] = "title:"+query if query else "*" params["sort"] = sort_by or "date" # can be either "name" or "date" response_data = self._get_response(user_context, request_url, params=params) total_hits = response_data["data"]["total_count"] From be3c77916de526074a2ccf4e1f4ca7052911b182 Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 18 Dec 2024 12:26:38 +0100 Subject: [PATCH 63/64] chore: simplify get_file_containers --- lib/galaxy/files/sources/dataverse.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index e13e261f68e8..d7453277cf5a 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -254,18 +254,16 @@ def get_file_containers( ) -> Tuple[List[RemoteDirectory], int]: """Lists the Dataverse datasets in the repository.""" request_url = self.search_url - params: Dict[str, Any] = {} - 
params["type"] = "dataset" + params = { + "type": "dataset", + "per_page": limit or DEFAULT_PAGE_LIMIT, + "start": offset, + "q": f"title:{query}" if query else "*", + "sort": sort_by or "date", + } if writeable: - # Only draft datasets can be written to. - # This is not tested and assumes that drafts are never public, - # i.e. we automatically only get the drafts from our user params["fq"] = "publicationStatus:Draft" - params["per_page"] = limit or DEFAULT_PAGE_LIMIT - params["start"] = offset - params["q"] = "title:"+query if query else "*" - params["sort"] = sort_by or "date" # can be either "name" or "date" - response_data = self._get_response(user_context, request_url, params=params) + response_data = self._get_response(user_context, request_url, params) total_hits = response_data["data"]["total_count"] return self._get_datasets_from_response(response_data["data"]), total_hits From 60e20555591f15a264a790e62ba8ea0f324bd78f Mon Sep 17 00:00:00 2001 From: Kai Koenig Date: Wed, 18 Dec 2024 14:53:48 +0100 Subject: [PATCH 64/64] feat: filter files of dataset with search query --- lib/galaxy/files/sources/_rdm.py | 2 +- lib/galaxy/files/sources/dataverse.py | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/lib/galaxy/files/sources/_rdm.py b/lib/galaxy/files/sources/_rdm.py index 99681bf76e7f..025b24fc684f 100644 --- a/lib/galaxy/files/sources/_rdm.py +++ b/lib/galaxy/files/sources/_rdm.py @@ -82,7 +82,7 @@ def get_file_containers( raise NotImplementedError() def get_files_in_container( - self, container_id: str, writeable: bool, user_context: OptionalUserContext = None + self, container_id: str, writeable: bool, user_context: OptionalUserContext = None, query: Optional[str] = None, ) -> List[RemoteFile]: """Returns the list of files of a file container. 
diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py index d7453277cf5a..c0c1d286e9c7 100644 --- a/lib/galaxy/files/sources/dataverse.py +++ b/lib/galaxy/files/sources/dataverse.py @@ -147,7 +147,7 @@ def _list( ) return cast(List[AnyRemoteEntry], datasets), total_hits dataset_id = self.get_container_id_from_path(path) - files = self.repository.get_files_in_container(dataset_id, writeable, user_context) + files = self.repository.get_files_in_container(dataset_id, writeable, user_context, query) return cast(List[AnyRemoteEntry], files), len(files) def _create_entry( @@ -268,13 +268,23 @@ def get_file_containers( return self._get_datasets_from_response(response_data["data"]), total_hits def get_files_in_container( - self, dataset_id: str, writeable: bool, user_context: OptionalUserContext = None + self, + dataset_id: str, + writeable: bool, + user_context: OptionalUserContext = None, + query: Optional[str] = None, ) -> List[RemoteFile]: """This method lists the files in a dataverse dataset.""" request_url = self.files_of_dataset_url(dataset_id=dataset_id) response_data = self._get_response(user_context, request_url) - total_hits = response_data["totalCount"] - return self._get_files_from_response(dataset_id, response_data["data"]) + files = self._get_files_from_response(dataset_id, response_data["data"]) + files = self._filter_files_by_name(files, query) + return files + + def _filter_files_by_name(self, files: List[RemoteFile], query: Optional[str] = None) -> List[RemoteFile]: + if not query: + return files + return [file for file in files if query in file["name"]] def create_draft_file_container( self, title: str, public_name: str, user_context: OptionalUserContext = None