diff --git a/changelog.d/20230307_164111_austin_add_metalad_extractor.md b/changelog.d/20230307_164111_austin_add_metalad_extractor.md new file mode 100644 index 00000000..cc475e58 --- /dev/null +++ b/changelog.d/20230307_164111_austin_add_metalad_extractor.md @@ -0,0 +1,5 @@ +### 🚀 Enhancements and New Features + +- Add metalad extractor using `singularity inspect`. + Fixes https://github.com/datalad/datalad-container/issues/198 via + https://github.com/datalad/datalad-container/pull/200 (by @asmacdo ) diff --git a/datalad_container/extractors/__init__.py b/datalad_container/extractors/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/datalad_container/extractors/metalad_container.py b/datalad_container/extractors/metalad_container.py new file mode 100644 index 00000000..99e8fa27 --- /dev/null +++ b/datalad_container/extractors/metalad_container.py @@ -0,0 +1,86 @@ +# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- +# ex: set sts=4 ts=4 sw=4 noet: +# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## +# +# See COPYING file distributed along with the datalad package for the +# copyright and license terms. 
class MetaladContainer(FileMetadataExtractor):
    """File-level metalad extractor for Singularity/Apptainer images.

    The extracted record combines basic file information taken from
    ``self.file_info`` with the version string of the installed container
    runtime and the parsed output of ``singularity inspect --json``.
    """

    def get_data_output_category(self) -> DataOutputCategory:
        """Return ``DataOutputCategory.IMMEDIATE``.

        ``extract`` hands its record back through ``immediate_data``.
        """
        return DataOutputCategory.IMMEDIATE

    def is_content_required(self) -> bool:
        """Return ``True``: ``extract`` runs ``singularity inspect`` on the file."""
        return True

    def get_id(self) -> UUID:
        """Return the fixed UUID identifying this extractor."""
        # Nothing special, made this up - asmacdo
        return UUID('3a28cca6-b7a1-11ed-b106-fc3497650c92')

    def get_version(self) -> str:
        """Return the extractor version (module-level ``CURRENT_VERSION``)."""
        return CURRENT_VERSION

    def extract(self, _=None) -> ExtractorResult:
        """Build the metadata record for the file described by ``self.file_info``.

        Parameters
        ----------
        _ :
            Ignored; accepted only for signature compatibility.

        Returns
        -------
        ExtractorResult
            Result whose ``immediate_data`` carries the file id, type, path,
            size, a timestamped comment, the runtime version string and the
            parsed ``singularity inspect`` output.

        Raises
        ------
        subprocess.CalledProcessError
            If the ``singularity`` commands exit with a non-zero status.
        OSError
            If the ``singularity`` executable cannot be started.
        """
        file_info = self.file_info
        return ExtractorResult(
            extractor_version=self.get_version(),
            extraction_parameter=self.parameter or {},
            extraction_success=True,
            datalad_result_dict={
                "type": "container",
                "status": "ok"
            },
            immediate_data={
                "@id": get_file_id(dict(
                    path=file_info.path,
                    type=file_info.type)),
                "type": file_info.type,
                "path": file_info.intra_dataset_path,
                "content_byte_size": file_info.byte_size,
                "comment": f"Container metadata extractor executed at {time.time()}",
                "singularity_version": self._singularity_version(),
                "singularity_inspect": self._singularity_inspect(file_info.path),
            })

    @staticmethod
    def _run_stdout(cmd) -> str:
        """Run ``cmd`` (an argv list) and return its decoded standard output.

        Raises ``subprocess.CalledProcessError`` on a non-zero exit status and
        ``OSError`` (e.g. ``FileNotFoundError``) if the program cannot be run.
        """
        return subprocess.run(
            cmd,
            check=True,
            stdout=subprocess.PIPE).stdout.decode()

    def _singularity_inspect(self, path) -> dict:
        """Return the parsed JSON output of ``singularity inspect --json``.

        ``path`` is converted with ``str()`` so that path-like objects are
        accepted in the argv list as well.
        """
        return json.loads(
            self._run_stdout(["singularity", "inspect", "--json", str(path)]))

    def _singularity_version(self) -> str:
        """Return the version string reported by the container runtime.

        ``apptainer --version`` is tried first; if it cannot be executed or
        exits with an error, ``singularity version`` is used instead. A
        failure of the fallback command propagates to the caller.
        """
        try:
            # If this works, its "apptainer version 1.1.5-1.fc37"
            return self._run_stdout(["apptainer", "--version"]).strip()
        except (OSError, subprocess.SubprocessError):
            # Only "command missing / command failed" triggers the fallback;
            # unrelated errors are no longer masked.
            # If this is not apptainer, its "1.1.5-1.fc37"
            return self._run_stdout(["singularity", "version"]).strip()
code-block:: + + { + "type": "file", + "dataset_id": "b02e63c2-62c1-11e9-82b0-52540040489c", + "dataset_version": "9ed0a39406e518f0309bb665a99b64dec719fb08", + "path": "images/bids/bids-pymvpa--1.0.2.sing", + "extractor_name": "metalad_container", + "extractor_version": "0.0.1", + "extraction_parameter": {}, + "extraction_time": 1678225970.5466852, + "agent_name": "Austin Macdonald", + "agent_email": "austin@dartmouth.edu", + "extracted_metadata": { + "@id": "datalad:SHA1-s993116191--cc7ac6e6a31e9ac131035a88f699dfcca785b844", + "type": "file", + "path": "images/bids/bids-pymvpa--1.0.2.sing", + "content_byte_size": 0, + "comment": "Container metadata extractor executed at 1678225970.4338098", + "singularity_version": "apptainer version 1.1.5-1.fc37", + "singularity_inspect": { + "data": { + "attributes": { + "labels": { + "org.label-schema.build-date": "Thu,_19_Dec_2019_14:58:41_+0000", + "org.label-schema.build-size": "2442MB", + "org.label-schema.schema-version": "1.0", + "org.label-schema.usage.singularity.deffile": "Singularity.bids-pymvpa--1.0.2", + "org.label-schema.usage.singularity.deffile.bootstrap": "docker", + "org.label-schema.usage.singularity.deffile.from": "bids/pymvpa:v1.0.2", + "org.label-schema.usage.singularity.version": "2.5.2-feature-squashbuild-secbuild-2.5.6e68f9725" + } + } + }, + "type": "container" + } + } + } + +.. _datalad-metalad: http://docs.datalad.org/projects/metalad/en/latest/ diff --git a/setup.cfg b/setup.cfg index 82f347ef..58993b7c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,6 +41,9 @@ datalad.extensions = # valid datalad interface specification (see demo in this extensions) container = datalad_container:command_suite +datalad.metadata.extractors = + metalad_container = datalad_container.extractors.metalad_container:MetaladContainer + [versioneer] # See the docstring in versioneer.py for instructions. Note that you must # re-run 'versioneer.py setup' after changing this section, and commit the