Skip to content

Commit

Permalink
Merge branch 'maint_0.4'
Browse files Browse the repository at this point in the history
  • Loading branch information
christian-monch committed Jan 17, 2024
2 parents 6f870b7 + ad855b8 commit 528d0f4
Show file tree
Hide file tree
Showing 18 changed files with 1,113 additions and 490 deletions.
3 changes: 3 additions & 0 deletions datalad_metalad/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,6 @@ def get_agent_id(name, email):
# Package version, as reported by the versioneer-generated `_version` module.
# The merge left both the old snippet (`from ._version import get_versions`
# ... `del get_versions`) and the new one behind; they compute the same value,
# so only the current form is kept.
from . import _version
__version__ = _version.get_versions()['version']
333 changes: 248 additions & 85 deletions datalad_metalad/_version.py

Large diffs are not rendered by default.

38 changes: 21 additions & 17 deletions datalad_metalad/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"""
import json
import logging
import sys
import time
from os import curdir
from pathlib import (
Expand Down Expand Up @@ -489,51 +490,54 @@ def get_extractor_class(extractor_name: str) -> Union[
Type[FileMetadataExtractor]]:

""" Get an extractor from its name """
from pkg_resources import iter_entry_points
if sys.version_info >= (3, 10):
    # The standard library provides the selectable entry-point API.
    from importlib.metadata import entry_points
else:
    # Older interpreters rely on the `importlib_metadata` backport.
    from importlib_metadata import entry_points

# The extractor class names of the old datalad-contained extractors have
# been changed, when the extractors were moved to datalad_metalad.
# Therefore, we have to use the extractors in
# `datalad_metalad.extractors.legacy` instead of any old extractor code
# from datalad core.
entry_points = [
entry_point_list = [
entry_point
for entry_point in iter_entry_points(
"datalad.metadata.extractors",
extractor_name
for entry_point in entry_points(
group="datalad.metadata.extractors",
name=extractor_name,
)
if entry_point.dist.project_name != "datalad"
if entry_point.dist.name != "datalad"
]

if not entry_points:
entry_points = [
if not entry_point_list:
entry_point_list = [
entry_point
for entry_point in iter_entry_points(
"datalad.metadata.extractors",
extractor_name
for entry_point in entry_points(
group="datalad.metadata.extractors",
name=extractor_name,
)
if entry_point.dist.project_name == "datalad"
if entry_point.dist.name == "datalad"
]

if not entry_points:
if not entry_point_list:
raise ExtractorNotFoundError(
"Requested metadata extractor '{}' not available".format(
extractor_name))

entry_point, ignored_entry_points = entry_points[-1], entry_points[:-1]
entry_point, ignored_entry_points = entry_point_list[-1], entry_point_list[:-1]
lgr.debug(
"Using metadata extractor %s from distribution %s",
extractor_name,
entry_point.dist.project_name)
entry_point.dist.name)

# Inform about overridden entry points
for ignored_entry_point in ignored_entry_points:
lgr.warning(
"MetadataRecord extractor %s from distribution %s overrides "
"metadata extractor from distribution %s",
extractor_name,
entry_point.dist.project_name,
ignored_entry_point.dist.project_name)
entry_point.dist.name,
ignored_entry_point.dist.name)

return entry_point.load()

Expand Down
3 changes: 1 addition & 2 deletions datalad_metalad/extractors/annex.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

from .base import MetadataExtractor

from six import text_type
import logging
lgr = logging.getLogger('datalad.metadata.extractors.metalad_annex')
from datalad.log import log_progress
Expand Down Expand Up @@ -50,7 +49,7 @@ def __call__(self, dataset, refcommit, process_type, status):
# limit query to paths that are annexed
query_paths = [
# go relative to minimize cmdline footprint of annex call
text_type(Path(s['path']).relative_to(ds.pathobj))
str(Path(s['path']).relative_to(ds.pathobj))
for s in status
# anything that looks like an annexed file
if s.get('type', None) == 'file' \
Expand Down
8 changes: 2 additions & 6 deletions datalad_metalad/extractors/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,6 @@
from datalad.utils import (
Path,
)
from six import (
iteritems,
string_types,
)

import logging
lgr = logging.getLogger('datalad.metadata.extractors.metalad_core')
Expand Down Expand Up @@ -319,7 +315,7 @@ def _get_urls_from_whereis(wi, prefixes=('http', 'dl+archive:')):
from "whereis" output"""
return [
url
for remote, rprops in iteritems(wi.get('remotes', {}) if 'status' in wi else wi)
for remote, rprops in (wi.get('remotes', {}) if 'status' in wi else wi).items()
for url in rprops.get('urls', [])
if any(url.startswith(pref) for pref in prefixes)
]
Expand Down Expand Up @@ -433,7 +429,7 @@ def whereis_file_(self, paths):
with keys: 'description', 'here', 'urls' (list) that contain
the values of the respective 'git annex whereis' response.
"""
if isinstance(paths, string_types):
if isinstance(paths, str):
raise ValueError('whereis_file(paths): paths must be '
'iterable, not a string type')

Expand Down
11 changes: 5 additions & 6 deletions datalad_metalad/extractors/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
from .base import MetadataExtractor

import os.path as op
from six import text_type
import logging
lgr = logging.getLogger('datalad.metadata.extractors.custom')

Expand Down Expand Up @@ -49,7 +48,7 @@ def get_required_content(self, dataset, process_type, status):
if process_type in ('all', 'dataset'):
srcfiles, _ = _get_dsmeta_srcfiles(dataset)
for f in srcfiles:
f = text_type(dataset.pathobj / f)
f = str(dataset.pathobj / f)
if op.lexists(f):
yield dict(path=f)

Expand Down Expand Up @@ -78,7 +77,7 @@ def __call__(self, dataset, refcommit, process_type, status):
meta_fpath = _get_fmeta_objpath(ds, mfile_expr, rec)
if meta_fpath is not None and op.exists(meta_fpath):
try:
meta = jsonload(text_type(meta_fpath))
meta = jsonload(str(meta_fpath))
if isinstance(meta, dict) and meta \
and '@id' not in meta:
# in case we have a single, top-level
Expand Down Expand Up @@ -136,7 +135,7 @@ def _get_dsmeta_srcfiles(ds):
# OK to be always POSIX
srcfiles = ['.metadata/dataset.json'] \
if not cfg_srcfiles and op.lexists(
text_type(ds.pathobj / '.metadata' / 'dataset.json')) \
str(ds.pathobj / '.metadata' / 'dataset.json')) \
else cfg_srcfiles
return srcfiles, cfg_srcfiles

Expand All @@ -154,7 +153,7 @@ def _get_fmeta_objpath(ds, expr, rec):
return
# build associated metadata file path from POSIX
# pieces and convert to platform conventions at the end
return text_type(
return str(
ds.pathobj / PurePosixPath(expr.format(
freldir=fpath.relative_to(
ds.pathobj).parent.as_posix(),
Expand Down Expand Up @@ -183,7 +182,7 @@ def _yield_dsmeta(ds):
# no further operation on half-broken metadata
return
lgr.debug('Load custom metadata from %s', abssrcfile)
meta = jsonload(text_type(abssrcfile))
meta = jsonload(str(abssrcfile))
dsmeta.update(meta)
if dsmeta:
if '@id' not in dsmeta:
Expand Down
5 changes: 1 addition & 4 deletions datalad_metalad/extractors/runprov.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,6 @@
get_file_id,
get_agent_id,
)
from six import (
text_type,
)
from datalad.support.json_py import (
loads as jsonloads,
load as jsonload,
Expand Down Expand Up @@ -203,7 +200,7 @@ def _finalize_record(r):
if not isinstance(rec, dict):
# this is a runinfo file name
rec = jsonload(
text_type(ds.pathobj / '.datalad' / 'runinfo' / rec),
str(ds.pathobj / '.datalad' / 'runinfo' / rec),
# TODO this should not be necessary, instead jsonload()
# should be left on auto, and `run` should save compressed
# files with an appropriate extension
Expand Down
6 changes: 2 additions & 4 deletions datalad_metalad/extractors/tests/test_annex.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Test annex metadata extractor"""

from six import text_type

from datalad.distribution.dataset import Dataset
from datalad.tests.utils_pytest import (
assert_equal,
Expand All @@ -29,7 +27,7 @@ def test_annex_contentmeta(path=None):
(ds.pathobj / 'ignored').write_text(u'nometa')
ds.save(result_renderer="disabled")
ds.repo.set_metadata(
text_type(mfile_path.relative_to(ds.pathobj)),
str(mfile_path.relative_to(ds.pathobj)),
init={'tag': 'mytag', 'fancy': 'this?'}
)
res = ds.meta_extract(extractorname='metalad_annex', path=str(mfile_path))
Expand All @@ -38,7 +36,7 @@ def test_annex_contentmeta(path=None):
assert_result_count(res, 1)
assert_result_count(
res, 1,
path=text_type(mfile_path),
path=str(mfile_path),
type='file',
status='ok',
action='meta_extract')
Expand Down
8 changes: 6 additions & 2 deletions datalad_metalad/extractors/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,13 @@
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Test all extractors at a basic level"""

from pkg_resources import iter_entry_points
import sys

import pytest
# `entry_points(group=...)` is called with a selection keyword below. The
# standard-library `importlib.metadata.entry_points()` accepts such keywords
# only from Python 3.10 on; on 3.9 it takes no arguments and would raise a
# TypeError. Use the `importlib_metadata` backport for everything older
# than 3.10.
if sys.version_info < (3, 10):
    from importlib_metadata import entry_points
else:
    from importlib.metadata import entry_points

from datalad.api import (
Dataset,
Expand Down Expand Up @@ -47,7 +51,7 @@ def test_api(path=None, *, annex):
assert_repo_status(ds.path)

processed_extractors, skipped_extractors = [], []
for extractor_ep in iter_entry_points("datalad.metadata.extractors"):
for extractor_ep in entry_points(group="datalad.metadata.extractors"):

# There are a number of extractors that do not
# work on empty datasets, or datasets, or without
Expand Down
5 changes: 2 additions & 3 deletions datalad_metalad/extractors/tests/test_custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
"""Test custom metadata extractor"""

import json
from six import text_type

from datalad.distribution.dataset import Dataset
from datalad.tests.utils_pytest import (
Expand Down Expand Up @@ -177,7 +176,7 @@ def test_custom_contentmeta(path=None):

assert_result_count(
res, 1,
path=text_type(ds.pathobj / 'sub' / 'one'),
path=str(ds.pathobj / 'sub' / 'one'),
type='file',
status='ok',
action='meta_extract')
Expand Down Expand Up @@ -212,7 +211,7 @@ def test_custom_content_broken(path=None):
assert_result_count(res, 1)
assert_result_count(
res, 1,
path=text_type(ds.pathobj / 'sub' / 'one'),
path=str(ds.pathobj / 'sub' / 'one'),
type='file',
# specific message does vary a lot across platforms
#message=
Expand Down
8 changes: 3 additions & 5 deletions datalad_metalad/extractors/tests/test_runprov.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Test runprov metadata extractor"""

from six import text_type

from datalad.distribution.dataset import Dataset
from datalad.tests.utils_pytest import (
assert_in,
Expand Down Expand Up @@ -48,7 +46,7 @@ def test_custom_dsmeta(path=None):
res = ds.meta_extract(extractorname='metalad_runprov', path='dummy0')
assert_result_count(res, 1)
eq_(res[0]["type"], "file")
eq_(res[0]["path"], text_type(ds.pathobj / 'dummy0'))
eq_(res[0]["path"], str(ds.pathobj / 'dummy0'))
for r in res:
# we have something from the extractor
md = r.get('metadata_record', {}).get('extracted_metadata', None)
Expand Down Expand Up @@ -80,7 +78,7 @@ def test_custom_dsmeta(path=None):
extractorname='metalad_runprov', path='dummy0')
assert_result_count(fileres, 1)
assert_result_count(
fileres, 1, type='file', path=text_type(ds.pathobj / 'dummy0'))
fileres, 1, type='file', path=str(ds.pathobj / 'dummy0'))

# smoke test to see if anything breaks with a record in a sidecar
# file
Expand All @@ -97,7 +95,7 @@ def test_custom_dsmeta(path=None):
assert_result_count(res, 1)
eq_(res[0]['type'], 'file')
assert_result_count(
res, 1, type='file', path=text_type(ds.pathobj / 'dummy_side'))
res, 1, type='file', path=str(ds.pathobj / 'dummy_side'))

# check that it survives a partial report (no _core metadata extracted)
# for JSON-LD reporting
Expand Down
14 changes: 9 additions & 5 deletions datalad_metalad/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import json
import logging
from pathlib import Path
import sys
from typing import (
Dict,
Iterable,
Expand Down Expand Up @@ -285,10 +286,13 @@ def run_filter(filter_name: str,

def get_filter_class(filter_name: str) -> Type[MetadataFilterBase]:
""" Get a filter class from its name"""
from pkg_resources import iter_entry_points
# The lookup below selects entry points via the `group=`/`name=` keywords
# and reads `entry_point.dist`. The standard-library `importlib.metadata`
# supports both only from Python 3.10 on (on 3.9 `entry_points()` takes no
# arguments), so fall back to the `importlib_metadata` backport for
# everything older than 3.10.
if sys.version_info < (3, 10):
    from importlib_metadata import entry_points
else:
    from importlib.metadata import entry_points

entry_points = list(
iter_entry_points("datalad.metadata.filters", filter_name))
entry_points(group="datalad.metadata.filters", name=filter_name))

if not entry_points:
raise ValueError(
Expand All @@ -299,16 +303,16 @@ def get_filter_class(filter_name: str) -> Type[MetadataFilterBase]:
lgr.debug(
"Using metadata filter %s from distribution %s",
filter_name,
entry_point.dist.project_name)
entry_point.dist.name)

# Inform about overridden entry points
for ignored_entry_point in ignored_entry_points:
lgr.warning(
"MetadataRecord filter %s from distribution %s overrides "
"metadata filter from distribution %s",
filter_name,
entry_point.dist.project_name,
ignored_entry_point.dist.project_name)
entry_point.dist.name,
ignored_entry_point.dist.name)

return entry_point.load()

Expand Down
3 changes: 1 addition & 2 deletions datalad_metalad/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from six import text_type
from datalad.api import (
Dataset,
save,
Expand All @@ -20,7 +19,7 @@ def make_ds_hierarchy_with_metadata(path):
ds.repo.set_metadata('file.dat', reset={'tag': ['one', 'two']})
subds = ds.create('sub')
# we need one real piece of content for metadata extraction
(subds.pathobj / 'real').write_text(text_type('real'))
(subds.pathobj / 'real').write_text('real')
ds.save(recursive=True, result_renderer="disabled")
return ds, subds

Expand Down
Loading

0 comments on commit 528d0f4

Please sign in to comment.