diff --git a/datalad_next/iter_collections/annexworktree.py b/datalad_next/iter_collections/annexworktree.py
new file mode 100644
index 00000000..150a3b33
--- /dev/null
+++ b/datalad_next/iter_collections/annexworktree.py
@@ -0,0 +1,123 @@
+"""Report on the content of a Git-annex repository worktree
+"""
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from os import linesep
+from pathlib import (
+    Path,
+    PurePath,
+    PurePosixPath,
+)
+from typing import (
+    Any,
+    Generator,
+)
+
+from more_itertools import intersperse
+
+from .gitworktree import (
+    GitWorktreeItem,
+    GitWorktreeFileSystemItem,
+    iter_gitworktree
+)
+from datalad_next.itertools import (
+    itemize,
+    join_with_list,
+    load_json,
+    route_in,
+    route_out,
+    dont_process,
+)
+from datalad_next.runners import iter_subproc
+
+
+lgr = logging.getLogger('datalad.ext.next.iter_collections.annexworktree')
+
+
+linesep_bytes = linesep.encode()
+
+
+@dataclass
+class AnnexWorktreeItem(GitWorktreeItem):
+    annexkey: str | None = None
+    annexsize: int | None = None
+    annexobjpath: PurePath | None = None
+
+
+@dataclass
+class AnnexWorktreeFileSystemItem(GitWorktreeFileSystemItem):
+    annexkey: str | None = None
+    annexsize: int | None = None
+
+
+def iter_annexworktree(
+        path: Path,
+        *,
+        untracked: str | None = 'all',
+        link_target: bool = False,
+        fp: bool = False,
+) -> Generator[AnnexWorktreeItem | AnnexWorktreeFileSystemItem, None, None]:
+
+    glsf = iter_gitworktree(
+        path,
+        untracked=untracked,
+        link_target=link_target,
+        fp=fp
+    )
+
+    git_fileinfo_store: list[Any] = list()
+    key_store: list[Any] = list()
+
+    with \
+            iter_subproc(
+                # we get the annex key for any filename (or empty if not annexed)
+                ['git', '-C', str(path), 'annex', 'find', '--anything', '--format=\${key}\n', '--batch'],
+                # intersperse items with newlines to trigger a batch run
+                # this avoids string operations to append newlines to items
+                input=intersperse(
+                    b'\n',
+                    # store all output of the git ls-files run in the gitfileinfo
+                    # store
+                    route_out(
+                        glsf,
+                        git_fileinfo_store,
+                        lambda data: (str(data.name).encode(), [data])
+                    )
+                ),
+            ) as gaf, \
+            iter_subproc(
+                # get the key properties JSON-lines style
+                ['git', '-C', str(path), 'annex', 'examinekey', '--json', '--batch'],
+                # process all non-empty keys and store them in the key store,
+                # skip processing of empty keys and store an ignored value in
+                # the key store
+                input=route_out(
+                    itemize(gaf, sep=linesep_bytes, keep_ends=True),
+                    key_store,
+                    lambda data: (dont_process, [None])
+                    if data == linesep_bytes
+                    else (data, [data])
+                )
+            ) as gek:
+
+        for item in route_in(
+            route_in(
+                load_json(itemize(gek, sep=linesep_bytes)),
+                key_store,
+                join_with_list,
+            ),
+            git_fileinfo_store,
+            join_with_list,
+        ):
+            yield AnnexWorktreeItem(
+                name=item[2].name,
+                gitsha=item[2].gitsha,
+                gittype=item[2].gittype,
+                annexkey=item[1].decode().strip() if item[1] else None,
+                annexsize=int(item[0]['bytesize']) if item[0] else None,
+                annexobjpath=PurePath(PurePosixPath(str(item[0]['objectpath'])))
+                if item[0]
+                else None,
+            )
diff --git a/datalad_next/iter_collections/tests/test_iterannexworktree.py b/datalad_next/iter_collections/tests/test_iterannexworktree.py
new file mode 100644
index 00000000..63901618
--- /dev/null
+++ b/datalad_next/iter_collections/tests/test_iterannexworktree.py
@@ -0,0 +1,84 @@
+from pathlib import (
+    PurePath,
+)
+
+from datalad import cfg as dlcfg
+
+from datalad_next.datasets import Dataset
+
+from ..annexworktree import (
+    iter_annexworktree,
+)
+
+
+def _mkds(tmp_path_factory, monkeypatch, cfg_overrides):
+    with monkeypatch.context() as m:
+        for k, v in cfg_overrides.items():
+            m.setitem(dlcfg.overrides, k, v)
+        dlcfg.reload()
+        ds = Dataset(tmp_path_factory.mktemp('ds')).create(
+            result_renderer='disabled')
+    dlcfg.reload()
+    return ds
+
+
+def _dotests(ds):
+    test_file_content = 'test_file'
+    test_file = ds.pathobj / 'annexed' / 'subdir' / 'file1.txt'
+    test_file.parent.mkdir(parents=True)
+    test_file.write_text(test_file_content)
+    # we create an additional file where the content will be dropped
+    # to test behavior on unavailable annex key
+    droptest_content = 'somethingdropped'
+    droptest_file = ds.pathobj / 'annexed' / 'dropped.txt'
+    droptest_file.write_text(droptest_content)
+    ds.save(result_renderer='disabled')
+    ds.drop(droptest_file, reckless='availability',
+            result_renderer='disabled')
+
+    # get results for the annexed files
+    query_path = ds.pathobj / 'annexed'
+    res = list(iter_annexworktree(
+        query_path, untracked=None, link_target=True,
+    ))
+    assert len(res) == 2
+    #
+    # pick the present annex file to start
+    r = [r for r in res if r.name.name == 'file1.txt'][0]
+    assert r.name == PurePath('subdir', 'file1.txt')
+    # we cannot check gitsha and symlink content for identity, it will change
+    # depending on the tuning
+    # we cannot check the item type, because it will vary across repository
+    # modes (e.g., adjusted unlocked)
+    assert r.annexsize == len(test_file_content)
+    assert r.annexkey == 'MD5E-s9--37b87ee8c563af911dcc0f949826b1c9.txt'
+    # with `link_target=True` we get an objpath that is relative to the
+    # query path, and we find the actual key file there
+    assert (query_path / r.annexobjpath).read_text() == test_file_content
+    #
+    # now pick the dropped annex file
+    r = [r for r in res if r.name.name == 'dropped.txt'][0]
+    assert r.name == PurePath('dropped.txt')
+    # we get basic info regardless of availability
+    assert r.annexsize == len(droptest_content)
+    assert r.annexkey == 'MD5E-s16--770a06889bc88f8743d1ed9a1977ff7b.txt'
+    # even with an absent key file, we get its would-be location,
+    # and it is relative to the query path
+    assert r.annexobjpath.parts[:2] == ('..', '.git')
+
+
+def test_iter_annexworktree(tmp_path_factory, monkeypatch):
+    ds = _mkds(tmp_path_factory, monkeypatch, {})
+    _dotests(ds)
+
+
+def test_iter_annexworktree_tuned(tmp_path_factory, monkeypatch):
+    # same as test_iter_annexworktree(), but with a "tuned" annex that
+    # no longer matches the traditional setup.
+    # we need to be able to cope with that too
+    ds = _mkds(tmp_path_factory, monkeypatch, {
+        'annex.tune.objecthash1': 'true',
+        'annex.tune.branchhash1': 'true',
+        'annex.tune.objecthashlower': 'true',
+    })
+    _dotests(ds)