add iter_annexworktree collection iterator

Co-authored-by: Michael Hanke <[email protected]>
datalad · Nov 28, 2023 · b6bee4d · b6bee4d
1 parent 43dde05
commit b6bee4d
Show file tree

Hide file tree

Showing 2 changed files with 207 additions and 0 deletions.
diff --git a/datalad_next/iter_collections/annexworktree.py b/datalad_next/iter_collections/annexworktree.py
@@ -0,0 +1,123 @@
+"""Report on the content of a Git-annex repository worktree
+"""
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from os import linesep
+from pathlib import (
+    Path,
+    PurePath,
+    PurePosixPath,
+)
+from typing import (
+    Any,
+    Generator,
+)
+
+from more_itertools import intersperse
+
+from .gitworktree import (
+    GitWorktreeItem,
+    GitWorktreeFileSystemItem,
+    iter_gitworktree
+)
+from datalad_next.itertools import (
+    itemize,
+    join_with_list,
+    load_json,
+    route_in,
+    route_out,
+    dont_process,
+)
+from datalad_next.runners import iter_subproc
+
+
+lgr = logging.getLogger('datalad.ext.next.iter_collections.annexworktree')
+
+
+# platform line separator (``os.linesep``) as bytes; used below to split
+# the output of the git-annex subprocesses into lines, and to recognize
+# output lines that contain nothing but the separator
+linesep_bytes = linesep.encode()
+
+
+@dataclass
+class AnnexWorktreeItem(GitWorktreeItem):
+    """``GitWorktreeItem`` extended with git-annex key properties
+
+    All additional attributes default to ``None``.
+    """
+    # annex key; ``iter_annexworktree()`` fills this from the output of
+    # ``git annex find``
+    annexkey: str | None = None
+    # ``iter_annexworktree()`` fills this from the ``bytesize`` property
+    # reported by ``git annex examinekey``
+    annexsize: int | None = None
+    # ``iter_annexworktree()`` fills this from the ``objectpath`` property
+    # reported by ``git annex examinekey``
+    annexobjpath: PurePath | None = None
+
+
+@dataclass
+class AnnexWorktreeFileSystemItem(GitWorktreeFileSystemItem):
+    """``GitWorktreeFileSystemItem`` extended with git-annex key properties
+
+    All additional attributes default to ``None``.
+    """
+    # NOTE(review): presumably the same semantics as the equally named
+    # attributes of ``AnnexWorktreeItem`` -- confirm
+    annexkey: str | None = None
+    annexsize: int | None = None
+
+
+def iter_annexworktree(
+        path: Path,
+        *,
+        untracked: str | None = 'all',
+        link_target: bool = False,
+        fp: bool = False,
+) -> Generator[AnnexWorktreeItem | AnnexWorktreeFileSystemItem, None, None]:
+    """Yield worktree items annotated with git-annex key properties
+
+    Items are obtained from ``iter_gitworktree()``. The name of each item
+    is fed to ``git annex find --batch`` in order to look up its annex key.
+    Each non-empty key is fed to ``git annex examinekey --batch`` in order
+    to obtain key properties. Both subprocesses run concurrently, and are
+    connected to each other and to ``iter_gitworktree()`` via iterators.
+
+    Parameters
+    ----------
+    path: Path
+      Passed to ``iter_gitworktree()``, and used as the working directory
+      (``git -C``) of both git-annex subprocesses.
+    untracked: str or None, optional
+      Passed to ``iter_gitworktree()`` unchanged.
+    link_target: bool, optional
+      Passed to ``iter_gitworktree()`` unchanged.
+    fp: bool, optional
+      Passed to ``iter_gitworktree()`` unchanged.
+
+    Yields
+    ------
+    AnnexWorktreeItem
+      ``name``, ``gitsha``, and ``gittype`` are copied from the item
+      reported by ``iter_gitworktree()``. ``annexkey`` is ``None`` when
+      no key was stored for an item; ``annexsize`` and ``annexobjpath``
+      are ``None`` when there is no (truthy) ``examinekey`` record for it.
+      No other attributes of the ``iter_gitworktree()`` item are copied.
+    """
+    glsf = iter_gitworktree(
+        path,
+        untracked=untracked,
+        link_target=link_target,
+        fp=fp
+    )
+
+    # side-channel stores: whatever is "routed out" of the processing
+    # chain below is parked here, and is joined with the processing
+    # results again by the matching `route_in()` calls
+    git_fileinfo_store: list[Any] = list()
+    key_store: list[Any] = list()
+
+    with \
+            iter_subproc(
+                # we get the annex key for any filename (or empty if not annexed)
+                [
+                    'git', '-C', str(path),
+                    'annex', 'find', '--anything',
+                    # the backslash is passed to git-annex as-is. It is
+                    # spelled `\\`, because `\$` is not a valid escape
+                    # sequence in a Python string literal
+                    '--format=\\${key}\n',
+                    '--batch',
+                ],
+                # intersperse items with newlines to trigger a batch run
+                # this avoids string operations to append newlines to items
+                input=intersperse(
+                    b'\n',
+                    # store all output of `iter_gitworktree()` in the
+                    # gitfileinfo store, and only pass the encoded item
+                    # name on to the subprocess
+                    route_out(
+                        glsf,
+                        git_fileinfo_store,
+                        lambda data: (str(data.name).encode(), [data])
+                    )
+                ),
+            ) as gaf, \
+            iter_subproc(
+                # get the key properties JSON-lines style
+                ['git', '-C', str(path), 'annex', 'examinekey', '--json', '--batch'],
+                # process all non-empty keys and store them in the key store,
+                # skip processing of empty keys and store an ignored value in
+                # the key store
+                input=route_out(
+                    # line endings are kept, because each key must be
+                    # newline-terminated when sent to the batch process
+                    itemize(gaf, sep=linesep_bytes, keep_ends=True),
+                    key_store,
+                    lambda data: (dont_process, [None])
+                                 if data == linesep_bytes
+                                 else (data, [data])
+                )
+            ) as gek:
+
+        # re-join the JSON records with the stored keys, and then with the
+        # stored git file info. Judging from the indexing below, each
+        # resulting item holds the JSON record at index 0, the key at
+        # index 1, and the `iter_gitworktree()` item at index 2
+        for item in route_in(
+                route_in(
+                    load_json(itemize(gek, sep=linesep_bytes)),
+                    key_store,
+                    join_with_list,
+                ),
+                git_fileinfo_store,
+                join_with_list,
+        ):
+            yield AnnexWorktreeItem(
+                name=item[2].name,
+                gitsha=item[2].gitsha,
+                gittype=item[2].gittype,
+                # the stored key still carries its line ending
+                annexkey=item[1].decode().strip() if item[1] else None,
+                annexsize=int(item[0]['bytesize']) if item[0] else None,
+                # the reported path is interpreted as a POSIX path, and
+                # converted to a platform path
+                annexobjpath=PurePath(PurePosixPath(str(item[0]['objectpath'])))
+                             if item[0]
+                             else None,
+            )
diff --git a/datalad_next/iter_collections/tests/test_iterannexworktree.py b/datalad_next/iter_collections/tests/test_iterannexworktree.py
@@ -0,0 +1,84 @@
+from pathlib import (
+    PurePath,
+)
+
+from datalad import cfg as dlcfg
+
+from datalad_next.datasets import Dataset
+
+from ..annexworktree import (
+    iter_annexworktree,
+)
+
+
+def _mkds(tmp_path_factory, monkeypatch, cfg_overrides):
+    """Create a dataset in a new temporary directory
+
+    The key/value pairs in ``cfg_overrides`` are set in ``dlcfg.overrides``
+    while the dataset is being created. The created dataset is returned.
+    """
+    with monkeypatch.context() as patcher:
+        for name, value in cfg_overrides.items():
+            patcher.setitem(dlcfg.overrides, name, value)
+        # make the configuration manager pick up the overrides
+        dlcfg.reload()
+        location = tmp_path_factory.mktemp('ds')
+        created = Dataset(location).create(result_renderer='disabled')
+    # the patched overrides are undone when the context is left,
+    # reload once more afterwards
+    dlcfg.reload()
+    return created
+
+
+def _dotests(ds):
+    """Populate ``ds`` with two annexed files and check the reported items
+
+    Two files are saved underneath ``annexed/``, and the content of one of
+    them is dropped afterwards. ``iter_annexworktree()`` is then run on the
+    ``annexed/`` directory, and the key, size, and object path reported for
+    both files are verified.
+    """
+    test_file_content = 'test_file'
+    test_file = ds.pathobj / 'annexed' / 'subdir' / 'file1.txt'
+    test_file.parent.mkdir(parents=True)
+    test_file.write_text(test_file_content)
+    # we create an additional file where the content will be dropped
+    # to test behavior on unavailable annex key
+    droptest_content = 'somethingdropped'
+    droptest_file = ds.pathobj / 'annexed' / 'dropped.txt'
+    droptest_file.write_text(droptest_content)
+    ds.save(result_renderer='disabled')
+    ds.drop(droptest_file, reckless='availability',
+            result_renderer='disabled')
+
+    # get results for the annexed files
+    query_path = ds.pathobj / 'annexed'
+    res = list(iter_annexworktree(
+        query_path, untracked=None, link_target=True,
+    ))
+    assert len(res) == 2
+    #
+    # pick the present annex file to start
+    r = [r for r in res if r.name.name == 'file1.txt'][0]
+    assert r.name == PurePath('subdir', 'file1.txt')
+    # we cannot check gitsha and symlink content for identity, it will change
+    # depending on the tuning
+    # we cannot check the item type, because it will vary across repository
+    # modes (e.g., adjusted unlocked)
+    assert r.annexsize == len(test_file_content)
+    assert r.annexkey == 'MD5E-s9--37b87ee8c563af911dcc0f949826b1c9.txt'
+    # with `link_target=True` we get an objpath that is relative to the
+    # query path, and we find the actual key file there
+    assert (query_path / r.annexobjpath).read_text() == test_file_content
+    #
+    # now pick the dropped annex file
+    r = [r for r in res if r.name.name == 'dropped.txt'][0]
+    assert r.name == PurePath('dropped.txt')
+    # we get basic info regardless of availability
+    assert r.annexsize == len(droptest_content)
+    assert r.annexkey == 'MD5E-s16--770a06889bc88f8743d1ed9a1977ff7b.txt'
+    # even with an absent key file, we get its would-be location,
+    # and it is relative to the query path
+    assert r.annexobjpath.parts[:2] == ('..', '.git')
+
+
+def test_iter_annexworktree(tmp_path_factory, monkeypatch):
+    """Run the common checks on a dataset created without config overrides"""
+    _dotests(_mkds(tmp_path_factory, monkeypatch, {}))
+
+
+def test_iter_annexworktree_tuned(tmp_path_factory, monkeypatch):
+    """Run the common checks on a dataset created with annex tuning enabled"""
+    # same as test_iter_annexworktree(), but with a "tuned" annex that
+    # no longer matches the traditional setup.
+    # we need to be able to cope with that too
+    tuning = dict.fromkeys(
+        (
+            'annex.tune.objecthash1',
+            'annex.tune.branchhash1',
+            'annex.tune.objecthashlower',
+        ),
+        'true',
+    )
+    _dotests(_mkds(tmp_path_factory, monkeypatch, tuning))