Skip to content

Commit

Permalink
add iter_annexworktree collection iterator
Browse files Browse the repository at this point in the history
Co-authored-by: Michael Hanke <[email protected]>
  • Loading branch information
christian-monch and mih committed Nov 28, 2023
1 parent 43dde05 commit b6bee4d
Show file tree
Hide file tree
Showing 2 changed files with 207 additions and 0 deletions.
123 changes: 123 additions & 0 deletions datalad_next/iter_collections/annexworktree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
"""Report on the content of a Git-annex repository worktree
"""
from __future__ import annotations

import logging
from dataclasses import dataclass
from os import linesep
from pathlib import (
Path,
PurePath,
PurePosixPath,
)
from typing import (
Any,
Generator,
)

from more_itertools import intersperse

from .gitworktree import (
GitWorktreeItem,
GitWorktreeFileSystemItem,
iter_gitworktree
)
from datalad_next.itertools import (
itemize,
join_with_list,
load_json,
route_in,
route_out,
dont_process,
)
from datalad_next.runners import iter_subproc


# module-level logger for this collection iterator
lgr = logging.getLogger('datalad.ext.next.iter_collections.annexworktree')


# the platform's line separator as bytes; `iter_annexworktree()` uses it to
# split the byte output of the git-annex subprocesses into items, and to
# recognize an empty (separator-only) key report
linesep_bytes = linesep.encode()


@dataclass
class AnnexWorktreeItem(GitWorktreeItem):
    """``GitWorktreeItem`` extended with git-annex key properties

    All additional fields default to ``None``; ``iter_annexworktree()``
    leaves them at ``None`` for items for which no annex key is reported.
    """
    # annex key, as reported by `git annex find`
    annexkey: str | None = None
    # `bytesize` property reported by `git annex examinekey` for the key
    annexsize: int | None = None
    # `objectpath` property reported by `git annex examinekey` for the key
    annexobjpath: PurePath | None = None


@dataclass
class AnnexWorktreeFileSystemItem(GitWorktreeFileSystemItem):
    """``GitWorktreeFileSystemItem`` extended with git-annex key properties

    Both additional fields default to ``None``.
    """
    # NOTE(review): unlike `AnnexWorktreeItem`, this class has no
    # `annexobjpath` field, and nothing in the visible module instantiates
    # it -- confirm whether that is intentional
    # annex key of the item
    annexkey: str | None = None
    # size associated with the annex key
    annexsize: int | None = None


def iter_annexworktree(
        path: Path,
        *,
        untracked: str | None = 'all',
        link_target: bool = False,
        fp: bool = False,
) -> Generator[AnnexWorktreeItem | AnnexWorktreeFileSystemItem, None, None]:
    """Yield worktree items annotated with git-annex key information

    The items reported by ``iter_gitworktree()`` are piped, by name, through
    ``git annex find --anything --batch`` to obtain an annex key for each
    of them. Every non-empty key is then piped through
    ``git annex examinekey --json --batch`` to obtain the key's properties.
    Both commands are executed with ``git -C <path>``.

    Parameters
    ----------
    path: Path
      Directory to report on. It is passed to ``iter_gitworktree()`` and
      used as the working directory (``-C``) of both git-annex commands.
    untracked: str or None, optional
      Passed on to ``iter_gitworktree()`` unchanged.
    link_target: bool, optional
      Passed on to ``iter_gitworktree()`` unchanged.
    fp: bool, optional
      Passed on to ``iter_gitworktree()`` unchanged.

    Yields
    ------
    AnnexWorktreeItem
      ``name``, ``gitsha``, and ``gittype`` are copied from the underlying
      worktree item. ``annexkey``, ``annexsize`` (from the ``bytesize``
      property) and ``annexobjpath`` (from the ``objectpath`` property) are
      ``None`` for items without a reported key.
      NOTE(review): the return annotation also names
      ``AnnexWorktreeFileSystemItem``, but this implementation only ever
      yields ``AnnexWorktreeItem``, regardless of ``fp``.
    """
    glsf = iter_gitworktree(
        path,
        untracked=untracked,
        link_target=link_target,
        fp=fp,
    )

    # side-channel stores: `route_out()` puts data here while the primary
    # data travels through a subprocess; `route_in()` joins it back in
    git_fileinfo_store: list[Any] = []
    key_store: list[Any] = []

    # we get the annex key for any filename (or empty if not annexed).
    # The backslash in front of `$` is spelled as `\\` to avoid Python's
    # invalid escape sequence `\$`; git-annex receives the identical text.
    find_cmd = [
        'git', '-C', str(path),
        'annex', 'find', '--anything', '--format=\\${key}\n', '--batch',
    ]
    # get the key properties JSON-lines style
    examinekey_cmd = [
        'git', '-C', str(path),
        'annex', 'examinekey', '--json', '--batch',
    ]

    with \
            iter_subproc(
                find_cmd,
                # intersperse items with newlines to trigger a batch run
                # this avoids string operations to append newlines to items
                input=intersperse(
                    b'\n',
                    # send only the encoded item name to the subprocess and
                    # store the full output of `iter_gitworktree()` in the
                    # git_fileinfo_store
                    route_out(
                        glsf,
                        git_fileinfo_store,
                        lambda data: (str(data.name).encode(), [data])
                    )
                ),
            ) as gaf, \
            iter_subproc(
                examinekey_cmd,
                # process all non-empty keys and store them in the key
                # store, skip processing of empty keys and store an ignored
                # value in the key store
                input=route_out(
                    itemize(gaf, sep=linesep_bytes, keep_ends=True),
                    key_store,
                    lambda data: (dont_process, [None])
                    if data == linesep_bytes
                    else (data, [data])
                )
            ) as gek:

        # join the stored values back in, innermost first: each resulting
        # item holds the key properties, the key, and the worktree item
        # at positions 0, 1, and 2
        for item in route_in(
                route_in(
                    load_json(itemize(gek, sep=linesep_bytes)),
                    key_store,
                    join_with_list,
                ),
                git_fileinfo_store,
                join_with_list,
        ):
            key_props = item[0]
            key = item[1]
            git_item = item[2]
            yield AnnexWorktreeItem(
                name=git_item.name,
                gitsha=git_item.gitsha,
                gittype=git_item.gittype,
                # the key still carries its line ending (`keep_ends=True`)
                annexkey=key.decode().strip() if key else None,
                annexsize=int(key_props['bytesize']) if key_props else None,
                # the reported path is parsed as a POSIX path and converted
                # to the platform's path flavor
                annexobjpath=PurePath(
                    PurePosixPath(str(key_props['objectpath'])))
                if key_props
                else None,
            )
84 changes: 84 additions & 0 deletions datalad_next/iter_collections/tests/test_iterannexworktree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
from pathlib import (
PurePath,
)

from datalad import cfg as dlcfg

from datalad_next.datasets import Dataset

from ..annexworktree import (
iter_annexworktree,
)


def _mkds(tmp_path_factory, monkeypatch, cfg_overrides):
    """Create a dataset in a fresh temp dir under temporary config overrides

    Each key/value pair in ``cfg_overrides`` is placed into the DataLad
    configuration overrides for the duration of the dataset creation only.
    Returns the created dataset.
    """
    with monkeypatch.context() as patcher:
        for name, value in cfg_overrides.items():
            patcher.setitem(dlcfg.overrides, name, value)
        # make the overrides effective before creating the dataset
        dlcfg.reload()
        ds_path = tmp_path_factory.mktemp('ds')
        dataset = Dataset(ds_path).create(result_renderer='disabled')
    # the overrides are undone on leaving the context; reload once more so
    # the configuration manager no longer reports them
    dlcfg.reload()
    return dataset


def _dotests(ds):
    """Populate ``ds`` with two annexed files and check the reported items

    One file keeps its content, the content of the other one is dropped.
    ``iter_annexworktree()`` must report key, size, and object path for both.
    """
    query_path = ds.pathobj / 'annexed'

    # a file whose content stays available
    present_content = 'test_file'
    present_file = query_path / 'subdir' / 'file1.txt'
    present_file.parent.mkdir(parents=True)
    present_file.write_text(present_content)

    # a second file whose content gets dropped, in order to test the
    # behavior with an unavailable annex key
    dropped_content = 'somethingdropped'
    dropped_file = query_path / 'dropped.txt'
    dropped_file.write_text(dropped_content)

    ds.save(result_renderer='disabled')
    ds.drop(
        dropped_file,
        reckless='availability',
        result_renderer='disabled',
    )

    # query the directory with the annexed files
    res = list(
        iter_annexworktree(query_path, untracked=None, link_target=True)
    )
    assert len(res) == 2

    # start with the annex file that is still present
    present = [rec for rec in res if rec.name.name == 'file1.txt'][0]
    assert present.name == PurePath('subdir', 'file1.txt')
    # gitsha and symlink content are not compared for identity, they change
    # with the tuning. The item type is not checked either, it varies
    # across repository modes (e.g., adjusted unlocked)
    assert present.annexsize == len(present_content)
    assert present.annexkey == 'MD5E-s9--37b87ee8c563af911dcc0f949826b1c9.txt'
    # with `link_target=True` the objpath is relative to the query path,
    # and the actual key file is found at that location
    key_file = query_path / present.annexobjpath
    assert key_file.read_text() == present_content

    # continue with the annex file that was dropped
    dropped = [rec for rec in res if rec.name.name == 'dropped.txt'][0]
    assert dropped.name == PurePath('dropped.txt')
    # basic info is reported regardless of availability
    assert dropped.annexsize == len(dropped_content)
    assert dropped.annexkey == 'MD5E-s16--770a06889bc88f8743d1ed9a1977ff7b.txt'
    # the would-be location of the absent key file is reported too,
    # and it is relative to the query path
    assert dropped.annexobjpath.parts[:2] == ('..', '.git')


def test_iter_annexworktree(tmp_path_factory, monkeypatch):
    """Run the shared checks on a dataset created without config overrides"""
    _dotests(_mkds(tmp_path_factory, monkeypatch, {}))


def test_iter_annexworktree_tuned(tmp_path_factory, monkeypatch):
    """Run the shared checks on a dataset created with annex tuning enabled"""
    # same checks as in test_iter_annexworktree(), but with a "tuned" annex
    # that no longer matches the traditional setup.
    # we need to be able to cope with that too
    tuning = dict.fromkeys(
        (
            'annex.tune.objecthash1',
            'annex.tune.branchhash1',
            'annex.tune.objecthashlower',
        ),
        'true',
    )
    _dotests(_mkds(tmp_path_factory, monkeypatch, tuning))

0 comments on commit b6bee4d

Please sign in to comment.