generated from datalad/datalad-extension-template
-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add `iter_annexworktree` collection iterator
Co-authored-by: Michael Hanke <[email protected]>
- Loading branch information
1 parent
43dde05
commit b6bee4d
Showing 2 changed files with 207 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
"""Report on the content of a Git-annex repository worktree | ||
""" | ||
from __future__ import annotations | ||
|
||
import logging | ||
from dataclasses import dataclass | ||
from os import linesep | ||
from pathlib import ( | ||
Path, | ||
PurePath, | ||
PurePosixPath, | ||
) | ||
from typing import ( | ||
Any, | ||
Generator, | ||
) | ||
|
||
from more_itertools import intersperse | ||
|
||
from .gitworktree import ( | ||
GitWorktreeItem, | ||
GitWorktreeFileSystemItem, | ||
iter_gitworktree | ||
) | ||
from datalad_next.itertools import ( | ||
itemize, | ||
join_with_list, | ||
load_json, | ||
route_in, | ||
route_out, | ||
dont_process, | ||
) | ||
from datalad_next.runners import iter_subproc | ||
|
||
|
||
lgr = logging.getLogger('datalad.ext.next.iter_collections.annexworktree') | ||
|
||
|
||
linesep_bytes = linesep.encode() | ||
|
||
|
||
@dataclass
class AnnexWorktreeItem(GitWorktreeItem):
    """Worktree item enriched with git-annex properties.

    All annex properties are ``None`` for items that are not annexed.
    """
    # git-annex key identifying the file content
    annexkey: str | None = None
    # size of the annexed content in bytes (from the key's ``bytesize``)
    annexsize: int | None = None
    # would-be path of the annex object (key) file; reported even when
    # the content is not locally present
    annexobjpath: PurePath | None = None
|
||
|
||
@dataclass
class AnnexWorktreeFileSystemItem(GitWorktreeFileSystemItem):
    """File system item enriched with git-annex properties.

    Both annex properties are ``None`` for items that are not annexed.
    """
    # git-annex key identifying the file content
    annexkey: str | None = None
    # size of the annexed content in bytes
    annexsize: int | None = None
|
||
|
||
def iter_annexworktree(
    path: Path,
    *,
    untracked: str | None = 'all',
    link_target: bool = False,
    fp: bool = False,
) -> Generator[AnnexWorktreeItem | AnnexWorktreeFileSystemItem, None, None]:
    """Report on the content of a git-annex repository worktree

    Wraps :func:`iter_gitworktree` and enriches each of its items with
    git-annex properties by piping the file names through two batched
    subprocesses: ``git annex find`` (file name -> key) and
    ``git annex examinekey`` (key -> key properties, JSON).

    Parameters
    ----------
    path: Path
      Directory to report on; passed to :func:`iter_gitworktree`.
    untracked: str or None, optional
      Passed to :func:`iter_gitworktree` unmodified.
    link_target: bool, optional
      Passed to :func:`iter_gitworktree` unmodified.
    fp: bool, optional
      Passed to :func:`iter_gitworktree` unmodified.

    Yields
    ------
    :class:`AnnexWorktreeItem`
      One item per reported worktree item; the annex properties are
      ``None`` for items that are not annexed.
    """
    # base iterator: worktree items as reported by Git
    glsf = iter_gitworktree(
        path,
        untracked=untracked,
        link_target=link_target,
        fp=fp
    )

    # side-channel stores used by route_out()/route_in() to carry
    # structured data past the subprocess pipelines, which only see
    # byte streams
    git_fileinfo_store: list[Any] = list()
    key_store: list[Any] = list()

    with \
        iter_subproc(
            # we get the annex key for any filename
            # (or empty if not annexed)
            ['git', '-C', str(path), 'annex', 'find', '--anything',
             '--format=${key}\n', '--batch'],
            # intersperse items with newlines to trigger a batch run
            # this avoids string operations to append newlines to items
            input=intersperse(
                b'\n',
                # store all output of iter_gitworktree() in the
                # gitfileinfo store, and feed only the encoded file
                # name to `git annex find`
                route_out(
                    glsf,
                    git_fileinfo_store,
                    lambda data: (str(data.name).encode(), [data])
                )
            ),
        ) as gaf, \
        iter_subproc(
            # get the key properties JSON-lines style
            ['git', '-C', str(path), 'annex', 'examinekey', '--json',
             '--batch'],
            # process all non-empty keys and store them in the key
            # store, skip processing of empty keys (non-annexed items
            # yield a bare newline) and store an ignored value in the
            # key store to keep the positions aligned
            input=route_out(
                itemize(gaf, sep=linesep_bytes, keep_ends=True),
                key_store,
                lambda data: (dont_process, [None])
                if data == linesep_bytes
                else (data, [data])
            )
        ) as gek:

        # re-join, innermost first: examinekey JSON records with the
        # raw keys, then with the original gitworktree items
        for item in route_in(
            route_in(
                load_json(itemize(gek, sep=linesep_bytes)),
                key_store,
                join_with_list,
            ),
            git_fileinfo_store,
            join_with_list,
        ):
            # item layout after the nested route_in():
            #   item[0]: examinekey JSON record (None for non-annexed)
            #   item[1]: raw key bytes (None for non-annexed)
            #   item[2]: original GitWorktreeItem
            yield AnnexWorktreeItem(
                name=item[2].name,
                gitsha=item[2].gitsha,
                gittype=item[2].gittype,
                annexkey=item[1].decode().strip() if item[1] else None,
                annexsize=int(item[0]['bytesize']) if item[0] else None,
                # examinekey reports a POSIX path; convert to platform
                # conventions
                annexobjpath=PurePath(PurePosixPath(str(item[0]['objectpath'])))
                if item[0]
                else None,
            )
84 changes: 84 additions & 0 deletions
84
datalad_next/iter_collections/tests/test_iterannexworktree.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
from pathlib import ( | ||
PurePath, | ||
) | ||
|
||
from datalad import cfg as dlcfg | ||
|
||
from datalad_next.datasets import Dataset | ||
|
||
from ..annexworktree import ( | ||
iter_annexworktree, | ||
) | ||
|
||
|
||
def _mkds(tmp_path_factory, monkeypatch, cfg_overrides):
    """Create a test dataset in a fresh temp dir.

    ``cfg_overrides`` are placed into the datalad config overrides only
    for the duration of the ``create()`` call, so they shape the
    repository setup (e.g. annex tuning) without leaking into the rest
    of the test.
    """
    with monkeypatch.context() as m:
        for k, v in cfg_overrides.items():
            m.setitem(dlcfg.overrides, k, v)
        # make the overrides take effect
        dlcfg.reload()
        ds = Dataset(tmp_path_factory.mktemp('ds')).create(
            result_renderer='disabled')
    # NOTE(review): the source rendering lost indentation; this reload
    # is assumed to sit outside the monkeypatch context, restoring the
    # prior configuration — confirm against upstream
    dlcfg.reload()
    return ds
|
||
|
||
def _dotests(ds):
    """Run the shared assertions against a prepared dataset ``ds``.

    Creates one present and one dropped annexed file, then checks that
    ``iter_annexworktree()`` reports keys, sizes, and object paths for
    both, independent of content availability.
    """
    test_file_content = 'test_file'
    test_file = ds.pathobj / 'annexed' / 'subdir' / 'file1.txt'
    test_file.parent.mkdir(parents=True)
    test_file.write_text(test_file_content)
    # we create an additional file where the content will be dropped
    # to test behavior on unavailable annex key
    droptest_content = 'somethingdropped'
    droptest_file = ds.pathobj / 'annexed' / 'dropped.txt'
    droptest_file.write_text(droptest_content)
    ds.save(result_renderer='disabled')
    ds.drop(droptest_file, reckless='availability',
            result_renderer='disabled')

    # get results for the annexed files
    query_path = ds.pathobj / 'annexed'
    res = list(iter_annexworktree(
        query_path, untracked=None, link_target=True,
    ))
    assert len(res) == 2
    #
    # pick the present annex file to start
    r = [r for r in res if r.name.name == 'file1.txt'][0]
    assert r.name == PurePath('subdir', 'file1.txt')
    # we cannot check gitsha and symlink content for identity, it will
    # change depending on the tuning
    # we cannot check the item type, because it will vary across
    # repository modes (e.g., adjusted unlocked)
    assert r.annexsize == len(test_file_content)
    assert r.annexkey == 'MD5E-s9--37b87ee8c563af911dcc0f949826b1c9.txt'
    # with `link_target=True` we get an objpath that is relative to the
    # query path, and we find the actual key file there
    assert (query_path / r.annexobjpath).read_text() == test_file_content
    #
    # now pick the dropped annex file
    r = [r for r in res if r.name.name == 'dropped.txt'][0]
    assert r.name == PurePath('dropped.txt')
    # we get basic info regardless of availability
    assert r.annexsize == len(droptest_content)
    assert r.annexkey == 'MD5E-s16--770a06889bc88f8743d1ed9a1977ff7b.txt'
    # even with an absent key file, we get its would-be location,
    # and it is relative to the query path
    assert r.annexobjpath.parts[:2] == ('..', '.git')
|
||
|
||
def test_iter_annexworktree(tmp_path_factory, monkeypatch):
    # exercise the iterator against a plain, untuned repository setup
    dataset = _mkds(tmp_path_factory, monkeypatch, {})
    _dotests(dataset)
|
||
|
||
def test_iter_annexworktree_tuned(tmp_path_factory, monkeypatch):
    # same as test_iter_annexworktree(), but with a "tuned" annex that
    # no longer matches the traditional repository setup.
    # we need to be able to cope with that too
    ds = _mkds(tmp_path_factory, monkeypatch, {
        'annex.tune.objecthash1': 'true',
        'annex.tune.branchhash1': 'true',
        'annex.tune.objecthashlower': 'true',
    })
    _dotests(ds)