Skip to content

Commit

Permalink
use two run-contexts for iter_annexworktree
Browse files Browse the repository at this point in the history
Because a batch process to look up annex
keys is slow, this implementation uses two
concurrent run contexts. One run context
performs `git ls-files`, while the other
run context performs `git annex find`.

The output of both is read and processed.

This reduces the runtime to about 7%
of the previous runtime.
  • Loading branch information
christian-monch committed Oct 27, 2023
1 parent bff94b1 commit 2913989
Showing 1 changed file with 54 additions and 25 deletions.
79 changes: 54 additions & 25 deletions datalad_next/iter_collections/annexworktree.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,18 @@
from pathlib import (
Path,
PurePath,
PurePosixPath,
)
from typing import Generator

from datalad.support.annexrepo import GeneratorAnnexJsonNoStderrProtocol
from datalad_next.runners import StdOutCaptureGeneratorProtocol
from datalad_next.runners.batch import (
annexjson_batchcommand,
annexline_batchcommand,
)
from datalad_next.runners.batch import annexjson_batchcommand
from datalad_next.runners.run import run

from .gitworktree import (
GitWorktreeItem,
GitWorktreeFileSystemItem,
iter_gitworktree,
_lsfiles_line2props,
_mode_type_map,
lsfiles_untracked_args,
Expand Down Expand Up @@ -153,28 +151,59 @@ def iter_annexworktree(
git_ls_files_cmd = ['git', 'ls-files', '-z', '--stage', '--cached']
if untracked:
git_ls_files_cmd.extend(lsfiles_untracked_args[untracked])

git_annex_find_cmd = [
'git', 'annex', 'find', '--include=*',
'--json', '--json-error-messages', '.'
]
common_args = dict(cwd=path, terminate_time=3, kill_time=1)
gaf_store = dict()
glf_store = dict()
read_gaf = True
read_glf = True
with \
run(git_annex_find_cmd, protocol_class=GeneratorAnnexJsonNoStderrProtocol, **common_args) as git_annex_find, \
run(git_ls_files_cmd, protocol_class=GitLsFilesProtocol, **common_args) as git_ls_files, \
annexjson_batchcommand(['git', 'annex', 'examinekey', '--json', '--batch'], **common_args) as examine_key, \
annexline_batchcommand(['git', 'annex', 'lookupkey', '--batch'], **common_args) as lookup_key:

for item in git_ls_files:
annex_key = lookup_key((str(item.name) + '\n').encode())
if annex_key:
key_properties = examine_key(annex_key.encode() + b'\n')
annex_object_path = key_properties['objectpath']
annex_size = int(key_properties['bytesize'])
else:
annex_object_path = None
annex_size = None

annexjson_batchcommand(['git', 'annex', 'examinekey', '--json', '--batch'], **common_args) as examine_key:

while read_gaf or read_glf:
if read_gaf:
try:
gaf_item = next(git_annex_find)
gaf_store[PurePath(PurePosixPath(gaf_item['file']))] = gaf_item
except StopIteration:
read_gaf = False

if read_glf:
try:
glf_item = next(git_ls_files)
glf_store[glf_item.name] = glf_item
except StopIteration:
read_glf = False

remove = []
for path, glf_item in glf_store.items():
if path in gaf_store:
remove.append(path)
key_properties = examine_key(gaf_item['key'].encode() + b'\n')
yield AnnexWorktreeItem(
name=glf_item.name,
gitsha=glf_item.gitsha,
gittype=glf_item.gittype,
annexkey=gaf_item['key'],
annexsize=int(gaf_item['bytesize']),
annexobjpath=PurePath(key_properties['objectpath']),
)
for path in remove:
del glf_store[path]
del gaf_store[path]

# Yield non-annex files
for path, glf_item in glf_store.items():
yield AnnexWorktreeItem(
name=item.name,
gitsha=item.gitsha,
gittype=item.gittype,
annexkey=annex_key,
annexsize=annex_size,
annexobjpath=None if annex_object_path is None else PurePath(annex_object_path),
name=glf_item.name,
gitsha=glf_item.gitsha,
gittype=glf_item.gittype,
annexkey=None,
annexsize=None,
annexobjpath=None
)

0 comments on commit 2913989

Please sign in to comment.