Skip to content

Commit

Permalink
change iter_annexworktree to use bulk batching
Browse files Browse the repository at this point in the history
This commit is a test to determine the runtime
of iter_annexworktree if all batch input is
sent to the subprocesses, i.e.
`git annex examinekey`, before the results
are consumed
  • Loading branch information
christian-monch committed Nov 12, 2023
1 parent 1938fd0 commit 8da1f08
Showing 1 changed file with 26 additions and 31 deletions.
57 changes: 26 additions & 31 deletions datalad_next/iter_collections/annexworktree.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,7 @@ def iter_annexworktree(
# "path"-property (the "path"-property is `item.name` if `item` is a result
# of `git_ls_files`, and `item['file']`, if item is a result of
# `git_annex_find`).
gaf_store = dict()
glf_store = dict()
glf_store = []

with \
run(git_annex_find_cmd, protocol_class=GeneratorAnnexJsonNoStderrProtocol, **common_args) as git_annex_find, \
Expand All @@ -131,39 +130,35 @@ def iter_annexworktree(
# files is a subset of the files in git, i.e. the `git_annex_find`
# generator yields fewer or equal results than the `git_ls_files`
# generator.
for gaf_item, glf_item in zip_longest(git_annex_find, git_ls_files):
# Store both results (if they exist)
gaf_store = {
PurePath(PurePosixPath(gaf_item['file'])): gaf_item
for gaf_item in git_annex_find
}
lookup_list = []
for glf_item in git_ls_files:
gaf_item = gaf_store.get(glf_item.name)
if gaf_item:
gaf_store[PurePath(PurePosixPath(gaf_item['file']))] = gaf_item
glf_store[glf_item.name] = glf_item

# Check the "path"-properties of all `git_ls_files`-items and
# check for a matching path in `git_annex_find`-items. If a
# matching pair exists, yield a result for an annexed file and
# mark the pair for deletion.
remove = []
for path, glf_item in glf_store.items():
gaf_item = gaf_store.get(path)
if gaf_item:
remove.append(path)
key_properties = examine_key(gaf_item['key'].encode() + b'\n')
yield AnnexWorktreeItem(
name=glf_item.name,
gitsha=glf_item.gitsha,
gittype=glf_item.gittype,
annexkey=gaf_item['key'],
annexsize=int(gaf_item['bytesize']),
annexobjpath=PurePath(key_properties['objectpath']),
)

# Delete marked pairs from both item-stores.
for path in remove:
del glf_store[path]
del gaf_store[path]
lookup_list.append((glf_item.name, glf_item, gaf_item))
examine_key._stdin_queue.put(gaf_item['key'].encode() + b'\n')
else:
glf_store.append(glf_item)

for (path, glf_item, gaf_item) in lookup_list:
key_properties = next(examine_key._rgen)
yield AnnexWorktreeItem(
name=glf_item.name,
gitsha=glf_item.gitsha,
gittype=glf_item.gittype,
annexkey=gaf_item['key'],
annexsize=int(gaf_item['bytesize']),
annexobjpath=PurePath(key_properties['objectpath']),
)

del gaf_store[path]

# Remaining git ls-files results are all unannexed, yield them.
assert len(gaf_store) == 0
for path, glf_item in glf_store.items():
for glf_item in glf_store:
yield AnnexWorktreeItem(
name=glf_item.name,
gitsha=glf_item.gitsha,
Expand Down

0 comments on commit 8da1f08

Please sign in to comment.