Skip to content

Commit

Permalink
Merge pull request datalad#747 from mih/gitdiff
Browse files Browse the repository at this point in the history
Add support for pathspecs to `iter_gitdiff()`
  • Loading branch information
mih authored Jul 16, 2024
2 parents de8855a + 2aa7705 commit e582c27
Show file tree
Hide file tree
Showing 3 changed files with 206 additions and 49 deletions.
230 changes: 185 additions & 45 deletions datalad_next/iter_collections/gitdiff.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
)

from datalad_next.consts import PRE_INIT_COMMIT_SHA
from datalad_next.gitpathspec import GitPathSpecs
from datalad_next.runners import (
CommandError,
iter_git_subproc,
Expand Down Expand Up @@ -127,8 +128,8 @@ def iter_gitdiff(
find_renames: int | None = None,
find_copies: int | None = None,
yield_tree_items: str | None = None,
# TODO add documentation
eval_submodule_state: str = 'full',
pathspecs: list[str] | GitPathSpecs | None = None,
) -> Generator[GitDiffItem, None, None]:
"""Report differences between Git tree-ishes or tracked worktree content
Expand Down Expand Up @@ -197,6 +198,27 @@ def iter_gitdiff(
will still be reported whenever there is no recursion into them.
For example, submodule items are reported when
``recursive='repository``, even when ``yield_tree_items=None``.
eval_submodule_state: {"no", "commit", "full"}, optional
If 'full' (default), the state of a submodule is evaluated by
considering all modifications ('--ignore-submodules=none').
If 'commit', the modification check is restricted to comparing the
submodule's "HEAD" commit to the one recorded in the superdataset
('--ignore-submodules=dirty'). If 'no', the state of the subdataset is
not evaluated ('--ignore-submodules=all').
pathspecs: optional
Patterns used to limit results to particular paths. Any pathspecs
supported by Git can be used and are passed to the underlying ``git
ls-files`` queries. Pathspecs are also supported for recursive reporting
on submodules. In such a case, the results match those of individual
queries with analog pathspecs on the respective submodules (Git itself
does not support pathspecs for submodule-recursive operations). For
example, a ``submodule`` recursion with a pathspec ``*.jpg`` will yield
reports on all JPG files in all submodules, even though a submodule path
itself does not match ``*.jpg``. On the other hand, a pathspec
``submoddir/*.jpg`` will only report on JPG files in the submodule at
``submoddir/``, but on all JPG files in that submodule.
As of version 1.5, the pathspec support for submodule recursion is
preliminary and results should be carefully investigated.
Yields
------
Expand All @@ -210,7 +232,81 @@ def iter_gitdiff(
# forget/ignore and leads to non-obvious errors. Running this once is
# a cheap safety net
path = Path(path)
_pathspecs = GitPathSpecs(pathspecs)
processed_submodules: set[str] = set()

for item in _iter_gitdiff(
path=path,
from_treeish=from_treeish,
to_treeish=to_treeish,
recursive=recursive,
find_renames=find_renames,
find_copies=find_copies,
yield_tree_items=yield_tree_items,
eval_submodule_state=eval_submodule_state,
pathspecs=_pathspecs,
):
# exclude non-submodules, or a submodule that was found at
# the root path -- which would indicate that the submodule
# itself it not around, only its record in the parent
if recursive == 'submodules' \
and item.gittype == GitTreeItemType.submodule \
and not item.name == '.':
# mark as processed immediately, independent of whether anything
# need to be reported
processed_submodules.add(item.name)
yield item

# we may need to loop over the (remaining) submodules, because
# with pathspecs there is a chance that a given pathspec set did not
# match a submodule (directly) that could have content that matches a
# pathspec
if recursive == 'submodules' and _pathspecs:
for item in _iter_gitdiff(
path=path,
from_treeish=from_treeish,
to_treeish=to_treeish,
# no need to double-recurse, we just need to discover all
# submodules in the diff unconstrained by pathspecs
recursive='repository',
find_renames=None,
find_copies=None,
yield_tree_items=None,
# we need to look at the recorded commit to get submodule
# reports at all
eval_submodule_state='commit',
pathspecs=GitPathSpecs(None),
):
if item.gittype != GitTreeItemType.submodule \
or item.name in processed_submodules:
# not a submodule or already reported on
continue

yield from _yield_from_submodule(
basepath=path,
subm=item,
to_treeish=to_treeish,
recursive=recursive,
yield_tree_items=yield_tree_items,
find_renames=find_renames,
find_copies=find_copies,
eval_submodule_state=eval_submodule_state,
pathspecs=_pathspecs,
)


def _iter_gitdiff(
path: Path,
from_treeish: str | None,
to_treeish: str | None,
*,
recursive: str,
find_renames: int | None,
find_copies: int | None,
yield_tree_items: str | None,
eval_submodule_state: str,
pathspecs: GitPathSpecs,
) -> Generator[GitDiffItem, None, None]:
cmd = _build_cmd(
from_treeish=from_treeish,
to_treeish=to_treeish,
Expand All @@ -219,6 +315,7 @@ def iter_gitdiff(
find_copies=find_copies,
yield_tree_items=yield_tree_items,
eval_submodule_state=eval_submodule_state,
pathspecs=pathspecs,
)

if cmd[0] == 'diff-index':
Expand Down Expand Up @@ -262,6 +359,7 @@ def iter_gitdiff(
find_copies=find_copies,
yield_tree_items=yield_tree_items,
eval_submodule_state=eval_submodule_state,
pathspecs=pathspecs,
)
pending_props = None
elif line.startswith(':'):
Expand All @@ -283,6 +381,7 @@ def iter_gitdiff(
find_copies=find_copies,
yield_tree_items=yield_tree_items,
eval_submodule_state=eval_submodule_state,
pathspecs=pathspecs,
)


Expand All @@ -295,6 +394,7 @@ def _build_cmd(
find_copies: int | None,
yield_tree_items: str | None,
eval_submodule_state: str,
pathspecs: GitPathSpecs,
) -> list[str]:
# from : to : description
# ---------------------------
Expand Down Expand Up @@ -355,13 +455,15 @@ def _build_cmd(
# add disambiguation marker for pathspec.
# even if we do not pass any, we get simpler error messages from Git
cmd.append('--')

cmd.extend(pathspecs.arglist())
return cmd


def _get_diff_item(spec: list[str]) -> GitDiffItem:
# this type annotation is a crutch, we'd really need
# https://peps.python.org/pep-0692/ to do this properly
props: dict[str, str | int | GitTreeItemType] = {}
props: dict[str, str | int | GitTreeItemType | GitDiffStatus | None] = {}
props.update(
(k, _mode_type_map.get(v, None))
for k, v in (('prev_gittype', spec[0]),
Expand Down Expand Up @@ -390,18 +492,19 @@ def _get_diff_item(spec: list[str]) -> GitDiffItem:


def _yield_diff_item(
*,
cwd: Path,
recursive: str,
from_treeish: str | None,
to_treeish: str | None,
spec: list,
single_dir: bool,
reported_dirs: set,
yield_tree_items: str | None,
find_renames: int | None,
find_copies: int | None,
eval_submodule_state: str,
*,
cwd: Path,
recursive: str,
from_treeish: str | None,
to_treeish: str | None,
spec: list,
single_dir: bool,
reported_dirs: set,
yield_tree_items: str | None,
find_renames: int | None,
find_copies: int | None,
eval_submodule_state: str,
pathspecs: GitPathSpecs,
) -> Generator[GitDiffItem, None, None]:
item = _get_diff_item(spec)

Expand Down Expand Up @@ -446,50 +549,87 @@ def _yield_diff_item(
# # this modification means that "content" is modified
# item.add_modification_type(
# GitContainerModificationType.modified_content)

if recursive != 'submodules':
# no submodule recursion, we can yield this submodule item
# directly
yield item
return
if yield_tree_items in (
'all', 'submodules'):
# we are instructed to yield this submodule item, but we are going

if yield_tree_items in ('all', 'submodules'):
# we are instructed to yield this submodule item, but if we are going
# to recurse into it, continuing to use the item instance for
# queries. Hence we yield a copy here to avoid data corruption
yield deepcopy(item)
yield deepcopy(item) if recursive == 'submodules' else item

if recursive == 'submodules':
# I believe we need no protection against absent submodules.
# The only way they can appear here is a reported modification.
# The only modification that is possible with an absent submodule
# is a deletion. And that would cause the item.gittype to be None
# -- a condition that is caught above
for i in iter_gitdiff(
cwd / PurePosixPath(item.name),
# we never want to pass None here
# if `prev_gitsha` is None, it means that the
# submodule record is new, and we want its full
# content reported. Passing None, however,
# would only report the change to the current
# state.
from_treeish=item.prev_gitsha or PRE_INIT_COMMIT_SHA,
# when comparing the parent to the worktree, we
# also want to compare any children to the worktree
to_treeish=None if to_treeish is None else item.gitsha,
# pass on the common args
yield from _yield_from_submodule(
basepath=cwd,
subm=item,
to_treeish=to_treeish,
recursive=recursive,
yield_tree_items=yield_tree_items,
find_renames=find_renames,
find_copies=find_copies,
eval_submodule_state=eval_submodule_state,
):
# prepend any item name with the parent items
# name
for attr in ('name', 'prev_name'):
val = getattr(i, attr)
if val is not None:
setattr(i, attr, f'{item.name}/{val}')
yield i
return
pathspecs=pathspecs,
)


def _yield_from_submodule(
*,
basepath: Path,
subm: GitDiffItem,
to_treeish: str | None,
recursive: str,
yield_tree_items: str | None,
find_renames: int | None,
find_copies: int | None,
eval_submodule_state: str,
pathspecs: GitPathSpecs,
) -> Generator[GitDiffItem, None, None]:
# I believe we need no protection against absent submodules.
# The only way they can appear here is a reported modification.
# The only modification that is possible with an absent submodule
# is a deletion. And that would cause the item.gittype to be None
# -- a condition that is caught above
subm_name = PurePosixPath(subm.name)
subm_pathspecs = pathspecs
if pathspecs:
# recode pathspecs to match the submodule scope
try:
subm_pathspecs = pathspecs.for_subdir(subm_name)
except ValueError:
# not a single pathspec could be translated, there is
# no chance for a match, we can stop here
return
for i in iter_gitdiff(
basepath / subm_name,
# we never want to pass None here
# if `prev_gitsha` is None, it means that the
# submodule record is new, and we want its full
# content reported. Passing None, however,
# would only report the change to the current
# state.
from_treeish=subm.prev_gitsha or PRE_INIT_COMMIT_SHA,
# when comparing the parent to the worktree, we
# also want to compare any children to the worktree
to_treeish=None if to_treeish is None else subm.gitsha,
# pass on the common args
recursive=recursive,
yield_tree_items=yield_tree_items,
find_renames=find_renames,
find_copies=find_copies,
eval_submodule_state=eval_submodule_state,
pathspecs=subm_pathspecs,
):
# prepend any item name with the parent items
# name
for attr in ('name', 'prev_name'):
val = getattr(i, attr)
if val is not None:
setattr(i, attr, f'{subm.name}/{val}')
yield i


def _mangle_item_for_singledir(
Expand Down
6 changes: 3 additions & 3 deletions datalad_next/iter_collections/gitstatus.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,17 +308,17 @@ def _yield_repo_items(
return

# lastly untracked files of this repo
for item in iter_gitworktree(
for untracked_item in iter_gitworktree(
path=path,
untracked=_untracked_mode_map[untracked],
link_target=False,
fp=False,
recursive='repository',
):
yield GitDiffItem(
name=item.name.as_posix(),
name=untracked_item.name.as_posix(),
status=GitDiffStatus.other,
gittype=item.gittype,
gittype=untracked_item.gittype,
)


Expand Down
19 changes: 18 additions & 1 deletion datalad_next/iter_collections/tests/test_itergitdiff.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,15 +333,18 @@ def test_iter_gitdiff_multilvl_rec(existing_dataset, no_result_rendering):
ds.save()
dsp = ds.pathobj

diff = list(iter_gitdiff(
common_args = dict(
path=dsp,
from_treeish=PRE_INIT_COMMIT_SHA,
to_treeish=ds.repo.get_corresponding_branch() or 'HEAD',
)
diff = list(iter_gitdiff(
# check that we get full repo content from all submodules
recursive='submodules',
# check that the container item flags are passed into the
# recursion properly
yield_tree_items='submodules',
**common_args
))
for repo in ('sublvl1', 'sublvl1/sublvl2'):
assert any(
Expand All @@ -352,3 +355,17 @@ def test_iter_gitdiff_multilvl_rec(existing_dataset, no_result_rendering):
d.name == f'{base}.datalad/config' \
and d.gittype == GitTreeItemType.file
for d in diff)

# try with very simple pathspec constraint, where the pathspec
# itself does not match the submodules that contain the
# matches
diff = list(iter_gitdiff(
pathspecs=[':(glob)**/config'],
recursive='submodules',
**common_args
))
assert len(diff) == 3
assert all(
d.name.endswith('config') and d.gittype == GitTreeItemType.file
for d in diff
)

0 comments on commit e582c27

Please sign in to comment.