From b88e21f320c316f4614dc1f8f8a9d4ea949be529 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Mon, 15 Jul 2024 07:36:34 +0200 Subject: [PATCH 01/11] rf(`iter_gitdiff`): dissolve kwarg-dict to improve type annotation --- datalad_next/iter_collections/gitdiff.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/datalad_next/iter_collections/gitdiff.py b/datalad_next/iter_collections/gitdiff.py index e27a3c25..029a8ba0 100644 --- a/datalad_next/iter_collections/gitdiff.py +++ b/datalad_next/iter_collections/gitdiff.py @@ -210,9 +210,7 @@ def iter_gitdiff( # a cheap safety net path = Path(path) - # put most args in a container, we need to pass then around quite - # a bit - kwargs = dict( + cmd = _build_cmd( from_treeish=from_treeish, to_treeish=to_treeish, recursive=recursive, @@ -222,8 +220,6 @@ def iter_gitdiff( eval_submodule_state=eval_submodule_state, ) - cmd = _build_cmd(**kwargs) - if cmd[0] == 'diff-index': # when we compare to the index, we need a refresh run to not have # something like plain mtime changes trigger modification reports @@ -258,7 +254,13 @@ def iter_gitdiff( single_dir=_single_dir, spec=pending_props, reported_dirs=reported_dirs, - **kwargs + from_treeish=from_treeish, + to_treeish=to_treeish, + recursive=recursive, + find_renames=find_renames, + find_copies=find_copies, + yield_tree_items=yield_tree_items, + eval_submodule_state=eval_submodule_state, ) pending_props = None elif line.startswith(':'): @@ -273,7 +275,13 @@ def iter_gitdiff( single_dir=_single_dir, spec=pending_props, reported_dirs=reported_dirs, - **kwargs + from_treeish=from_treeish, + to_treeish=to_treeish, + recursive=recursive, + find_renames=find_renames, + find_copies=find_copies, + yield_tree_items=yield_tree_items, + eval_submodule_state=eval_submodule_state, ) From 0eb4ffb44544ed9956674e87c067784ab0d28126 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Mon, 15 Jul 2024 07:42:07 +0200 Subject: [PATCH 02/11] fix(`iter_gitdiff`): corrected type annotation in internal helper --- datalad_next/iter_collections/gitdiff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datalad_next/iter_collections/gitdiff.py b/datalad_next/iter_collections/gitdiff.py index 029a8ba0..b0708faa 100644 --- a/datalad_next/iter_collections/gitdiff.py +++ b/datalad_next/iter_collections/gitdiff.py @@ -362,7 +362,7 @@ def _yield_diff_item( spec: list, single_dir: bool, reported_dirs: set, - yield_tree_items: bool, + yield_tree_items: str | None, **kwargs ) -> Generator[GitDiffItem, None, None]: props: dict[str, str | int | GitTreeItemType] = {} From b4bec1800d93f18284f7abeabbb99f7430fcc739 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Mon, 15 Jul 2024 07:58:04 +0200 Subject: [PATCH 03/11] test(`iter_gitdiff`): type-annotate internal helper --- datalad_next/iter_collections/gitdiff.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/datalad_next/iter_collections/gitdiff.py b/datalad_next/iter_collections/gitdiff.py index b0708faa..95516ebf 100644 --- a/datalad_next/iter_collections/gitdiff.py +++ b/datalad_next/iter_collections/gitdiff.py @@ -287,10 +287,13 @@ def iter_gitdiff( def _build_cmd( *, - from_treeish, to_treeish, - recursive, yield_tree_items, - find_renames, find_copies, - eval_submodule_state, + from_treeish: str | None, + to_treeish: str | None, + recursive: str, + find_renames: int | None, + find_copies: int | None, + yield_tree_items: str | None, + eval_submodule_state: str, ) -> list[str]: # from : to : description # --------------------------- @@ -333,6 +336,7 @@ def _build_cmd( raise ValueError( 'either `from_treeish` or `to_treeish` must not be None') elif to_treeish is None: + assert from_treeish is not None cmd = ['diff-index', *common_args, from_treeish] else: # diff NOT against the working tree From a513b8c8bedb7b352dd54947d4c4d09990d06ad3 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Mon, 15 Jul 2024 08:08:01 +0200 Subject: [PATCH 04/11] rf(`iter_gitdiff`): dedicated function for Git output -> `GitDiffItem` --- datalad_next/iter_collections/gitdiff.py | 35 ++++++++++++------------ 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/datalad_next/iter_collections/gitdiff.py b/datalad_next/iter_collections/gitdiff.py index 95516ebf..051088fa 100644 --- a/datalad_next/iter_collections/gitdiff.py +++ b/datalad_next/iter_collections/gitdiff.py @@ -357,18 +357,7 @@ def _build_cmd( return cmd -def _yield_diff_item( - *, - cwd: Path, - recursive: str, - from_treeish: str | None, - to_treeish: str | None, - spec: list, - single_dir: bool, - reported_dirs: set, - yield_tree_items: str | None, - **kwargs -) -> Generator[GitDiffItem, None, None]: +def _get_diff_item(spec: list) -> GitDiffItem: props: dict[str, str | int | GitTreeItemType] = {} props.update( (k, _mode_type_map.get(v, None)) @@ -392,10 +381,22 @@ def _yield_diff_item( props['prev_name'] = spec[5] props['name'] = spec[6] if len(spec) > 6 else spec[5] - # at this point we know all about the item - # conversion should be cheap, so let's do this here - # and get a bit neater code for the rest of this function - item = GitDiffItem(**props) + return GitDiffItem(**props) + + +def _yield_diff_item( + *, + cwd: Path, + recursive: str, + from_treeish: str | None, + to_treeish: str | None, + spec: list, + single_dir: bool, + reported_dirs: set, + yield_tree_items: str | None, + **kwargs +) -> Generator[GitDiffItem, None, None]: + item = _get_diff_item(spec) if not single_dir: if item.gittype != GitTreeItemType.submodule: @@ -450,7 +451,7 @@ def _yield_diff_item( yield i return - name = props['name'] or props['prev_name'] + name = item.name or item.prev_name # we cannot have items that have no name whatsoever assert name is not None # we decide on mangling the actual report to be on the containing directory From 04d561502cdbfe044827761eb3aada9e38bbbb10 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Mon, 15 Jul 2024 08:17:06 +0200 Subject: [PATCH 05/11] test(`test_gitdiff`): disable kwargs type-checking There is PEP692 which would do this properly, but that is not in all the Python versions we are using. The dict-handling that is relevant here has been constrain to a single function with a few lines in the previous commits. This should help gaining sufficient confidence that this particular implementation does not mess with the types. --- datalad_next/iter_collections/gitdiff.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/datalad_next/iter_collections/gitdiff.py b/datalad_next/iter_collections/gitdiff.py index 051088fa..20ab4e7c 100644 --- a/datalad_next/iter_collections/gitdiff.py +++ b/datalad_next/iter_collections/gitdiff.py @@ -357,7 +357,9 @@ def _build_cmd( return cmd -def _get_diff_item(spec: list) -> GitDiffItem: +def _get_diff_item(spec: list[str]) -> GitDiffItem: + # this type annotation is a crutch, we'd really need + # https://peps.python.org/pep-0692/ to do this properly props: dict[str, str | int | GitTreeItemType] = {} props.update( (k, _mode_type_map.get(v, None)) @@ -381,7 +383,9 @@ def _get_diff_item(spec: list) -> GitDiffItem: props['prev_name'] = spec[5] props['name'] = spec[6] if len(spec) > 6 else spec[5] - return GitDiffItem(**props) + # disable type checking here, we need + # https://peps.python.org/pep-0692/ to do this properly + return GitDiffItem(**props) # type: ignore def _yield_diff_item( From 858036fa1a3a0b2d9aa36b40767990f109c85b13 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Mon, 15 Jul 2024 10:08:15 +0200 Subject: [PATCH 06/11] test(`iter_gitdiff`): new test to show that recursion is broken Neither is the `yield_tree_items` flag passed properly, nor are submodules recursed into reliably. For a superdataset everything is fine. A 1st-level submodule is reported as of `recursion=no` would be set. There is no reporting on lower levels of submodules in the general case. The following commits will address the underlying problems. --- .../tests/test_itergitdiff.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/datalad_next/iter_collections/tests/test_itergitdiff.py b/datalad_next/iter_collections/tests/test_itergitdiff.py index 897b582d..5e975899 100644 --- a/datalad_next/iter_collections/tests/test_itergitdiff.py +++ b/datalad_next/iter_collections/tests/test_itergitdiff.py @@ -2,6 +2,7 @@ import pytest import shutil +from datalad_next.consts import PRE_INIT_COMMIT_SHA from datalad_next.utils import rmtree from ..gitdiff import ( @@ -323,3 +324,31 @@ def test_iter_gitdiff_rec(existing_dataset, no_result_rendering): assert diff == list(iter_gitdiff(*status_args, recursive='submodules')) # use the opportunity to check equality of recursive='all' for this case assert diff == list(iter_gitdiff(*status_args, recursive='all')) + + +def test_iter_gitdiff_multilvl_rec(existing_dataset, no_result_rendering): + ds = existing_dataset + s1 = ds.create('sublvl1') + s1.create('sublvl2') + ds.save() + dsp = ds.pathobj + + diff = list(iter_gitdiff( + path=dsp, + from_treeish=PRE_INIT_COMMIT_SHA, + to_treeish=ds.repo.get_corresponding_branch() or 'HEAD', + # check that we get full repo content from all submodules + recursive='submodules', + # check that the container item flags are passed into the + # recursion properly + yield_tree_items='submodules', + )) + for repo in ('sublvl1', 'sublvl1/sublvl2'): + assert any( + d.name == repo and d.gittype == GitTreeItemType.submodule + for d in diff) + for base in ('', 'sublvl1/', 'sublvl1/sublvl2/'): + assert any( + d.name == f'{base}.datalad/config' \ + and d.gittype == GitTreeItemType.file + for d in diff) From c37dd315d23f84cc2eeb375753c94b52acb3e811 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Mon, 15 Jul 2024 10:13:23 +0200 Subject: [PATCH 07/11] fix(`iter_gitdiff`): correct `git diff-index|tree` command for submodule recursion Previously, the command built matched the one for `recursion='no'`, due to a missing label in a conditional. --- datalad_next/iter_collections/gitdiff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datalad_next/iter_collections/gitdiff.py b/datalad_next/iter_collections/gitdiff.py index 20ab4e7c..2f62a88e 100644 --- a/datalad_next/iter_collections/gitdiff.py +++ b/datalad_next/iter_collections/gitdiff.py @@ -341,7 +341,7 @@ def _build_cmd( else: # diff NOT against the working tree cmd = ['diff-tree', *common_args] - if recursive == 'repository': + if recursive in ('repository', 'submodules'): cmd.append('-r') if yield_tree_items in ('all', 'directories'): cmd.append('-t') From 78d71e82e01a77e2647b29fd5518daff77aa7d51 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Mon, 15 Jul 2024 10:20:07 +0200 Subject: [PATCH 08/11] fix(`iter_gitdiff`): yield copy of container item when it is also further processed When asked `iter_gitdiff()` yields a "container item" (directory, submodule), before it recurses into them. However, during item postprocessing some properties can be modified further, leading to unexpected changes that interfere with the recursion. This change makes sure that a copy of the container item is yielded in such cases, which prevents this kind of data corruption during recursion. --- datalad_next/iter_collections/gitdiff.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/datalad_next/iter_collections/gitdiff.py b/datalad_next/iter_collections/gitdiff.py index 2f62a88e..63bd2c3b 100644 --- a/datalad_next/iter_collections/gitdiff.py +++ b/datalad_next/iter_collections/gitdiff.py @@ -4,6 +4,7 @@ """ from __future__ import annotations +from copy import deepcopy from dataclasses import dataclass from enum import Enum from functools import cached_property @@ -420,10 +421,17 @@ def _yield_diff_item( # # this modification means that "content" is modified # item.add_modification_type( # GitContainerModificationType.modified_content) - if recursive != 'submodules' or yield_tree_items in ( - 'all', 'submodules'): - # we are instructed to yield it + if recursive != 'submodules': + # no submodule recursion, we can yield this submodule item + # directly yield item + return + if yield_tree_items in ( + 'all', 'submodules'): + # we are instructed to yield this submodule item, but we are going + # to recurse into it, continuing to use the item instance for + # queries. Hence we yield a copy here to avoid data corruption + yield deepcopy(item) if recursive == 'submodules': # I believe we need no protection against absent submodules. # The only way they can appear here is a reported modification. From 94bd3c50fdcfe142055b631249e996743dddb01d Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Mon, 15 Jul 2024 10:30:41 +0200 Subject: [PATCH 09/11] fix(`iter_gitdiff`): enable multi-level submodule recursion Due to some options not passed to recursive calls, only 1st-level submodule recursion was working before. Moreover, the implementation also did not honor the `yield_tree_items` option for submodules. --- datalad_next/iter_collections/gitdiff.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datalad_next/iter_collections/gitdiff.py b/datalad_next/iter_collections/gitdiff.py index 63bd2c3b..54fc8a2d 100644 --- a/datalad_next/iter_collections/gitdiff.py +++ b/datalad_next/iter_collections/gitdiff.py @@ -452,6 +452,8 @@ def _yield_diff_item( # when comparing the parent to the worktree, we # also want to compare any children to the worktree to_treeish=None if to_treeish is None else item.gitsha, + recursive=recursive, + yield_tree_items=yield_tree_items, ) ): # prepend any item name with the parent items From 3950586557be49403cfbfe02028b05a78cf957fd Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Mon, 15 Jul 2024 10:56:55 +0200 Subject: [PATCH 10/11] rf(`iter_gitdiff`): flip conditional to simplify internal helper May be a matter of test, but not the conditionals are shorter overall. --- datalad_next/iter_collections/gitdiff.py | 147 ++++++++++++----------- 1 file changed, 76 insertions(+), 71 deletions(-) diff --git a/datalad_next/iter_collections/gitdiff.py b/datalad_next/iter_collections/gitdiff.py index 54fc8a2d..6f3e4795 100644 --- a/datalad_next/iter_collections/gitdiff.py +++ b/datalad_next/iter_collections/gitdiff.py @@ -403,85 +403,90 @@ def _yield_diff_item( ) -> Generator[GitDiffItem, None, None]: item = _get_diff_item(spec) - if not single_dir: - if item.gittype != GitTreeItemType.submodule: + if single_dir: + # handle the special case of reporting only on the 1st-level + # containing directory of an item. + name = item.name or item.prev_name + # we cannot have items that have no name whatsoever + assert name is not None + # we decide on mangling the actual report to be on the containing + # directory only, or to withhold it entirely + dname_l = name.split('/', maxsplit=1) + if len(dname_l) < 2: + # nothing in a subdirectory yield item return - # this is about a present submodule - if item.status == GitDiffStatus.modification: - if item.gitsha is None: - # in 'git diff-index' speak the submodule is "out-of-sync" with - # the index: this happens when there are new commits - item.add_modification_type( - GitContainerModificationType.new_commits) - # TODO we cannot give details for other modification types. - # depending on --ignore-submodules a range of situations - # could be the case - #else: - # # this modification means that "content" is modified - # item.add_modification_type( - # GitContainerModificationType.modified_content) - if recursive != 'submodules': - # no submodule recursion, we can yield this submodule item - # directly - yield item + dname = dname_l[0] + if dname in reported_dirs: + # nothing else todo, we already reported return - if yield_tree_items in ( - 'all', 'submodules'): - # we are instructed to yield this submodule item, but we are going - # to recurse into it, continuing to use the item instance for - # queries. Hence we yield a copy here to avoid data corruption - yield deepcopy(item) - if recursive == 'submodules': - # I believe we need no protection against absent submodules. - # The only way they can appear here is a reported modification. - # The only modification that is possible with an absent submodule - # is a deletion. And that would cause the item.gittype to be None - # -- a condition that is caught above - for i in iter_gitdiff( - cwd / PurePosixPath(item.name), - **dict( - kwargs, - # we never want to pass None here - # if `prev_gitsha` is None, it means that the - # submodule record is new, and we want its full - # content reported. Passing None, however, - # would only report the change to the current - # state. - from_treeish=item.prev_gitsha or PRE_INIT_COMMIT_SHA, - # when comparing the parent to the worktree, we - # also want to compare any children to the worktree - to_treeish=None if to_treeish is None else item.gitsha, - recursive=recursive, - yield_tree_items=yield_tree_items, - ) - ): - # prepend any item name with the parent items - # name - for attr in ('name', 'prev_name'): - val = getattr(i, attr) - if val is not None: - setattr(i, attr, f'{item.name}/{val}') - yield i + + reported_dirs.add(dname) + yield _mangle_item_for_singledir(item, dname, from_treeish, cwd) return - name = item.name or item.prev_name - # we cannot have items that have no name whatsoever - assert name is not None - # we decide on mangling the actual report to be on the containing directory - # only, or to withhold it entirely - dname_l = name.split('/', maxsplit=1) - if len(dname_l) < 2: - # nothing in a subdirectory + if item.gittype != GitTreeItemType.submodule: + # any non-submodule item can be yielded now and we are done here yield item return - dname = dname_l[0] - if dname in reported_dirs: - # nothing else todo, we already reported - return - reported_dirs.add(dname) - yield _mangle_item_for_singledir(item, dname, from_treeish, cwd) + # this is about a present submodule + if item.status == GitDiffStatus.modification: + if item.gitsha is None: + # in 'git diff-index' speak the submodule is "out-of-sync" with + # the index: this happens when there are new commits + item.add_modification_type( + GitContainerModificationType.new_commits) + # TODO we cannot give details for other modification types. + # depending on --ignore-submodules a range of situations + # could be the case + #else: + # # this modification means that "content" is modified + # item.add_modification_type( + # GitContainerModificationType.modified_content) + if recursive != 'submodules': + # no submodule recursion, we can yield this submodule item + # directly + yield item + return + if yield_tree_items in ( + 'all', 'submodules'): + # we are instructed to yield this submodule item, but we are going + # to recurse into it, continuing to use the item instance for + # queries. Hence we yield a copy here to avoid data corruption + yield deepcopy(item) + if recursive == 'submodules': + # I believe we need no protection against absent submodules. + # The only way they can appear here is a reported modification. + # The only modification that is possible with an absent submodule + # is a deletion. And that would cause the item.gittype to be None + # -- a condition that is caught above + for i in iter_gitdiff( + cwd / PurePosixPath(item.name), + **dict( + kwargs, + # we never want to pass None here + # if `prev_gitsha` is None, it means that the + # submodule record is new, and we want its full + # content reported. Passing None, however, + # would only report the change to the current + # state. + from_treeish=item.prev_gitsha or PRE_INIT_COMMIT_SHA, + # when comparing the parent to the worktree, we + # also want to compare any children to the worktree + to_treeish=None if to_treeish is None else item.gitsha, + recursive=recursive, + yield_tree_items=yield_tree_items, + ) + ): + # prepend any item name with the parent items + # name + for attr in ('name', 'prev_name'): + val = getattr(i, attr) + if val is not None: + setattr(i, attr, f'{item.name}/{val}') + yield i + return def _mangle_item_for_singledir( From 232dba4bd655b7d4a173a1e6334179a6018c20cb Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Mon, 15 Jul 2024 11:04:15 +0200 Subject: [PATCH 11/11] rf(`iter_gitdiff`): discontinue use of `kwargs` dict This has caused a bug in the past, by obscuring what information is being passed on -- and more importantly what not. Here we remove this, and explicitly pass the three arguments that it was dealing with. --- datalad_next/iter_collections/gitdiff.py | 35 +++++++++++++----------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/datalad_next/iter_collections/gitdiff.py b/datalad_next/iter_collections/gitdiff.py index 6f3e4795..0d9f9c6d 100644 --- a/datalad_next/iter_collections/gitdiff.py +++ b/datalad_next/iter_collections/gitdiff.py @@ -399,7 +399,9 @@ def _yield_diff_item( single_dir: bool, reported_dirs: set, yield_tree_items: str | None, - **kwargs + find_renames: int | None, + find_copies: int | None, + eval_submodule_state: str, ) -> Generator[GitDiffItem, None, None]: item = _get_diff_item(spec) @@ -463,21 +465,22 @@ def _yield_diff_item( # -- a condition that is caught above for i in iter_gitdiff( cwd / PurePosixPath(item.name), - **dict( - kwargs, - # we never want to pass None here - # if `prev_gitsha` is None, it means that the - # submodule record is new, and we want its full - # content reported. Passing None, however, - # would only report the change to the current - # state. - from_treeish=item.prev_gitsha or PRE_INIT_COMMIT_SHA, - # when comparing the parent to the worktree, we - # also want to compare any children to the worktree - to_treeish=None if to_treeish is None else item.gitsha, - recursive=recursive, - yield_tree_items=yield_tree_items, - ) + # we never want to pass None here + # if `prev_gitsha` is None, it means that the + # submodule record is new, and we want its full + # content reported. Passing None, however, + # would only report the change to the current + # state. + from_treeish=item.prev_gitsha or PRE_INIT_COMMIT_SHA, + # when comparing the parent to the worktree, we + # also want to compare any children to the worktree + to_treeish=None if to_treeish is None else item.gitsha, + # pass on the common args + recursive=recursive, + yield_tree_items=yield_tree_items, + find_renames=find_renames, + find_copies=find_copies, + eval_submodule_state=eval_submodule_state, ): # prepend any item name with the parent items # name