From a03e5281ba3f576e5d9c2489845e567ed0c6991d Mon Sep 17 00:00:00 2001
From: Michael Hanke <michael.hanke@gmail.com>
Date: Thu, 19 Oct 2023 10:41:39 +0200
Subject: [PATCH 1/2] Add two tests that document portability issues of
 run-records

- IO specifications are stored in platform-native conventions
- IO specifications are reported as-is

combined with the absence of any platform type record, this makes
run-records non-portable across unix/windows scopes.

Ping https://github.com/datalad/datalad/issues/7512
---
 datalad_next/patches/tests/test_run.py | 88 ++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/datalad_next/patches/tests/test_run.py b/datalad_next/patches/tests/test_run.py
index 721e6de9..87c545c5 100644
--- a/datalad_next/patches/tests/test_run.py
+++ b/datalad_next/patches/tests/test_run.py
@@ -1,5 +1,8 @@
+from pathlib import Path
 import pytest
 
+from datalad.local.rerun import get_run_info
+
 from datalad_next.exceptions import IncompleteResultsError
 from datalad_next.tests.utils import (
     SkipTest,
@@ -23,3 +26,88 @@ def test_substitution_config_default(existing_dataset):
     # make sure we could actually detect breakage with the check above
     with pytest.raises(IncompleteResultsError):
         ds.run('{python} -c "breakage"', result_renderer='disabled')
+
+
+def test_runrecord_portable_paths(existing_dataset):
+    ds = existing_dataset
+    dsrepo = ds.repo
+    infile = ds.pathobj / 'inputs' / 'testfile.txt'
+    outfile = ds.pathobj / 'outputs' / 'testfile.txt'
+    infile.parent.mkdir()
+    outfile.parent.mkdir()
+    infile.touch()
+    ds.save()
+    assert not outfile.exists()
+    # script copies any 'inputs' to the outputs dir
+    res = ds.run(
+        '{python} -c "'
+        'from shutil import copyfile;'
+        'from pathlib import Path;'
+        r"""[copyfile(f.strip('\"'), Path.cwd() / \"outputs\" / Path(f.strip('\"')).name)"""
+        # we need to use a raw string to contain the inputs expansion,
+        # on windows they would contain backslashes that are unescaped
+        r""" for f in r'{inputs}'.split()]"""
+        '"',
+        result_renderer='disabled',
+        # we need to pass relative paths ourselves
+        # https://github.com/datalad/datalad/issues/7516
+        inputs=[str(infile.relative_to(ds.pathobj))],
+        outputs=[str(outfile.relative_to(ds.pathobj))],
+    )
+    # verify basic outcome
+    assert_result_count(res, 1, action='run', status='ok')
+    assert outfile.exists()
+
+    # branch we expect the runrecord on
+    branch = dsrepo.get_corresponding_branch() or dsrepo.get_active_branch()
+    cmsg = dsrepo.format_commit('%B', branch)
+
+    # the IOspecs are stored in POSIX conventions
+    assert r'"inputs/testfile.txt"' in cmsg
+    assert r'"outputs/testfile.txt"' in cmsg
+
+    # get_run_info() reports in platform conventions
+    msg, run_info = get_run_info(ds, cmsg)
+    assert run_info
+    for k in ('inputs', 'outputs'):
+        specs = run_info.get(k)
+        assert len(specs) > 0
+        for p in specs:
+            assert (ds.pathobj / p).exists()
+
+
+def test_runrecord_oldnative_paths(existing_dataset):
+    ds = existing_dataset
+    # this test is imitating a rerun situation, so we create the
+    # inputs and outputs
+    infile = ds.pathobj / 'inputs' / 'testfile.txt'
+    outfile = ds.pathobj / 'outputs' / 'testfile.txt'
+    for fp in (infile, outfile):
+        fp.parent.mkdir()
+        fp.touch()
+
+    cmsg = (
+        '[DATALAD RUNCMD] /home/mih/env/datalad-dev/bin/python -c ...\n\n'
+        '=== Do not change lines below ===\n'
+        '{\n'
+        ' "chain": [],\n'
+        ' "cmd": "{python} -c True",\n'
+        # use the ID of the test dataset to ensure proper association
+        f' "dsid": "{ds.id}",\n'
+        ' "exit": 0,\n'
+        ' "extra_inputs": [],\n'
+        ' "inputs": [\n'
+        # make windows path, used to be stored in escaped form
+        r'   "inputs\\testfile.txt"' '\n'
+        ' ],\n'
+        ' "outputs": [\n'
+        # make windows path, used to be stored in escaped form
+        r'   "outputs\\testfile.txt"' '\n'
+        ' ],\n'
+        ' "pwd": "."\n'
+        '}\n'
+        '^^^ Do not change lines above ^^^\n'
+    )
+    msg, run_info = get_run_info(ds, cmsg)
+    assert run_info['inputs'][0] == str(Path('inputs', 'testfile.txt'))
+    assert run_info['outputs'][0] == str(Path('outputs', 'testfile.txt'))

From 099fa24d805db50417c86d7e6ce6598ac04e1dee Mon Sep 17 00:00:00 2001
From: Michael Hanke <michael.hanke@gmail.com>
Date: Thu, 19 Oct 2023 22:33:31 +0200
Subject: [PATCH 2/2] Store run-record path specs POSIX and report platform
 paths

There is limited support for reading and acting on old run-records
that have paths stored in platform conventions. Detection works
when the path matching an existing item on the file system.

Paths are always stored in POSIX notation, whenever they are relative.
---
 datalad_next/patches/run.py | 111 +++++++++++++++++++++++++++++++++++-
 1 file changed, 110 insertions(+), 1 deletion(-)

diff --git a/datalad_next/patches/run.py b/datalad_next/patches/run.py
index b7672d7a..6348c1bc 100644
--- a/datalad_next/patches/run.py
+++ b/datalad_next/patches/run.py
@@ -1,4 +1,10 @@
-"""Enhance ``run()`` placeholder substitutions to honor configuration defaults
+"""Enhance datalad-core's ``run()``
+
+Portable path handling logic for run-records
+--------------------------------------------
+
+Placeholder substitutions to honor configuration defaults
+---------------------------------------------------------
 
 Previously, ``run()`` would not recognize configuration defaults for
 placeholder substitution. This means that any placeholders globally declared in
@@ -19,6 +25,12 @@
 """
 
 from itertools import filterfalse
+from os.path import lexists
+from pathlib import (
+    PurePath,
+    PureWindowsPath,
+    PurePosixPath,
+)
 import sys
 
 from datalad.core.local.run import (
@@ -26,7 +38,10 @@
     SequenceFormatter,
     normalize_command,
     quote_cmdlinearg,
+    _create_record as _orig_create_record,
 )
+from datalad.distribution.dataset import Dataset
+from datalad.local.rerun import get_run_info as _orig_get_run_info
 from datalad.interface.common_cfg import definitions as cfg_defs
 from datalad.support.constraints import EnsureStr
 from datalad.support.extensions import register_config
@@ -34,6 +49,95 @@
 from . import apply_patch
 
 
+# Deals with https://github.com/datalad/datalad/issues/7512
+def _create_record(run_info, sidecar_flag, ds):
+    # convert any input/output specification to a POSIX path
+    for k in ('inputs', 'outputs'):
+        if k not in run_info:
+            continue
+        run_info[k] = [_get_posix_relpath_for_runrecord(p)
+                       for p in run_info[k]]
+
+    return _orig_create_record(run_info, sidecar_flag, ds)
+
+
+def _get_posix_relpath_for_runrecord(path):
+    p = PurePath(path)
+    if p.is_absolute():
+        # there is no point in converting an absolute path
+        # to a different platform convention.
+        # return as-is
+        return path
+
+    return str(PurePosixPath(p))
+
+
+# Deals with https://github.com/datalad/datalad/issues/7512
+def get_run_info(dset, message):
+    msg, run_info = _orig_get_run_info(dset, message)
+    if run_info is None:
+        # nothing to process, return as-is
+        return msg, run_info
+
+    for k in ('inputs', 'outputs'):
+        if k not in run_info:
+            continue
+        run_info[k] = [_get_platform_path_from_runrecord(p, dset)
+                       for p in run_info[k]]
+    return msg, run_info
+
+
+def _get_platform_path_from_runrecord(path: str, ds: Dataset) -> PurePath:
+    """Helper to standardize run_info path handling
+
+    Previously, run-records would contain platform-paths (e.g., windows paths
+    when added on windows, POSIX paths elsewhere). This made cross-platform
+    rerun impossible out-of-the box, but it also means that such dataset are
+    out there in unknown numbers.
+
+    This helper inspects any input/output path reported by get_run_info()
+    and tries to ensure that it matches platform conventions.
+
+    Parameters
+    ----------
+    path: str
+      A str-path from an input/output specification
+    ds: Dataset
+      This dataset's base path is used for existence testing for
+      convention determination.
+
+    Returns
+    -------
+    str
+    """
+    # we only need to act differently, when an incoming path is
+    # windows. This is not possible to say with 100% confidence,
+    # because a POSIX path can also contain a backslash. We support
+    # a few standard cases where we CAN tell
+    try:
+        pathobj = None
+        if '\\' not in path:
+            # no windows pathsep, no problem
+            pathobj = PurePosixPath(path)
+        # let's assume it is windows for a moment
+        elif lexists(str(ds.pathobj / PureWindowsPath(path))):
+            # if there is something on the filesystem for this path,
+            # we can be reasonably sure that this is indeed a windows
+            # path. This won't catch everything, but better than nothing
+            pathobj = PureWindowsPath(path)
+        else:
+            # if we get here, we have no idea, and no means to verify
+            # further hypotheses -- go with the POSIX assumption
+            # and hope for the best
+            pathobj = PurePosixPath(path)
+        assert pathobj is not None
+    except Exception:
+        return path
+
+    # we report in platform-conventions
+    return str(PurePath(pathobj))
+
+
 # This function is taken from datalad-core@a96c51c0b2794b2a2b4432ec7bd51f260cb91a37
 # datalad/core/local/run.py
 # The change has been proposed in https://github.com/datalad/datalad/pull/7509
@@ -80,6 +184,11 @@ def not_subst(x):
 
 apply_patch(
     'datalad.core.local.run', None, 'format_command', format_command)
+apply_patch(
+    'datalad.core.local.run', None, '_create_record', _create_record)
+apply_patch(
+    'datalad.local.rerun', None, 'get_run_info', get_run_info)
+
 register_config(
     'datalad.run.substitutions.python',
     'Substitution for {python} placeholder',