datalad · mih · Oct 19, 2023 · Oct 19, 2023
@@ -1,4 +1,10 @@
-"""Enhance ``run()`` placeholder substitutions to honor configuration defaults
+"""Enhance datalad-core's ``run()``
+
+Portable path handling logic for run-records
+--------------------------------------------
+
+Placeholder substitutions to honor configuration defaults
+---------------------------------------------------------
 
 Previously, ``run()`` would not recognize configuration defaults for
 placeholder substitution. This means that any placeholders globally declared in
@@ -19,21 +25,119 @@
 """
 
 from itertools import filterfalse
+from os.path import lexists
+from pathlib import (
+    PurePath,
+    PureWindowsPath,
+    PurePosixPath,
+)
 import sys
 
 from datalad.core.local.run import (
     GlobbedPaths,
     SequenceFormatter,
     normalize_command,
     quote_cmdlinearg,
+    _create_record as _orig_create_record,
 )
+from datalad.distribution.dataset import Dataset
+from datalad.local.rerun import get_run_info as _orig_get_run_info
 from datalad.interface.common_cfg import definitions as cfg_defs
 from datalad.support.constraints import EnsureStr
 from datalad.support.extensions import register_config
 
 from . import apply_patch
 
 
+# Deals with https://github.com/datalad/datalad/issues/7512
+def _create_record(run_info, sidecar_flag, ds):
+    # convert any input/output specification to a POSIX path
+    for k in ('inputs', 'outputs'):
+        if k not in run_info:
+            continue
+        run_info[k] = [_get_posix_relpath_for_runrecord(p)
+                       for p in run_info[k]]
+
+    return _orig_create_record(run_info, sidecar_flag, ds)
+
+
+def _get_posix_relpath_for_runrecord(path):
+    p = PurePath(path)
+    if p.is_absolute():
+        # there is no point in converting an absolute path
+        # to a different platform convention.
+        # return as-is
+        return path
+
+    return str(PurePosixPath(p))
+
+
+# Deals with https://github.com/datalad/datalad/issues/7512
+def get_run_info(dset, message):
+    msg, run_info = _orig_get_run_info(dset, message)
+    if run_info is None:
+        # nothing to process, return as-is
+        return msg, run_info
+
+    for k in ('inputs', 'outputs'):
+        if k not in run_info:
+            continue
+        run_info[k] = [_get_platform_path_from_runrecord(p, dset)
+                       for p in run_info[k]]
+    return msg, run_info
+
+
+def _get_platform_path_from_runrecord(path: str, ds: Dataset) -> PurePath:
+    """Helper to standardize run_info path handling
+
+    Previously, run-records would contain platform-paths (e.g., windows paths
+    when added on windows, POSIX paths elsewhere). This made cross-platform
+    rerun impossible out-of-the box, but it also means that such dataset are
+    out there in unknown numbers.
+
+    This helper inspects any input/output path reported by get_run_info()
+    and tries to ensure that it matches platform conventions.
+
+    Parameters
+    ----------
+    path: str
+      A str-path from an input/output specification
+    ds: Dataset
+      This dataset's base path is used for existence testing for
+      convention determination.
+
+    Returns
+    -------
+    str
+    """
+    # we only need to act differently, when an incoming path is
+    # windows. This is not possible to say with 100% confidence,
+    # because a POSIX path can also contain a backslash. We support
+    # a few standard cases where we CAN tell
+    try:
+        pathobj = None
+        if '\\' not in path:
+            # no windows pathsep, no problem
+            pathobj = PurePosixPath(path)
+        # let's assume it is windows for a moment
+        elif lexists(str(ds.pathobj / PureWindowsPath(path))):
+            # if there is something on the filesystem for this path,
+            # we can be reasonably sure that this is indeed a windows
+            # path. This won't catch everything, but better than nothing
+            pathobj = PureWindowsPath(path)
+        else:
+            # if we get here, we have no idea, and no means to verify
+            # further hypotheses -- go with the POSIX assumption
+            # and hope for the best
+            pathobj = PurePosixPath(path)
+        assert pathobj is not None
+    except Exception:
+        return path
+
+    # we report in platform-conventions
+    return str(PurePath(pathobj))
+
+
 # This function is taken from datalad-core@a96c51c0b2794b2a2b4432ec7bd51f260cb91a37
 # datalad/core/local/run.py
 # The change has been proposed in https://github.com/datalad/datalad/pull/7509
@@ -80,6 +184,11 @@
 
 apply_patch(
     'datalad.core.local.run', None, 'format_command', format_command)
+apply_patch(
+    'datalad.core.local.run', None, '_create_record', _create_record)
+apply_patch(
+    'datalad.local.rerun', None, 'get_run_info', get_run_info)
+
 register_config(
     'datalad.run.substitutions.python',
     'Substitution for {python} placeholder',

@@ -1,5 +1,8 @@
+from pathlib import Path
 import pytest
 
+from datalad.local.rerun import get_run_info
+
 from datalad_next.exceptions import IncompleteResultsError
 from datalad_next.tests.utils import (
     SkipTest,
@@ -23,3 +26,88 @@ def test_substitution_config_default(existing_dataset):
     # make sure we could actually detect breakage with the check above
     with pytest.raises(IncompleteResultsError):
         ds.run('{python} -c "breakage"', result_renderer='disabled')
+
+
+def test_runrecord_portable_paths(existing_dataset):
+    ds = existing_dataset
+    dsrepo = ds.repo
+    infile = ds.pathobj / 'inputs' / 'testfile.txt'
+    outfile = ds.pathobj / 'outputs' / 'testfile.txt'
+    infile.parent.mkdir()
+    outfile.parent.mkdir()
+    infile.touch()
+    ds.save()
+    assert not outfile.exists()
+    # script copies any 'inputs' to the outputs dir
+    res = ds.run(
+        '{python} -c "'
+        'from shutil import copyfile;'
+        'from pathlib import Path;'
+        r"""[copyfile(f.strip('\"'), Path.cwd() / \"outputs\" / Path(f.strip('\"')).name)"""
+        # we need to use a raw string to contain the inputs expansion,
+        # on windows they would contain backslashes that are unescaped
+        r""" for f in r'{inputs}'.split()]"""
+        '"',
+        result_renderer='disabled',
+        # we need to pass relative paths ourselves
+        # https://github.com/datalad/datalad/issues/7516
+        inputs=[str(infile.relative_to(ds.pathobj))],
+        outputs=[str(outfile.relative_to(ds.pathobj))],
+    )
+    # verify basic outcome
+    assert_result_count(res, 1, action='run', status='ok')
+    assert outfile.exists()
+
+    # branch we expect the runrecord on
+    branch = dsrepo.get_corresponding_branch() or dsrepo.get_active_branch()
+    cmsg = dsrepo.format_commit('%B', branch)
+
+    # the IOspecs are stored in POSIX conventions
+    assert r'"inputs/testfile.txt"' in cmsg
+    assert r'"outputs/testfile.txt"' in cmsg
+
+    # get_run_info() reports in platform conventions
+    msg, run_info = get_run_info(ds, cmsg)
+    assert run_info
+    for k in ('inputs', 'outputs'):
+        specs = run_info.get(k)
+        assert len(specs) > 0
+        for p in specs:
+            assert (ds.pathobj / p).exists()
+
+
+def test_runrecord_oldnative_paths(existing_dataset):
+    ds = existing_dataset
+    # this test is imitating a rerun situation, so we create the
+    # inputs and outputs
+    infile = ds.pathobj / 'inputs' / 'testfile.txt'
+    outfile = ds.pathobj / 'outputs' / 'testfile.txt'
+    for fp in (infile, outfile):
+        fp.parent.mkdir()
+        fp.touch()
+
+    cmsg = (
+        '[DATALAD RUNCMD] /home/mih/env/datalad-dev/bin/python -c ...\n\n'
+        '=== Do not change lines below ===\n'
+        '{\n'
+        ' "chain": [],\n'
+        ' "cmd": "{python} -c True",\n'
+        # use the ID of the test dataset to ensure proper association
+        f' "dsid": "{ds.id}",\n'
+        ' "exit": 0,\n'
+        ' "extra_inputs": [],\n'
+        ' "inputs": [\n'
+        # make windows path, used to be stored in escaped form
+        r'   "inputs\\testfile.txt"' '\n'
+        ' ],\n'
+        ' "outputs": [\n'
+        # make windows path, used to be stored in escaped form
+        r'   "outputs\\testfile.txt"' '\n'
+        ' ],\n'
+        ' "pwd": "."\n'
+        '}\n'
+        '^^^ Do not change lines above ^^^\n'
+    )
+    msg, run_info = get_run_info(ds, cmsg)
+    assert run_info['inputs'][0] == str(Path('inputs', 'testfile.txt'))
+    assert run_info['outputs'][0] == str(Path('outputs', 'testfile.txt'))