Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add two tests that document portability issues of run-records #491

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 110 additions & 1 deletion datalad_next/patches/run.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
"""Enhance ``run()`` placeholder substitutions to honor configuration defaults
"""Enhance datalad-core's ``run()``

Portable path handling logic for run-records
--------------------------------------------

Placeholder substitutions to honor configuration defaults
---------------------------------------------------------

Previously, ``run()`` would not recognize configuration defaults for
placeholder substitution. This means that any placeholders globally declared in
Expand All @@ -19,21 +25,119 @@
"""

from itertools import filterfalse
from os.path import lexists
from pathlib import (
PurePath,
PureWindowsPath,
PurePosixPath,
)
import sys

from datalad.core.local.run import (
GlobbedPaths,
SequenceFormatter,
normalize_command,
quote_cmdlinearg,
_create_record as _orig_create_record,
)
from datalad.distribution.dataset import Dataset
from datalad.local.rerun import get_run_info as _orig_get_run_info
from datalad.interface.common_cfg import definitions as cfg_defs
from datalad.support.constraints import EnsureStr
from datalad.support.extensions import register_config

from . import apply_patch


# Deals with https://github.com/datalad/datalad/issues/7512
def _create_record(run_info, sidecar_flag, ds):
# convert any input/output specification to a POSIX path
for k in ('inputs', 'outputs'):
if k not in run_info:
continue

Check warning on line 57 in datalad_next/patches/run.py

View check run for this annotation

Codecov / codecov/patch

datalad_next/patches/run.py#L57

Added line #L57 was not covered by tests
run_info[k] = [_get_posix_relpath_for_runrecord(p)
for p in run_info[k]]

return _orig_create_record(run_info, sidecar_flag, ds)


def _get_posix_relpath_for_runrecord(path):
p = PurePath(path)
if p.is_absolute():
# there is no point in converting an absolute path
# to a different platform convention.
# return as-is
return path

Check warning on line 70 in datalad_next/patches/run.py

View check run for this annotation

Codecov / codecov/patch

datalad_next/patches/run.py#L70

Added line #L70 was not covered by tests

return str(PurePosixPath(p))


# Deals with https://github.com/datalad/datalad/issues/7512
def get_run_info(dset, message):
msg, run_info = _orig_get_run_info(dset, message)
if run_info is None:
# nothing to process, return as-is
return msg, run_info

for k in ('inputs', 'outputs'):
if k not in run_info:
continue
run_info[k] = [_get_platform_path_from_runrecord(p, dset)
for p in run_info[k]]
return msg, run_info


def _get_platform_path_from_runrecord(path: str, ds: Dataset) -> PurePath:
"""Helper to standardize run_info path handling

Previously, run-records would contain platform-paths (e.g., windows paths
when added on windows, POSIX paths elsewhere). This made cross-platform
rerun impossible out-of-the box, but it also means that such dataset are
out there in unknown numbers.

This helper inspects any input/output path reported by get_run_info()
and tries to ensure that it matches platform conventions.

Parameters
----------
path: str
A str-path from an input/output specification
ds: Dataset
This dataset's base path is used for existence testing for
convention determination.

Returns
-------
str
"""
# we only need to act differently, when an incoming path is
# windows. This is not possible to say with 100% confidence,
# because a POSIX path can also contain a backslash. We support
# a few standard cases where we CAN tell
try:
pathobj = None
if '\\' not in path:
# no windows pathsep, no problem
pathobj = PurePosixPath(path)
# let's assume it is windows for a moment
elif lexists(str(ds.pathobj / PureWindowsPath(path))):
# if there is something on the filesystem for this path,
# we can be reasonably sure that this is indeed a windows
# path. This won't catch everything, but better than nothing
pathobj = PureWindowsPath(path)
else:
# if we get here, we have no idea, and no means to verify
# further hypotheses -- go with the POSIX assumption
# and hope for the best
pathobj = PurePosixPath(path)

Check warning on line 132 in datalad_next/patches/run.py

View check run for this annotation

Codecov / codecov/patch

datalad_next/patches/run.py#L132

Added line #L132 was not covered by tests
assert pathobj is not None
except Exception:
return path

Check warning on line 135 in datalad_next/patches/run.py

View check run for this annotation

Codecov / codecov/patch

datalad_next/patches/run.py#L134-L135

Added lines #L134 - L135 were not covered by tests

# we report in platform-conventions
return str(PurePath(pathobj))


# This function is taken from datalad-core@a96c51c0b2794b2a2b4432ec7bd51f260cb91a37
# datalad/core/local/run.py
# The change has been proposed in https://github.com/datalad/datalad/pull/7509
Expand Down Expand Up @@ -80,6 +184,11 @@

apply_patch(
'datalad.core.local.run', None, 'format_command', format_command)
apply_patch(
'datalad.core.local.run', None, '_create_record', _create_record)
apply_patch(
'datalad.local.rerun', None, 'get_run_info', get_run_info)

register_config(
'datalad.run.substitutions.python',
'Substitution for {python} placeholder',
Expand Down
88 changes: 88 additions & 0 deletions datalad_next/patches/tests/test_run.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from pathlib import Path
import pytest

from datalad.local.rerun import get_run_info

from datalad_next.exceptions import IncompleteResultsError
from datalad_next.tests.utils import (
SkipTest,
Expand All @@ -23,3 +26,88 @@ def test_substitution_config_default(existing_dataset):
# make sure we could actually detect breakage with the check above
with pytest.raises(IncompleteResultsError):
ds.run('{python} -c "breakage"', result_renderer='disabled')


def test_runrecord_portable_paths(existing_dataset):
ds = existing_dataset
dsrepo = ds.repo
infile = ds.pathobj / 'inputs' / 'testfile.txt'
outfile = ds.pathobj / 'outputs' / 'testfile.txt'
infile.parent.mkdir()
outfile.parent.mkdir()
infile.touch()
ds.save()
assert not outfile.exists()
# script copies any 'inputs' to the outputs dir
res = ds.run(
'{python} -c "'
'from shutil import copyfile;'
'from pathlib import Path;'
r"""[copyfile(f.strip('\"'), Path.cwd() / \"outputs\" / Path(f.strip('\"')).name)"""
# we need to use a raw string to contain the inputs expansion,
# on windows they would contain backslashes that are unescaped
r""" for f in r'{inputs}'.split()]"""
'"',
result_renderer='disabled',
# we need to pass relative paths ourselves
# https://github.com/datalad/datalad/issues/7516
inputs=[str(infile.relative_to(ds.pathobj))],
outputs=[str(outfile.relative_to(ds.pathobj))],
)
# verify basic outcome
assert_result_count(res, 1, action='run', status='ok')
assert outfile.exists()

# branch we expect the runrecord on
branch = dsrepo.get_corresponding_branch() or dsrepo.get_active_branch()
cmsg = dsrepo.format_commit('%B', branch)

# the IOspecs are stored in POSIX conventions
assert r'"inputs/testfile.txt"' in cmsg
assert r'"outputs/testfile.txt"' in cmsg

# get_run_info() reports in platform conventions
msg, run_info = get_run_info(ds, cmsg)
assert run_info
for k in ('inputs', 'outputs'):
specs = run_info.get(k)
assert len(specs) > 0
for p in specs:
assert (ds.pathobj / p).exists()


def test_runrecord_oldnative_paths(existing_dataset):
ds = existing_dataset
# this test is imitating a rerun situation, so we create the
# inputs and outputs
infile = ds.pathobj / 'inputs' / 'testfile.txt'
outfile = ds.pathobj / 'outputs' / 'testfile.txt'
for fp in (infile, outfile):
fp.parent.mkdir()
fp.touch()

cmsg = (
'[DATALAD RUNCMD] /home/mih/env/datalad-dev/bin/python -c ...\n\n'
'=== Do not change lines below ===\n'
'{\n'
' "chain": [],\n'
' "cmd": "{python} -c True",\n'
# use the ID of the test dataset to ensure proper association
f' "dsid": "{ds.id}",\n'
' "exit": 0,\n'
' "extra_inputs": [],\n'
' "inputs": [\n'
# make windows path, used to be stored in escaped form
r' "inputs\\testfile.txt"' '\n'
' ],\n'
' "outputs": [\n'
# make windows path, used to be stored in escaped form
r' "outputs\\testfile.txt"' '\n'
' ],\n'
' "pwd": "."\n'
'}\n'
'^^^ Do not change lines above ^^^\n'
)
msg, run_info = get_run_info(ds, cmsg)
assert run_info['inputs'][0] == str(Path('inputs', 'testfile.txt'))
assert run_info['outputs'][0] == str(Path('outputs', 'testfile.txt'))
Loading