From 5bdaac9447a989f6a12d2507a3eee78ddec70d60 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Sun, 15 Oct 2023 09:19:45 -0500 Subject: [PATCH] Improve `-dump-hashes` output adding json format (#4369) Signed-off-by: Ben Sherman Signed-off-by: Paolo Di Tommaso Co-authored-by: Paolo Di Tommaso --- docs/cache-and-resume.md | 27 +++++++++++++++++++ docs/cli.md | 5 +++- .../src/main/groovy/nextflow/Session.groovy | 4 +-- .../main/groovy/nextflow/cli/CmdRun.groovy | 2 +- .../main/groovy/nextflow/cli/Launcher.groovy | 4 +++ .../nextflow/config/ConfigBuilder.groovy | 7 ++--- .../nextflow/processor/TaskProcessor.groovy | 15 ++++++++++- 7 files changed, 56 insertions(+), 8 deletions(-) diff --git a/docs/cache-and-resume.md b/docs/cache-and-resume.md index 0fc5ddf4a0..55b890ef43 100644 --- a/docs/cache-and-resume.md +++ b/docs/cache-and-resume.md @@ -186,6 +186,8 @@ nextflow run rnaseq-nf -resume 4dc656d2-c410-44c8-bc32-7dd0ea87bebf You can use the {ref}`cli-log` command to view all previous runs as well as the task executions for each run. +(cache-compare-hashes)= + ### Comparing the hashes of two runs One way to debug a resumed run is to compare the task hashes of each run using the `-dump-hashes` option. @@ -196,3 +198,28 @@ One way to debug a resumed run is to compare the task hashes of each run using t 4. Compare the runs with a diff viewer While some manual effort is required, the final diff can often reveal the exact change that caused a task to be re-executed. + +:::{versionadded} 23.10.0 +::: + +When using `-dump-hashes json`, the task hashes can be more easily extracted into a diff. 
Here is an example Bash script to perform two runs and produce a diff: + +```bash +nextflow -log run_1.log run $pipeline -dump-hashes json +nextflow -log run_2.log run $pipeline -dump-hashes json -resume + +get_hashes() { + cat $1 \ + | grep 'cache hash:' \ + | cut -d ' ' -f 10- \ + | sort \ + | awk '{ print; print ""; }' +} + +get_hashes run_1.log > run_1.tasks.log +get_hashes run_2.log > run_2.tasks.log + +diff run_1.tasks.log run_2.tasks.log +``` + +You can then view the `diff` output or use a graphical diff viewer to compare `run_1.tasks.log` and `run_2.tasks.log`. diff --git a/docs/cli.md b/docs/cli.md index 5fd19d7aaa..dbe54312b2 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -1154,7 +1154,10 @@ The `run` command is used to execute a local pipeline script or remote pipeline : Dump channels for debugging purpose. `-dump-hashes` -: Dump task hash keys for debugging purpose. +: Dump task hash keys for debugging purposes. +: :::{versionadded} 23.10.0 + You can use `-dump-hashes json` to dump the task hash keys as JSON for easier post-processing. See the {ref}`caching and resuming tips <cache-compare-hashes>` for more details. + ::: `-e.=` : Add the specified variable to execution environment. 
diff --git a/modules/nextflow/src/main/groovy/nextflow/Session.groovy b/modules/nextflow/src/main/groovy/nextflow/Session.groovy index 7f8d4ecf24..668b1166ad 100644 --- a/modules/nextflow/src/main/groovy/nextflow/Session.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/Session.groovy @@ -247,11 +247,11 @@ class Session implements ISession { boolean getStatsEnabled() { statsEnabled } - private boolean dumpHashes + private String dumpHashes private List dumpChannels - boolean getDumpHashes() { dumpHashes } + String getDumpHashes() { dumpHashes } List getDumpChannels() { dumpChannels } diff --git a/modules/nextflow/src/main/groovy/nextflow/cli/CmdRun.groovy b/modules/nextflow/src/main/groovy/nextflow/cli/CmdRun.groovy index ec233778c9..7fa7d65298 100644 --- a/modules/nextflow/src/main/groovy/nextflow/cli/CmdRun.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/cli/CmdRun.groovy @@ -223,7 +223,7 @@ class CmdRun extends CmdBase implements HubOptions { String profile @Parameter(names=['-dump-hashes'], description = 'Dump task hash keys for debugging purpose') - boolean dumpHashes + String dumpHashes @Parameter(names=['-dump-channels'], description = 'Dump channels for debugging purpose') String dumpChannels diff --git a/modules/nextflow/src/main/groovy/nextflow/cli/Launcher.groovy b/modules/nextflow/src/main/groovy/nextflow/cli/Launcher.groovy index eabdb84bf0..50e9d8df2a 100644 --- a/modules/nextflow/src/main/groovy/nextflow/cli/Launcher.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/cli/Launcher.groovy @@ -224,6 +224,10 @@ class Launcher { normalized << '%all' } + else if( current == '-dump-hashes' && (i==args.size() || args[i].startsWith('-'))) { + normalized << '-' + } + else if( current == '-with-cloudcache' && (i==args.size() || args[i].startsWith('-'))) { normalized << '-' } diff --git a/modules/nextflow/src/main/groovy/nextflow/config/ConfigBuilder.groovy b/modules/nextflow/src/main/groovy/nextflow/config/ConfigBuilder.groovy index 
02270c9aa1..d30ad9e0dd 100644 --- a/modules/nextflow/src/main/groovy/nextflow/config/ConfigBuilder.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/config/ConfigBuilder.groovy @@ -595,9 +595,10 @@ class ConfigBuilder { if( config.isSet('resume') ) config.resume = normalizeResumeId(config.resume as String) - // -- sets `dumpKeys` option - if( cmdRun.dumpHashes ) - config.dumpHashes = cmdRun.dumpHashes + // -- sets `dumpHashes` option + if( cmdRun.dumpHashes ) { + config.dumpHashes = cmdRun.dumpHashes != '-' ? cmdRun.dumpHashes : 'default' + } if( cmdRun.dumpChannels ) config.dumpChannels = cmdRun.dumpChannels.tokenize(',') diff --git a/modules/nextflow/src/main/groovy/nextflow/processor/TaskProcessor.groovy b/modules/nextflow/src/main/groovy/nextflow/processor/TaskProcessor.groovy index c31597e1a3..bfcb34d4da 100644 --- a/modules/nextflow/src/main/groovy/nextflow/processor/TaskProcessor.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/processor/TaskProcessor.groovy @@ -32,6 +32,7 @@ import java.util.regex.Pattern import ch.artecat.grengine.Grengine import com.google.common.hash.HashCode +import groovy.json.JsonOutput import groovy.transform.CompileStatic import groovy.transform.Memoized import groovy.transform.PackageScope @@ -2155,7 +2156,9 @@ class TaskProcessor { final mode = config.getHashMode() final hash = computeHash(keys, mode) if( session.dumpHashes ) { - traceInputsHashes(task, keys, mode, hash) + session.dumpHashes=='json' + ? 
traceInputsHashesJson(task, keys, mode, hash) + : traceInputsHashes(task, keys, mode, hash) } return hash } @@ -2191,6 +2194,16 @@ class TaskProcessor { return result } + private void traceInputsHashesJson( TaskRun task, List entries, CacheHelper.HashMode mode, hash ) { + final collector = (item) -> [ + hash: CacheHelper.hasher(item, mode).hash().toString(), + type: item?.getClass()?.getName(), + value: item?.toString() + ] + final json = JsonOutput.toJson(entries.collect(collector)) + log.info "[${safeTaskName(task)}] cache hash: ${hash}; mode: ${mode}; entries: ${JsonOutput.prettyPrint(json)}" + } + private void traceInputsHashes( TaskRun task, List entries, CacheHelper.HashMode mode, hash ) { def buffer = new StringBuilder()