diff --git a/modules/nextflow/src/main/groovy/nextflow/processor/TaskPollingMonitor.groovy b/modules/nextflow/src/main/groovy/nextflow/processor/TaskPollingMonitor.groovy index 4e97543d2e..f65237ca53 100644 --- a/modules/nextflow/src/main/groovy/nextflow/processor/TaskPollingMonitor.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/processor/TaskPollingMonitor.groovy @@ -16,6 +16,12 @@ package nextflow.processor +import nextflow.cloud.CloudSpotTerminationException +import nextflow.exception.FailedGuardException +import nextflow.exception.ProcessEvalException +import nextflow.exception.ProcessException +import nextflow.exception.ProcessRetryableException + import static nextflow.processor.TaskProcessor.* import java.util.concurrent.ExecutorService @@ -573,6 +579,16 @@ class TaskPollingMonitor implements TaskMonitor { checkTaskStatus(handler) } catch (Throwable error) { + // At this point NF assumes the job is no longer running, but an error raised during monitoring can leave the job running (#5516). + // In that case NF must make sure the job is killed: kill() is attempted unless the error is one of the exception types excluded below, and a failure to kill is only logged at debug level so that the original error is still passed to handleException. + if( error !instanceof ProcessException && error !instanceof ProcessRetryableException && error !instanceof CloudSpotTerminationException + && error !instanceof ProcessEvalException && error !instanceof FailedGuardException) { + try { + handler.kill() + } catch( Throwable t ) { + log.debug("Unable to cancel task ${handler.task.lazyName()} after error", t) + } + } handleException(handler, error) } }