From 25db34e925579ea7f53d91854adabd561209c230 Mon Sep 17 00:00:00 2001 From: Matt Pawelczyk Date: Tue, 15 Oct 2024 15:37:43 +0000 Subject: [PATCH 1/4] Account that k8s pods may be uncounted yet. --- lib/galaxy/jobs/runners/kubernetes.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/galaxy/jobs/runners/kubernetes.py b/lib/galaxy/jobs/runners/kubernetes.py index 56fef20a0ecb..9b75870005d2 100644 --- a/lib/galaxy/jobs/runners/kubernetes.py +++ b/lib/galaxy/jobs/runners/kubernetes.py @@ -736,7 +736,10 @@ def check_watched_item(self, job_state): # as probably this means that the k8s API server hasn't # had time to fill in the object status since the # job was created only too recently. - if len(job.obj["status"]) == 0: + # It is possible that k8s didn't account for the status of the pods + # and they are in the uncountedTerminatedPods status. In this + # case we also need to wait a moment + if len(job.obj["status"]) == 0 or 'uncountedTerminatedPods' in job.obj["status"]: return job_state if "succeeded" in job.obj["status"]: succeeded = job.obj["status"]["succeeded"] From cbeedf371c0e63c394433b1f91a2e984c883d991 Mon Sep 17 00:00:00 2001 From: Matt Pawelczyk Date: Tue, 15 Oct 2024 17:00:23 +0000 Subject: [PATCH 2/4] Fix linting --- lib/galaxy/jobs/runners/kubernetes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/jobs/runners/kubernetes.py b/lib/galaxy/jobs/runners/kubernetes.py index 9b75870005d2..9a358c107acd 100644 --- a/lib/galaxy/jobs/runners/kubernetes.py +++ b/lib/galaxy/jobs/runners/kubernetes.py @@ -739,7 +739,7 @@ def check_watched_item(self, job_state): # It is possible that k8s didn't account for the status of the pods # and they are in the uncountedTerminatedPods status. In this # case we also need to wait a moment - if len(job.obj["status"]) == 0 or 'uncountedTerminatedPods' in job.obj["status"]: + if len(job.obj["status"]) == 0 or "uncountedTerminatedPods" in job.obj["status"]: return job_state if "succeeded" in job.obj["status"]: succeeded = job.obj["status"]["succeeded"] From c311092aad316805872670999538cada60f551f5 Mon Sep 17 00:00:00 2001 From: Matt Pawelczyk <125464188+mapk-amazon@users.noreply.github.com> Date: Wed, 16 Oct 2024 20:33:45 +0200 Subject: [PATCH 3/4] Update lib/galaxy/jobs/runners/kubernetes.py Co-authored-by: Marius van den Beek --- lib/galaxy/jobs/runners/kubernetes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/jobs/runners/kubernetes.py b/lib/galaxy/jobs/runners/kubernetes.py index 9a358c107acd..be3d4239ff79 100644 --- a/lib/galaxy/jobs/runners/kubernetes.py +++ b/lib/galaxy/jobs/runners/kubernetes.py @@ -739,7 +739,7 @@ def check_watched_item(self, job_state): # It is possible that k8s didn't account for the status of the pods # and they are in the uncountedTerminatedPods status. In this # case we also need to wait a moment - if len(job.obj["status"]) == 0 or "uncountedTerminatedPods" in job.obj["status"]: + if len(job.obj["status"]) == 0 or in job.obj["status"].get("uncountedTerminatedPods"): return job_state if "succeeded" in job.obj["status"]: succeeded = job.obj["status"]["succeeded"] From df1bb9a53632402ccd14a041657068c568cd3883 Mon Sep 17 00:00:00 2001 From: Matt Pawelczyk <125464188+mapk-amazon@users.noreply.github.com> Date: Wed, 16 Oct 2024 21:02:27 +0200 Subject: [PATCH 4/4] Update lib/galaxy/jobs/runners/kubernetes.py Co-authored-by: Nuwan Goonasekera <2070605+nuwang@users.noreply.github.com> --- lib/galaxy/jobs/runners/kubernetes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/jobs/runners/kubernetes.py b/lib/galaxy/jobs/runners/kubernetes.py index be3d4239ff79..68bbddc81ebb 100644 --- a/lib/galaxy/jobs/runners/kubernetes.py +++ b/lib/galaxy/jobs/runners/kubernetes.py @@ -739,7 +739,7 @@ def check_watched_item(self, job_state): # It is possible that k8s didn't account for the status of the pods # and they are in the uncountedTerminatedPods status. In this # case we also need to wait a moment - if len(job.obj["status"]) == 0 or in job.obj["status"].get("uncountedTerminatedPods"): + if len(job.obj["status"]) == 0 or job.obj["status"].get("uncountedTerminatedPods"): return job_state if "succeeded" in job.obj["status"]: succeeded = job.obj["status"]["succeeded"]