From b81e427cd1b6bf1db2ac7d56043ddd885f255d9d Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Fri, 2 Aug 2024 16:39:59 -0700 Subject: [PATCH] fix #619 remove nodeSelector provisioner-nodepool --- axlearn/cloud/gcp/job.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/axlearn/cloud/gcp/job.py b/axlearn/cloud/gcp/job.py index 97fc1efd..bce846fa 100644 --- a/axlearn/cloud/gcp/job.py +++ b/axlearn/cloud/gcp/job.py @@ -538,20 +538,6 @@ def _build_pod(self) -> Nested[Any]: PRE_PROVISIONER_LABEL: cfg.name, } ) - else: - # Used by GCP auto-provisioner. - selector.update( - { - # NOTE: This is an arbitrary key, with a value that must be unique to the - # jobset. This forces the jobset to be associated with its own node pool; - # without this, the TPU provisioner may create a node pool and the scheduler may - # schedule a different jobset onto the node pool, which can cause conflicts if - # the original jobset attempts to restart (node pool conflict). This is more - # reliable at the moment but doesn't take advantage of node pool sharing. GCP is - # working on a fix. - "provisioner-nodepool-id": cfg.name, - } - ) annotations.update( {