From e5fe41a266d7e690e76879a65e3274bcca97391f Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sat, 28 Dec 2024 18:15:40 -0800 Subject: [PATCH 1/6] Fix purge not cleaning up stale k8s context cluster --- sky/backends/cloud_vm_ray_backend.py | 58 +++++++++++++++------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 9d94f469df3..48ac8494276 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -4195,34 +4195,40 @@ def post_teardown_cleanup(self, # that successfully call this function but do not first call # teardown_cluster or terminate_instances. See # https://github.com/skypilot-org/skypilot/pull/4443#discussion_r1872798032 + # If purge is set, we do not care about instance status and can skip + # the check. attempts = 0 - while True: - logger.debug(f'instance statuses attempt {attempts + 1}') - node_status_dict = provision_lib.query_instances( - repr(cloud), - cluster_name_on_cloud, - config['provider'], - non_terminated_only=False) - - unexpected_node_state: Optional[Tuple[str, str]] = None - for node_id, node_status in node_status_dict.items(): - logger.debug(f'{node_id} status: {node_status}') - # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish - # between "stopping/stopped" and "terminating/terminated", so we - # allow for either status instead of casing on `terminate`. - if node_status not in [None, status_lib.ClusterStatus.STOPPED]: - unexpected_node_state = (node_id, node_status) - - if unexpected_node_state is None: - break + if not purge: + while True: + logger.debug(f'instance statuses attempt {attempts + 1}') + node_status_dict = provision_lib.query_instances( + repr(cloud), + cluster_name_on_cloud, + config['provider'], + non_terminated_only=False) + + unexpected_node_state: Optional[Tuple[str, str]] = None + for node_id, node_status in node_status_dict.items(): + logger.debug(f'{node_id} status: {node_status}') + # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish + # between "stopping/stopped" and "terminating/terminated", + # so we allow for either status instead of casing + # on `terminate`. + if node_status not in [ + None, status_lib.ClusterStatus.STOPPED + ]: + unexpected_node_state = (node_id, node_status) + + if unexpected_node_state is None: + break - attempts += 1 - if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS: - time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS) - else: - (node_id, node_status) = unexpected_node_state - raise RuntimeError(f'Instance {node_id} in unexpected state ' - f'{node_status}.') + attempts += 1 + if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS: + time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS) + else: + (node_id, node_status) = unexpected_node_state + raise RuntimeError(f'Instance {node_id} in unexpected ' + 'state {node_status}.') global_user_state.remove_cluster(handle.cluster_name, terminate=terminate) From c08fcc3a0d4f18499d5ddf4f1a81d2bce4bae65e Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sat, 28 Dec 2024 18:30:48 -0800 Subject: [PATCH 2/6] update comment --- sky/backends/cloud_vm_ray_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 48ac8494276..36f309927c2 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -4195,8 +4195,8 @@ def post_teardown_cleanup(self, # that successfully call this function but do not first call # teardown_cluster or terminate_instances. See # https://github.com/skypilot-org/skypilot/pull/4443#discussion_r1872798032 - # If purge is set, we do not care about instance status and can skip - # the check. + # If purge is set, we do not care about instance status and should skip + # the check because it may fail if the cluster is not reachable. attempts = 0 if not purge: while True: From c58dcaaf351d5e635dcf888cba67cee4e4eb3662 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sun, 5 Jan 2025 16:27:26 -0800 Subject: [PATCH 3/6] Apply purge after printing warnings. --- sky/backends/cloud_vm_ray_backend.py | 59 +++++++++++++++++----------- 1 file changed, 37 insertions(+), 22 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 36f309927c2..a4912844264 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -4198,37 +4198,52 @@ def post_teardown_cleanup(self, # If purge is set, we do not care about instance status and should skip # the check because it may fail if the cluster is not reachable. attempts = 0 - if not purge: - while True: - logger.debug(f'instance statuses attempt {attempts + 1}') + while True: + logger.debug(f'instance statuses attempt {attempts + 1}') + try: node_status_dict = provision_lib.query_instances( repr(cloud), cluster_name_on_cloud, config['provider'], non_terminated_only=False) - - unexpected_node_state: Optional[Tuple[str, str]] = None - for node_id, node_status in node_status_dict.items(): - logger.debug(f'{node_id} status: {node_status}') - # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish - # between "stopping/stopped" and "terminating/terminated", - # so we allow for either status instead of casing - # on `terminate`. - if node_status not in [ - None, status_lib.ClusterStatus.STOPPED - ]: - unexpected_node_state = (node_id, node_status) - - if unexpected_node_state is None: + except Exception as e: + if purge: + logger.warning( + f'Failed to query instances. Skipping since purge is ' + f'set. Details: ' + f'{common_utils.format_exception(e, use_bracket=True)}') break + else: + raise + + unexpected_node_state: Optional[Tuple[str, str]] = None + for node_id, node_status in node_status_dict.items(): + logger.debug(f'{node_id} status: {node_status}') + # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish + # between "stopping/stopped" and "terminating/terminated", + # so we allow for either status instead of casing + # on `terminate`. + if node_status not in [ + None, status_lib.ClusterStatus.STOPPED + ]: + unexpected_node_state = (node_id, node_status) + + if unexpected_node_state is None: + break - attempts += 1 - if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS: - time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS) + attempts += 1 + if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS: + time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS) + else: + (node_id, node_status) = unexpected_node_state + if purge: + logger.warning(f'Instance {node_id} in unexpected ' + 'state {node_status}. Skipping since purge ' + 'is set.') + break else: - (node_id, node_status) = unexpected_node_state raise RuntimeError(f'Instance {node_id} in unexpected ' - 'state {node_status}.') + 'state {node_status}.') global_user_state.remove_cluster(handle.cluster_name, terminate=terminate) From 754237a186058ec677996197cc4978831554921e Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sun, 5 Jan 2025 16:29:26 -0800 Subject: [PATCH 4/6] lint --- sky/backends/cloud_vm_ray_backend.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index a4912844264..babca5dc2b5 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -4206,7 +4206,7 @@ def post_teardown_cleanup(self, cluster_name_on_cloud, config['provider'], non_terminated_only=False) - except Exception as e: + except Exception as e: # pylint: disable=broad-except if purge: logger.warning( f'Failed to query instances. Skipping since purge is ' @@ -4223,9 +4223,7 @@ def post_teardown_cleanup(self, # between "stopping/stopped" and "terminating/terminated", # so we allow for either status instead of casing # on `terminate`. - if node_status not in [ - None, status_lib.ClusterStatus.STOPPED - ]: + if node_status not in [None, status_lib.ClusterStatus.STOPPED]: unexpected_node_state = (node_id, node_status) if unexpected_node_state is None: @@ -4238,12 +4236,12 @@ def post_teardown_cleanup(self, (node_id, node_status) = unexpected_node_state if purge: logger.warning(f'Instance {node_id} in unexpected ' - 'state {node_status}. Skipping since purge ' - 'is set.') + 'state {node_status}. Skipping since purge ' + 'is set.') break else: raise RuntimeError(f'Instance {node_id} in unexpected ' - 'state {node_status}.') + 'state {node_status}.') global_user_state.remove_cluster(handle.cluster_name, terminate=terminate) From 17ef3af0d750a97dc8b9dd3273ecb98954386348 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sun, 5 Jan 2025 16:43:50 -0800 Subject: [PATCH 5/6] Fix comments --- sky/backends/cloud_vm_ray_backend.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index babca5dc2b5..92449ad2e5d 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -4195,8 +4195,6 @@ def post_teardown_cleanup(self, # that successfully call this function but do not first call # teardown_cluster or terminate_instances. See # https://github.com/skypilot-org/skypilot/pull/4443#discussion_r1872798032 - # If purge is set, we do not care about instance status and should skip - # the check because it may fail if the cluster is not reachable. attempts = 0 while True: logger.debug(f'instance statuses attempt {attempts + 1}') @@ -4220,9 +4218,8 @@ def post_teardown_cleanup(self, for node_id, node_status in node_status_dict.items(): logger.debug(f'{node_id} status: {node_status}') # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish - # between "stopping/stopped" and "terminating/terminated", - # so we allow for either status instead of casing - # on `terminate`. + # between "stopping/stopped" and "terminating/terminated", so we + # allow for either status instead of casing on `terminate`. if node_status not in [None, status_lib.ClusterStatus.STOPPED]: unexpected_node_state = (node_id, node_status) @@ -4236,12 +4233,12 @@ def post_teardown_cleanup(self, (node_id, node_status) = unexpected_node_state if purge: logger.warning(f'Instance {node_id} in unexpected ' - 'state {node_status}. Skipping since purge ' + f'state {node_status}. Skipping since purge ' 'is set.') break else: raise RuntimeError(f'Instance {node_id} in unexpected ' - 'state {node_status}.') + f'state {node_status}.') global_user_state.remove_cluster(handle.cluster_name, terminate=terminate) From 3c0372f08a8860650947d2dceb5ed7aa0ba807ca Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sun, 5 Jan 2025 16:47:58 -0800 Subject: [PATCH 6/6] clean up condition --- sky/backends/cloud_vm_ray_backend.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 92449ad2e5d..3289500cd88 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -4211,8 +4211,7 @@ def post_teardown_cleanup(self, f'set. Details: ' f'{common_utils.format_exception(e, use_bracket=True)}') break - else: - raise + raise unexpected_node_state: Optional[Tuple[str, str]] = None for node_id, node_status in node_status_dict.items(): @@ -4236,9 +4235,8 @@ def post_teardown_cleanup(self, f'state {node_status}. Skipping since purge ' 'is set.') break - else: - raise RuntimeError(f'Instance {node_id} in unexpected ' - f'state {node_status}.') + raise RuntimeError(f'Instance {node_id} in unexpected ' + f'state {node_status}.') global_user_state.remove_cluster(handle.cluster_name, terminate=terminate)