[SW-216413] - Fix new executors shutdown and shutdown_inc flow #716

Merged (3 commits) on Jan 22, 2025
2 changes: 1 addition & 1 deletion vllm/engine/arg_utils.py

@@ -407,7 +407,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             'or equal to the number of GPUs available, "mp" will be used to '
             'keep processing on a single host. Otherwise, this will default '
             'to "ray" if Ray is installed and fail otherwise. Note that tpu '
-            'and hpu only support Ray for distributed inference.')
+            'only support Ray for distributed inference.')

     parser.add_argument(
         '--worker-use-ray',
4 changes: 4 additions & 0 deletions vllm/executor/mp_distributed_executor.py

@@ -91,8 +91,12 @@ def _init_executor(self) -> None:
                 max_parallel_loading_workers)
         self.driver_exec_model = make_async(self.driver_worker.execute_model)
         self.pp_locks: Optional[List[asyncio.Lock]] = None
+        self.shutdown_workers = True

     def shutdown(self):
+        if getattr(self, 'shutdown_workers', False):
+            self._run_workers("shutdown")
+            self.shutdown_workers = False
         if (worker_monitor := getattr(self, "worker_monitor",
                                       None)) is not None:
             worker_monitor.close()
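Side note on the `getattr` guard above: it makes `shutdown()` idempotent and safe to call even if `_init_executor()` never ran (for example, when construction fails partway). A minimal, self-contained sketch of that pattern, where the class name and the `print` stand-in for the worker RPC are hypothetical:

```python
# Sketch of the idempotent-shutdown guard (hypothetical stand-ins).
class ExecutorSketch:
    def _init_executor(self) -> None:
        # The flag is only set once workers actually exist.
        self.shutdown_workers = True

    def _run_workers(self, method: str) -> None:
        print(f"broadcasting {method!r} to workers")  # stand-in for real RPC

    def shutdown(self) -> None:
        # getattr tolerates shutdown() before _init_executor();
        # clearing the flag makes repeated calls no-ops.
        if getattr(self, 'shutdown_workers', False):
            self._run_workers("shutdown")
            self.shutdown_workers = False

ex = ExecutorSketch()
ex.shutdown()        # no-op: executor was never initialized
ex._init_executor()
ex.shutdown()        # broadcasts "shutdown" exactly once
ex.shutdown()        # second call is a no-op
```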
57 changes: 0 additions & 57 deletions vllm/executor/multiproc_hpu_executor.py

This file was deleted.

10 changes: 10 additions & 0 deletions vllm/executor/ray_distributed_executor.py

@@ -98,7 +98,17 @@ def _init_executor(self) -> None:
         self.driver_exec_method = make_async(
             self.driver_worker.execute_method)

+        self.shutdown_workers = True
+        self.terminate_ray = True
+
     def shutdown(self) -> None:
+        if getattr(self, 'shutdown_workers', False):
+            self._run_workers("shutdown")
+            self.shutdown_workers = False
+        if getattr(self, 'terminate_ray', False):
+            for worker in self.workers:
+                worker.__ray_terminate__.remote()
+            self.terminate_ray = False
         if hasattr(self, "forward_dag") and self.forward_dag is not None:
             self.forward_dag.teardown()
             import ray
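For context on `worker.__ray_terminate__.remote()` above: it is Ray's built-in actor method for graceful termination, letting already-queued tasks drain before the actor process exits (unlike `ray.kill()`, which stops the actor immediately). A hedged, standalone sketch of the same two-phase teardown, per-worker cleanup first and actor termination second; it assumes a local Ray installation, and the `Worker` class is a hypothetical stand-in:

```python
import ray

# Hypothetical stand-in for a vLLM worker actor.
@ray.remote
class Worker:
    def shutdown(self) -> None:
        pass  # per-worker cleanup would go here

ray.init()
workers = [Worker.remote() for _ in range(2)]

# Phase 1: explicit per-worker cleanup, mirroring _run_workers("shutdown").
ray.get([w.shutdown.remote() for w in workers])

# Phase 2: graceful actor termination; queued tasks drain first.
for w in workers:
    w.__ray_terminate__.remote()

ray.shutdown()
```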
9 changes: 9 additions & 0 deletions vllm/executor/uniproc_executor.py

@@ -39,6 +39,8 @@ def _init_executor(self) -> None:
         self.collective_rpc("init_device")
         self.collective_rpc("load_model")

+        self.shutdown_worker = True
+
     def collective_rpc(self,
                        method: Union[str, Callable],
                        timeout: Optional[float] = None,

@@ -54,6 +56,11 @@ def check_health(self) -> None:
         # it's running.
         return

+    def shutdown(self):
+        if getattr(self, 'shutdown_worker', False):
+            self.collective_rpc("shutdown")
+            self.shutdown_worker = False
+

 UniProcExecutorAsync = UniProcExecutor

@@ -112,6 +119,8 @@ def _init_executor(self) -> None:
         self.collective_rpc("init_device")
         self.collective_rpc("load_model")

+        self.shutdown_worker = True
+
     def determine_num_available_blocks(self) -> Tuple[int, int]:
         """
         Determine the number of available KV blocks.
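In the single-process executor, `collective_rpc` is what carries the `"shutdown"` string to the one local worker. A simplified sketch of that dispatch; the class names are hypothetical, and the real signature also accepts a `Callable` and a timeout:

```python
from typing import Any, List

# Hypothetical stand-in for the single local worker.
class WorkerSketch:
    def shutdown(self) -> None:
        print("worker cleanup")

class UniProcExecutorSketch:
    def __init__(self) -> None:
        self.driver_worker = WorkerSketch()
        self.shutdown_worker = True

    def collective_rpc(self, method: str, *args: Any) -> List[Any]:
        # Resolve the method name on the lone worker and call it;
        # results come back as a one-element list.
        return [getattr(self.driver_worker, method)(*args)]

    def shutdown(self) -> None:
        if getattr(self, 'shutdown_worker', False):
            self.collective_rpc("shutdown")  # reaches WorkerSketch.shutdown
            self.shutdown_worker = False

UniProcExecutorSketch().shutdown()  # prints "worker cleanup"
```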
2 changes: 1 addition & 1 deletion vllm/worker/hpu_worker.py

@@ -475,7 +475,7 @@ def list_prompt_adapters(self) -> Set[int]:
         raise NotImplementedError(
             "Prompt Adapter is not implemented for HPU backend.")

-    def shutdown_inc(self):
+    def shutdown(self):
         self.model_runner.shutdown_inc()

     @property
4 changes: 4 additions & 0 deletions vllm/worker/worker_base.py

@@ -96,6 +96,10 @@ def execute_model(
     ) -> Optional[List[SamplerOutput]]:
         raise NotImplementedError

+    def shutdown(self) -> None:
+        """Shutdown the worker."""
+        return
+
     @abstractmethod
     def get_cache_block_size_bytes(self) -> int:
         """Return the size of a single cache block, in bytes. Used in

Review comment on the new shutdown() method:

Why is this method needed here?

Author's reply (thread marked as resolved):

I want to be able to call the worker's shutdown function from the executors, but the executors are not device-specific. Therefore all workers need some generic "shutdown" function, and I just adjusted the HPU worker's shutdown to do the device-specific work instead.
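To make that reply concrete: the base class ships a safe no-op `shutdown()` that every executor can call blindly, and a device-specific worker overrides it, as the HPU worker does above with its INC teardown. A minimal sketch with hypothetical class names:

```python
# Hypothetical stand-in for the HPU model runner's INC teardown.
class ModelRunnerSketch:
    def shutdown_inc(self) -> None:
        print("releasing INC quantization state")

class WorkerBaseSketch:
    def shutdown(self) -> None:
        """Shutdown the worker."""
        return  # safe default: most workers need no extra cleanup

class HPUWorkerSketch(WorkerBaseSketch):
    def __init__(self) -> None:
        self.model_runner = ModelRunnerSketch()

    def shutdown(self) -> None:
        # Device-specific override, reached through the generic call path.
        self.model_runner.shutdown_inc()

for worker in (WorkerBaseSketch(), HPUWorkerSketch()):
    worker.shutdown()  # executors can call this uniformly on any worker
```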