From a1e17c41977b88a9640806e2d21d4d48a2cee197 Mon Sep 17 00:00:00 2001
From: "rshaw@neuralmagic.com" <rshaw@neuralmagic.com>
Date: Mon, 23 Dec 2024 19:18:48 +0000
Subject: [PATCH] updated

---
 vllm/v1/engine/async_llm.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 7f2a597bbdd97..3601085801a4c 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -89,7 +89,7 @@ def __init__(
         to_engine_core_path = get_open_zmq_ipc_path()
 
         # Detokenizer (background process).
-        self.detokenizer = MPDetokenizerClient(
+        self.detokenizer_client = MPDetokenizerClient(
             from_engine_core_path=from_engine_core_path,
             to_engine_core_path=to_engine_core_path,
             tokenizer_name=vllm_config.model_config.tokenizer,
@@ -99,7 +99,7 @@ def __init__(
         )
 
         # EngineCore (background process).
-        self.engine_core = MPEngineCoreClient(
+        self.engine_core_client = MPEngineCoreClient(
             input_path=to_engine_core_path,
             output_path=from_engine_core_path,
             vllm_config=vllm_config,
@@ -148,11 +148,11 @@ def shutdown(self):
         if output_handler := getattr(self, "output_hander", None):
             output_handler.cancel()
 
-        if engine_core := getattr(self, "engine_core", None):
-            engine_core.shutdown()
+        if engine_core_client := getattr(self, "engine_core_client", None):
+            engine_core_client.shutdown()
 
-        if detokenizer := getattr(self, "detokenizer", None):
-            detokenizer.shutdown()
+        if detokenizer_client := getattr(self, "detokenizer_client", None):
+            detokenizer_client.shutdown()
 
     @classmethod
     def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]:
@@ -190,7 +190,7 @@ async def add_request(
             self.rid_to_queue[request_id] = asyncio.Queue()
 
         # 3) Send to Detokenizer (which forwards to EngineCore).
-        await self.detokenizer.input_socket.send_pyobj(engine_request)
+        await self.detokenizer_client.input_socket.send_pyobj(engine_request)
 
         return self.rid_to_queue[request_id]
 
@@ -272,7 +272,7 @@ async def output_handler_loop(self):
             # Note: use socket directly to avoid calling await multiple
             # times, which causes too much task switching at high QPS.
             outputs: List[RequestOutput] = []
-            outputs = await self.detokenizer.output_socket.recv_pyobj()
+            outputs = await self.detokenizer_client.output_socket.recv_pyobj()
 
             for out in outputs:
                 # Note: it is possible that a request was aborted
@@ -286,7 +286,7 @@ async def abort(self, request_id: str):
         """Abort request if the client cancels the request."""
 
         # Send abort to Detokenizer (which will fwd to EngineCore)
-        await self.detokenizer.input_socket.send_pyobj(
+        await self.detokenizer_client.input_socket.send_pyobj(
             EngineAbortRequest([request_id]))
 
         # Remove from request output queues.
@@ -336,10 +336,10 @@ async def check_health(self) -> None:
         logger.debug("Called check_health.")
 
     async def start_profile(self) -> None:
-        await self.engine_core.profile_async(True)
+        await self.engine_core_client.profile_async(True)
 
     async def stop_profile(self) -> None:
-        await self.engine_core.profile_async(False)
+        await self.engine_core_client.profile_async(False)
 
     @property
     def is_running(self) -> bool: