diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py index 1377dbf8d0aba..3755dde6be95b 100644 --- a/vllm/device_allocator/cumem.py +++ b/vllm/device_allocator/cumem.py @@ -168,10 +168,8 @@ def sleep( All data in the memory allocation with the specified tag will be offloaded to CPU memory, and others will be discarded. - Args: - offload_tags: The tags of the memory allocation that will be + :param offload_tags: The tags of the memory allocation that will be offloaded. The rest of the memory allocation will be discarded. - """ if offload_tags is None: # by default, allocated tensors are offloaded @@ -220,10 +218,8 @@ def use_memory_pool(self, tag: Optional[str] = None): All memory allocation created inside the context will be allocated in the memory pool, and has the specified tag. - Args: - tag: The tag of the memory allocation. If None, the default tag + :param tag: The tag of the memory allocation. If None, the default tag will be used. - """ if tag is None: tag = CuMemAllocator.default_tag diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 500a8bbc12e10..04056f37f851b 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1138,8 +1138,7 @@ def sleep(self, level: int = 1): The caller should guarantee that no requests are being processed during the sleep period, before `wake_up` is called. - Args: - level: The sleep level. Level 1 sleep will offload the model + :param level: The sleep level. Level 1 sleep will offload the model weights and discard the kv cache. The content of kv cache is forgotten. Level 1 sleep is good for sleeping and waking up the engine to run the same model again. The model weights are backed @@ -1150,7 +1149,6 @@ def sleep(self, level: int = 1): waking up the engine to run a different model or update the model, where previous model weights are not needed. It reduces CPU memory pressure. - """ self.llm_engine.sleep(level=level)