[V1] Multiprocessing Tensor Parallel Support for v1 #9856

Merged
merged 68 commits into main from tms/v1_tp on Dec 10, 2024

Changes from 12 commits

Commits (68)
5ad9c60
initial v1 tp support
tlrmchlsmth Nov 22, 2024
49869fa
V1 TP with zmq-based bootstrapping
tlrmchlsmth Nov 22, 2024
71e08aa
improve check for USE_SCHED_YIELD
tlrmchlsmth Nov 22, 2024
4930246
Merge branch 'main' into tms/v1_tp
tlrmchlsmth Nov 22, 2024
3ea0cae
fixup
tlrmchlsmth Nov 22, 2024
d4b55ae
workers must be daemonic
tlrmchlsmth Nov 22, 2024
feeed73
We can now terminate properly
tlrmchlsmth Nov 22, 2024
e3c9c5c
Merge branch 'main' into tms/v1_tp
tlrmchlsmth Nov 25, 2024
254714d
fixes from merge
tlrmchlsmth Nov 25, 2024
10a627e
Fixup termination
tlrmchlsmth Nov 25, 2024
d95c01e
Appease mypy
tlrmchlsmth Nov 25, 2024
c08bae4
Allow shm_broadcast to enqueue by either pickle or msgpack
tlrmchlsmth Nov 25, 2024
2392755
Switch back to pickle for shm_broadcast serialization
tlrmchlsmth Nov 26, 2024
bf3705c
Finish msgpack -> pickle
tlrmchlsmth Nov 26, 2024
d4ea706
wrap sched_yield and time.sleep in a fn
tlrmchlsmth Nov 26, 2024
2174a5b
Review comments
tlrmchlsmth Nov 26, 2024
25270ab
Rename executors to uniproc and multiproc
tlrmchlsmth Nov 26, 2024
9322db5
more review comments
tlrmchlsmth Nov 26, 2024
b5bac31
format
tlrmchlsmth Nov 26, 2024
c4fcfce
hacky hacky hacky cleanup
tlrmchlsmth Nov 26, 2024
bedd593
Fix spawn vs fork issue using approach from #8823
tlrmchlsmth Nov 26, 2024
c03ef6d
skip non-distributed tests in test_basic_correctness to see what happens
tlrmchlsmth Nov 26, 2024
8d9d557
fix async_llm
tlrmchlsmth Nov 27, 2024
5f3a570
format
tlrmchlsmth Nov 27, 2024
b59babc
Fixes for testing
tlrmchlsmth Nov 27, 2024
66116c7
Abstract executor class for typing
tlrmchlsmth Nov 27, 2024
eaeebc3
remove enforce_eager, format
tlrmchlsmth Nov 27, 2024
6d53d6e
remove stop_remote_worker_execution_loop
tlrmchlsmth Nov 27, 2024
a7025fb
Remove profiling
tlrmchlsmth Nov 27, 2024
6a3f2da
ExecutorMsg -> WorkerExecRequest
tlrmchlsmth Nov 27, 2024
d4e3813
Merge branch 'main' into tms/v1_tp
tlrmchlsmth Nov 27, 2024
52ef894
Merge branch 'main' into tms/v1_tp
tlrmchlsmth Dec 2, 2024
9f9883e
ensure_termination
tlrmchlsmth Dec 2, 2024
1990433
Move ensure_termination to executor to avoid futures
tlrmchlsmth Dec 2, 2024
f8a1b9b
minor updates
tlrmchlsmth Dec 2, 2024
963c97f
call destroy_distributed_environment atexit
tlrmchlsmth Dec 2, 2024
0678911
more graceful shutdown
tlrmchlsmth Dec 3, 2024
3d71b53
Simplify worker termination
tlrmchlsmth Dec 3, 2024
88c9c7b
atexit -> weakref.finalize
tlrmchlsmth Dec 3, 2024
ab7cb89
minor cleanup
tlrmchlsmth Dec 3, 2024
024bcad
core client cleanup rework
tlrmchlsmth Dec 3, 2024
d77bab5
poke CI
tlrmchlsmth Dec 3, 2024
24ffb8a
fix V1 test, temporarily delete some noisy log statements
tlrmchlsmth Dec 3, 2024
be4260f
nccl/issues/1234
tlrmchlsmth Dec 4, 2024
cb4b363
Cleanup, _add_prefix
tlrmchlsmth Dec 4, 2024
365ea06
fixup noise a bit
tlrmchlsmth Dec 4, 2024
c94e11b
tweaks
tlrmchlsmth Dec 5, 2024
2a36db7
Merge branch 'main' into tms/v1_tp
tlrmchlsmth Dec 5, 2024
536e5f2
back to atexit
tlrmchlsmth Dec 5, 2024
998eb1d
Clean up process termination
tlrmchlsmth Dec 5, 2024
ebb2544
robcomments
tlrmchlsmth Dec 5, 2024
c81b7f5
format
tlrmchlsmth Dec 5, 2024
0817336
client now kills workers directly to avoid zombies
tlrmchlsmth Dec 6, 2024
f10e5e8
remote rpc
tlrmchlsmth Dec 6, 2024
e49b071
use WorkerWrapperBase
tlrmchlsmth Dec 6, 2024
661278f
Merge branch 'main' into tms/v1_tp
tlrmchlsmth Dec 6, 2024
fce9696
de-duplicate env setup
tlrmchlsmth Dec 6, 2024
c61a3e0
Use collective_rpc for initialization
tlrmchlsmth Dec 7, 2024
8bb2430
add RPCParams for readability
tlrmchlsmth Dec 7, 2024
5271ec6
fixup
tlrmchlsmth Dec 7, 2024
50a12bc
Merge branch 'main' into tms/v1_tp: instance id
tlrmchlsmth Dec 7, 2024
edab869
review comments
tlrmchlsmth Dec 9, 2024
e0aea84
Merge branch 'main' into tms/v1_tp
tlrmchlsmth Dec 9, 2024
ce08cb2
profile
tlrmchlsmth Dec 9, 2024
65b79c4
move vllm envs import to work with run_with_both_engines
tlrmchlsmth Dec 9, 2024
143ed09
Merge branch 'main' into tms/v1_tp
tlrmchlsmth Dec 9, 2024
ab6bf27
review comments.
tlrmchlsmth Dec 9, 2024
819b229
collective rpc function signature sanity
tlrmchlsmth Dec 10, 2024
10 changes: 10 additions & 0 deletions tests/basic_correctness/test_basic_correctness.py
Expand Up @@ -26,6 +26,14 @@
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted"""
llm = LLM("facebook/opt-125m")
Expand Down Expand Up @@ -143,6 +151,7 @@ def test_models_distributed(
)


@pytest.mark.skip_v1
Collaborator: Why is this skipped?

Collaborator (author): This test fails on V1, but I don't know why. It's not related to this PR, as it's not running TP, and it fails on current main (just enabled it in #10864).

def test_model_with_failure(vllm_runner) -> None:
try:
with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
Expand All @@ -169,6 +178,7 @@ def test_model_with_failure(vllm_runner) -> None:
os.remove(filename)


@pytest.mark.skip_v1
def test_failure_with_async_out_proc(vllm_runner) -> None:

filename = None
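Note: the autouse fixture added at the top of this file mentions that it "can be promoted up to conftest.py to run for every test in a package." A minimal sketch of what that promotion could look like; the file placement is hypothetical, and `run_with_both_engines` is assumed to be the existing engine-parametrizing fixture used above:

```python
# tests/basic_correctness/conftest.py (hypothetical placement)
import pytest


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Run every test in this package against both the V0 and V1 engines."""
    # `run_with_both_engines` is the engine-parametrizing fixture already
    # relied on by the per-file autouse wrapper added in this diff.
    pass
```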
95 changes: 87 additions & 8 deletions vllm/distributed/device_communicators/shm_broadcast.py
@@ -1,12 +1,15 @@
import os
import pickle
import struct
import sys
import time
from contextlib import contextmanager
from dataclasses import dataclass, field
from multiprocessing import shared_memory
from typing import List, Optional
from typing import List, Optional, Tuple
from unittest.mock import patch

import msgspec
import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup
Expand All @@ -21,6 +24,13 @@

logger = init_logger(__name__)

# We prefer to use os.sched_yield as it results in tighter polling loops,
# measured to be around 3e-7 seconds. However on earlier versions of Python
# os.sched_yield() does not release the GIL, so we fall back to time.sleep(0)
USE_SCHED_YIELD = ((sys.version_info[:3] >= (3, 11, 1))
or (sys.version_info[:2] == (3, 10)
and sys.version_info[2] >= 8))


class ShmRingBuffer:

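Note: a minimal sketch (not part of the diff) of the fallback that `USE_SCHED_YIELD` selects between in the polling loops further down; a later commit (d4ea706, "wrap sched_yield and time.sleep in a fn") factors this into a helper, and the function name used here is illustrative only:

```python
import os
import time


def yield_cpu(use_sched_yield: bool, sleep_seconds: float = 1e-5) -> None:
    """Release the CPU from inside a busy-wait polling loop.

    os.sched_yield() gives the tightest polling (measured around 3e-7 s per
    iteration), but it only helps when the interpreter releases the GIL
    around the call; on older Python versions we fall back to a short
    time.sleep() so other threads can make progress.
    """
    if use_sched_yield:
        os.sched_yield()
    else:
        time.sleep(sleep_seconds)
```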
Expand Down Expand Up @@ -74,7 +84,7 @@ def __init__(self,
NOTE: the order is important here, first reset the reader flags (so that we are still in case 1), then mark the block as written. The state transition is atomic. If we do it in the reverse order, it will go through case 3 and then back to case 2, and readers might read the intermediate case 3, which is not correct.

During creation, `name` is None and the buffer is created. We can pass the
created object to other processes by pickling it. The other processes will
created object to other processes by serializing it. The other processes will
get the name of the shared memory and open it, so that they can access the
same shared memory buffer.
"""# noqa
Expand Down Expand Up @@ -114,6 +124,10 @@ def __init__(self,
# and we should suppress the error
pass

def handle(self):
return (self.n_reader, self.max_chunk_bytes, self.max_chunks,
self.shared_memory.name)

def __reduce__(self):
return (
self.__class__,
Expand Down Expand Up @@ -147,13 +161,19 @@ class Handle:
connect_ip: str
local_reader_ranks: List[int] = field(default_factory=list)

buffer: Optional[ShmRingBuffer] = None
buffer_handle: Optional[Tuple[int, int, int, str]] = None
local_subscribe_port: Optional[int] = None
remote_subscribe_port: Optional[int] = None


class MessageQueue:

# For msgpack serialization, we use 4 bytes to store the size of each
# message, as we need the size of the encoded message while decoding.
# This is not needed for zmq or pickle.
SIZE_PREFIX_FORMAT = '!I' # unsigned int, 4 bytes, network byte order
SIZE_PREFIX_LEN = struct.calcsize(SIZE_PREFIX_FORMAT)

def __init__(
self,
n_reader, # number of all readers
Expand Down Expand Up @@ -228,7 +248,7 @@ def __init__(
self.handle = Handle(
connect_ip=connect_ip,
local_reader_ranks=local_reader_ranks,
buffer=self.buffer,
buffer_handle=self.buffer.handle(),
local_subscribe_port=local_subscribe_port,
remote_subscribe_port=remote_subscribe_port,
)
Expand All @@ -247,8 +267,8 @@ def create_from_handle(handle: Handle, rank) -> "MessageQueue":
context = Context()

if rank in handle.local_reader_ranks:
assert handle.buffer is not None
self.buffer = handle.buffer
assert handle.buffer_handle is not None
self.buffer = ShmRingBuffer(*handle.buffer_handle)
self.current_idx = 0
self.local_reader_rank = handle.local_reader_ranks.index(rank)
self._is_local_reader = True
Expand Down Expand Up @@ -329,7 +349,10 @@ def acquire_write(self):
# we need to wait until it is read by all readers

# Release the processor to other threads
os.sched_yield()
if USE_SCHED_YIELD:
os.sched_yield()
else:
time.sleep(1e-5)

# if we wait for a long time, we should warn the user
if (time.monotonic() - start_time >
Expand Down Expand Up @@ -383,7 +406,10 @@ def acquire_read(self):
# we need to wait until it is written

# Release the processor to other threads
os.sched_yield()
if USE_SCHED_YIELD:
os.sched_yield()
else:
time.sleep(0)

# if we wait for a long time, we should warn the user
if (time.monotonic() - start_time >
Expand All @@ -407,6 +433,7 @@ def acquire_read(self):
break

def enqueue(self, obj):
"""Enqueue obj using pickle serialization"""
assert self._is_writer, "Only writers can enqueue"
serialized_obj = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
if self.n_local_reader > 0:
Expand All @@ -422,6 +449,7 @@ def enqueue(self, obj):
self.remote_socket.send(serialized_obj)

def dequeue(self):
"""Dequeue obj using pickle serialization"""
if self._is_local_reader:
with self.acquire_read() as buf:
overflow = buf[0] == 1
Expand All @@ -440,6 +468,57 @@ def dequeue(self):
raise RuntimeError("Only readers can dequeue")
return obj

def enqueue_via_msgpack(self, obj: msgspec.Struct):
"""Enqueue obj using msgpack serialization"""
assert self._is_writer, "Only writers can enqueue"

encoder = msgspec.msgpack.Encoder()
serialized_obj = encoder.encode(obj)
size_to_write = self.SIZE_PREFIX_LEN + len(serialized_obj)

if self.n_local_reader > 0:
if size_to_write >= self.buffer.max_chunk_bytes:
with self.acquire_write() as buf:
buf[0] = 1 # overflow
self.local_socket.send(serialized_obj)
else:
with self.acquire_write() as buf:
buf[0] = 0 # not overflow
obj_offset = 1 + self.SIZE_PREFIX_LEN

# Write size prefix
buf[1:obj_offset] = struct.pack(self.SIZE_PREFIX_FORMAT,
len(serialized_obj))

buf[obj_offset:obj_offset +
len(serialized_obj)] = serialized_obj
if self.n_remote_reader > 0:
self.remote_socket.send(serialized_obj)

def dequeue_via_msgpack(self, obj_type):
"""Dequeue obj using msgpack serialization"""
decoder = msgspec.msgpack.Decoder(obj_type)

if self._is_local_reader:
with self.acquire_read() as buf:
overflow = buf[0] == 1
if not overflow:
obj_offset = 1 + self.SIZE_PREFIX_LEN
size_bytes = buf[1:obj_offset]
msg_size = struct.unpack(self.SIZE_PREFIX_FORMAT,
size_bytes)[0]

obj = decoder.decode(buf[obj_offset:obj_offset + msg_size])
if overflow:
recv = self.local_socket.recv()
obj = decoder.decode(recv)
elif self._is_remote_reader:
recv = self.remote_socket.recv()
obj = decoder.decode(recv)
else:
raise RuntimeError("Only readers can dequeue")
return obj

def broadcast_object(self, obj=None):
if self._is_writer:
self.enqueue(obj)
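Note: the 4-byte size prefix that `enqueue_via_msgpack`/`dequeue_via_msgpack` introduce can be exercised in isolation. A self-contained sketch under stated assumptions; the `Example` struct and the 64-byte buffer are made up for illustration, and only the framing mirrors the code above:

```python
import struct

import msgspec

SIZE_PREFIX_FORMAT = '!I'  # unsigned int, 4 bytes, network byte order
SIZE_PREFIX_LEN = struct.calcsize(SIZE_PREFIX_FORMAT)


class Example(msgspec.Struct, array_like=True):
    request_id: str
    num_tokens: int


buf = bytearray(64)  # stand-in for one shared-memory chunk
payload = msgspec.msgpack.Encoder().encode(Example("req-0", 16))

# Writer side: [overflow flag][4-byte length][msgpack payload]
buf[0] = 0  # not overflow
obj_offset = 1 + SIZE_PREFIX_LEN
buf[1:obj_offset] = struct.pack(SIZE_PREFIX_FORMAT, len(payload))
buf[obj_offset:obj_offset + len(payload)] = payload

# Reader side: recover the length first, then decode exactly that many bytes
msg_size = struct.unpack(SIZE_PREFIX_FORMAT, buf[1:obj_offset])[0]
decoded = msgspec.msgpack.Decoder(Example).decode(
    bytes(buf[obj_offset:obj_offset + msg_size]))
assert decoded == Example("req-0", 16)
```

The prefix is what lets the reader decode from a fixed-size chunk without knowing the message length in advance; zmq and pickle do not need it because they already delimit messages.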
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/logits_processor.py
Expand Up @@ -7,6 +7,7 @@

from vllm.distributed import (tensor_model_parallel_all_gather,
tensor_model_parallel_gather)
from vllm.envs import VLLM_USE_V1
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)
from vllm.model_executor.sampling_metadata import SamplingMetadata
Expand Down Expand Up @@ -42,7 +43,7 @@ def __init__(self,
# Soft cap the logits. Used in Gemma 2.
self.soft_cap = soft_cap
# Whether to use gather or all-gather to gather the logits.
self.use_gather = not current_platform.is_tpu()
self.use_gather = not current_platform.is_tpu() and not VLLM_USE_V1

def forward(
self,
15 changes: 6 additions & 9 deletions vllm/v1/core/scheduler.py
@@ -1,21 +1,18 @@
from collections import deque
from dataclasses import dataclass
from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set,
Tuple, Union)
from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union

from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
from vllm.logger import init_logger
from vllm.multimodal import MultiModalKwargs
from vllm.multimodal.base import PlaceholderRange
from vllm.sampling_params import SamplingParams
from vllm.v1.core.encoder_cache_manager import EncoderCacheManager
from vllm.v1.core.kv_cache_manager import KVCacheManager
from vllm.v1.engine import EngineCoreOutput
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus

if TYPE_CHECKING:
from vllm.multimodal import MultiModalKwargs
from vllm.multimodal.base import PlaceholderRange

logger = init_logger(__name__)


Expand Down Expand Up @@ -382,7 +379,7 @@ def update_from_output(
model_runner_output: "ModelRunnerOutput",
) -> List[EngineCoreOutput]:
# NOTE(woosuk): This method doesn't consider speculative decoding.
sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist()
sampled_token_ids = model_runner_output.sampled_token_ids_cpu
num_scheduled_tokens = scheduler_output.num_scheduled_tokens
new_running: List[Request] = []
engine_core_outputs: List[EngineCoreOutput] = []
Expand Down Expand Up @@ -508,8 +505,8 @@ class NewRequestData:
req_id: str
prompt_token_ids: List[int]
prompt: Optional[str]
mm_inputs: List["MultiModalKwargs"]
mm_positions: List["PlaceholderRange"]
mm_inputs: List[MultiModalKwargs]
mm_positions: List[PlaceholderRange]
sampling_params: SamplingParams
block_ids: List[int]
num_computed_tokens: int
24 changes: 24 additions & 0 deletions vllm/v1/core/scheduler_output.py
@@ -0,0 +1,24 @@
from enum import Enum, auto
from typing import Optional

import msgspec

from vllm.v1.core.scheduler import SchedulerOutput


#TODO: Move this file
class ExecutorMsgType(Enum):
TOIL = auto()
TERMINATE = auto()


class ExecutorMsg(msgspec.Struct,
array_like=True,
omit_defaults=True,
gc=False):
"""A directive from the core process to its worker processes.

Wraps SchedulerOutput with a message type to distinguish between
regular work assignments and termination orders."""
message_type: ExecutorMsgType
payload: Optional[SchedulerOutput]
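Note: a minimal usage sketch for the new message type; it is illustrative rather than code from the PR, and `scheduler_output` is assumed to be the `SchedulerOutput` produced by the scheduler's `schedule()` call:

```python
from vllm.v1.core.scheduler_output import ExecutorMsg, ExecutorMsgType

# Regular work assignment sent from the core process to the workers.
work_msg = ExecutorMsg(ExecutorMsgType.TOIL, scheduler_output)

# Termination order; the payload is unused.
stop_msg = ExecutorMsg(ExecutorMsgType.TERMINATE, None)
```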
38 changes: 9 additions & 29 deletions vllm/v1/engine/core.py
Expand Up @@ -3,10 +3,9 @@
import queue
import threading
import time
from contextlib import contextmanager
from multiprocessing.process import BaseProcess
from multiprocessing.sharedctypes import Synchronized
from typing import Any, Iterator, List, Tuple, Type, Union
from typing import List, Tuple, Type, Union

import zmq
import zmq.asyncio
Expand All @@ -23,6 +22,7 @@
from vllm.v1.executor.gpu_executor import GPUExecutor
from vllm.v1.request import Request, RequestStatus
from vllm.v1.serial_utils import PickleEncoder
from vllm.v1.utils import make_zmq_socket
from vllm.version import __version__ as VLLM_VERSION

logger = init_logger(__name__)
Expand Down Expand Up @@ -128,8 +128,11 @@ def step(self) -> List[EngineCoreOutput]:
scheduler_output, output)
return engine_core_outputs

def shutdown(self):
self.model_executor.shutdown()

def profile(self, is_start=True):
self.model_executor.worker.profile(is_start)
self.model_executor.profile(is_start)


class EngineCoreProc(EngineCore):
Expand Down Expand Up @@ -167,32 +170,9 @@ def __init__(
daemon=True).start()

# Send Readiness signal to EngineClient.
with self.make_socket(ready_path, zmq.constants.PUSH) as ready_socket:
with make_zmq_socket(ready_path, zmq.constants.PUSH) as ready_socket:
ready_socket.send_string(EngineCoreProc.READY_STR)

@contextmanager
def make_socket(self, path: str, type: Any) -> Iterator[zmq.Socket]:
"""Context manager for use """

ctx = zmq.Context()
try:
socket = ctx.socket(type)

if type == zmq.constants.PULL:
socket.connect(path)
elif type == zmq.constants.PUSH:
socket.bind(path)
else:
raise ValueError(f"Unknown Socket Type: {type}")

yield socket

except KeyboardInterrupt:
logger.debug("EngineCore had Keyboard Interrupt.")

finally:
ctx.destroy(linger=0)

@staticmethod
def wait_for_startup(
proc: BaseProcess,
Expand Down Expand Up @@ -337,7 +317,7 @@ def process_input_socket(self, input_path: str):
decoder_add_req = PickleEncoder()
decoder_abort_req = PickleEncoder()

with self.make_socket(input_path, zmq.constants.PULL) as socket:
with make_zmq_socket(input_path, zmq.constants.PULL) as socket:
while True:
# (RequestType, RequestData)
type_frame, data_frame = socket.recv_multipart(copy=False)
Expand Down Expand Up @@ -365,7 +345,7 @@ def process_output_socket(self, output_path: str):
# Reuse send buffer.
buffer = bytearray()

with self.make_socket(output_path, zmq.constants.PUSH) as socket:
with make_zmq_socket(output_path, zmq.constants.PUSH) as socket:
while True:
engine_core_outputs = self.output_queue.get()
outputs = EngineCoreOutputs(outputs=engine_core_outputs)
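Note: the removed `make_socket` context manager moves to `vllm/v1/utils.py` as `make_zmq_socket`, imported near the top of this file. Below is a sketch consistent with the removed code, for reference only; the final implementation in utils.py may differ in detail:

```python
from contextlib import contextmanager
from typing import Any, Iterator

import zmq


@contextmanager
def make_zmq_socket(path: str, type: Any) -> Iterator[zmq.Socket]:
    """Bind a PUSH socket or connect a PULL socket, and always clean up."""
    ctx = zmq.Context()
    try:
        socket = ctx.socket(type)
        if type == zmq.constants.PULL:
            socket.connect(path)
        elif type == zmq.constants.PUSH:
            socket.bind(path)
        else:
            raise ValueError(f"Unknown Socket Type: {type}")
        yield socket
    except KeyboardInterrupt:
        # The removed EngineCore.make_socket logged and swallowed this.
        pass
    finally:
        ctx.destroy(linger=0)
```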