From e461c262f0d4c9911f1bf75bea723f8ae17219be Mon Sep 17 00:00:00 2001 From: yangzhibin <45459326+Ghjk94522@users.noreply.github.com> Date: Fri, 20 Dec 2024 01:54:24 +0800 Subject: [PATCH 01/14] [Misc] Remove unused vllm/block.py (#11336) --- vllm/block.py | 88 -------------------------------------------- vllm/core/evictor.py | 4 +- 2 files changed, 2 insertions(+), 90 deletions(-) delete mode 100644 vllm/block.py diff --git a/vllm/block.py b/vllm/block.py deleted file mode 100644 index 47c381c19383b..0000000000000 --- a/vllm/block.py +++ /dev/null @@ -1,88 +0,0 @@ -"""Token blocks.""" -from typing import TYPE_CHECKING, Iterator, List, Optional - -from vllm.utils import Device - -DEFAULT_LAST_ACCESSED_TIME: float = -1 - - -class PhysicalTokenBlock: - """Represents the state of a block in the KV cache.""" - - def __init__( - self, - device: Device, - block_number: int, - block_size: int, - block_hash: int, - num_hashed_tokens: int, - ) -> None: - self.device = device - self.block_number = block_number - self.block_size = block_size - self.block_hash = block_hash - self.num_hashed_tokens = num_hashed_tokens - - self.ref_count = 0 - self.last_accessed = DEFAULT_LAST_ACCESSED_TIME - - self.computed = False - - def __repr__(self) -> str: - return (f'PhysicalTokenBlock(device={self.device}, ' - f'block_number={self.block_number}, ' - f'num_hashed_tokens={self.num_hashed_tokens}, ' - f'ref_count={self.ref_count}, ' - f'last_accessed={self.last_accessed}, ' - f'computed={self.computed})') - - -class BlockTable: - """Holds a list of blocks with caching of their associated block_ids - """ - - def __init__(self, blocks: Optional[List[PhysicalTokenBlock]] = None): - self._blocks: List[PhysicalTokenBlock] = [] - self._block_ids: List[int] = [] - - if blocks is not None: - for block in blocks: - self.append(block) - - def append(self, block: PhysicalTokenBlock): - self._blocks.append(block) - self._block_ids.append(block.block_number) - - def __len__(self) -> int: - return len(self._blocks) - - def __getitem__(self, key): - return self._blocks[key] - - if TYPE_CHECKING: - - def __iter__(self) -> Iterator[PhysicalTokenBlock]: - raise RuntimeError("Method should be automatically generated") - - def __setitem__(self, key, value): - if isinstance(key, slice): - blocks = value - self._blocks[key] = blocks - self._block_ids[key] = [b.block_number for b in blocks] - else: - block = value - self._blocks[key] = block - self._block_ids[key] = block.block_number - - def reset(self): - self._blocks = [] - self._block_ids = [] - - def copy(self) -> "BlockTable": - return BlockTable(self._blocks) - - def list(self) -> List[PhysicalTokenBlock]: - return self._blocks - - def ids(self) -> List[int]: - return self._block_ids diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 44adc4158abec..c9306518223a3 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -13,7 +13,7 @@ class EvictionPolicy(enum.Enum): class Evictor(ABC): """The Evictor subclasses should be used by the BlockAllocator class to - handle eviction of freed PhysicalTokenBlocks. + handle eviction of freed Blocks. """ @abstractmethod @@ -70,7 +70,7 @@ def __init__(self, content_hash: int, num_hashed_tokens: int, class LRUEvictor(Evictor): """Evicts in a least-recently-used order using the last_accessed timestamp - that's recorded in the PhysicalTokenBlock. If there are multiple blocks with + that's recorded in the Block. 
If there are multiple blocks with the same last_accessed time, then the one with the largest num_hashed_tokens will be evicted. If two blocks each have the lowest last_accessed time and highest num_hashed_tokens value, then one will be chose arbitrarily From a985f7af9f7b249974b283a9d999575ac30fac3d Mon Sep 17 00:00:00 2001 From: Yuan Date: Fri, 20 Dec 2024 03:46:55 +0800 Subject: [PATCH 02/14] [CI] Adding CPU docker pipeline (#11261) Signed-off-by: Yuan Zhou Co-authored-by: Kevin H. Luu --- .buildkite/release-pipeline.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 2de6fceb0c3fe..51618a2955fb1 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -55,3 +55,18 @@ steps: password-env: DOCKERHUB_TOKEN env: DOCKER_BUILDKIT: "1" + + - block: "Build CPU release image" + key: block-cpu-release-image-build + depends_on: ~ + + - label: "Build and publish CPU release image" + depends_on: block-cpu-release-image-build + agents: + queue: cpu_queue_postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ." + - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION" + env: + DOCKER_BUILDKIT: "1" From 48edab8041741a82a1fd2f4d463cc0f393561b05 Mon Sep 17 00:00:00 2001 From: Akash kaothalkar <61960177+Akashcodes732@users.noreply.github.com> Date: Fri, 20 Dec 2024 07:02:07 +0530 Subject: [PATCH 03/14] [Bugfix][Hardware][POWERPC] Fix auto dtype failure in case of POWER10 (#11331) Signed-off-by: Akash Kaothalkar <0052v2@linux.vnet.ibm.com> --- vllm/config.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index 0e886e18fcd6d..6badae24d9d7d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -22,7 +22,7 @@ from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS, get_quantization_config) from vllm.model_executor.models import ModelRegistry -from vllm.platforms import current_platform +from vllm.platforms import current_platform, interface from vllm.tracing import is_otel_available, otel_import_error_traceback from vllm.transformers_utils.config import ( ConfigFormat, get_config, get_hf_image_processor_config, @@ -2199,6 +2199,17 @@ def _get_and_verify_dtype( else: torch_dtype = config_dtype + if (current_platform.is_cpu() + and current_platform.get_cpu_architecture() + == interface.CpuArchEnum.POWERPC + and (config_dtype == torch.float16 + or config_dtype == torch.float32)): + logger.info( + "For POWERPC, we cast models to bfloat16 instead of " + "using float16 by default. 
Float16 is not currently " + "supported for POWERPC.") + torch_dtype = torch.bfloat16 + if current_platform.is_hpu() and config_dtype == torch.float16: logger.info( "For HPU, we cast models to bfloat16 instead of" From 7801f56ed76d9bec0344728bfa3359b42c926074 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 19 Dec 2024 18:13:06 -0800 Subject: [PATCH 04/14] [ci][gh200] dockerfile clean up (#11351) Signed-off-by: drikster80 Signed-off-by: youkaichao Co-authored-by: drikster80 Co-authored-by: cenzhiyao <2523403608@qq.com> --- .buildkite/run-gh200-test.sh | 3 ++ Dockerfile | 39 ++++++++++--------- docs/source/serving/deploying_with_docker.rst | 30 +++++++++----- requirements-build.txt | 2 +- requirements-common.txt | 7 ++-- requirements-cuda-arm64.txt | 3 -- requirements-cuda.txt | 4 +- 7 files changed, 51 insertions(+), 37 deletions(-) delete mode 100644 requirements-cuda-arm64.txt diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh index d06604f96f2b8..4fc6d089cc666 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/run-gh200-test.sh @@ -4,6 +4,9 @@ # It serves a sanity check for compilation and basic model usage. set -ex +# Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile +python3 use_existing_torch.py + # Try building the docker image DOCKER_BUILDKIT=1 docker build . \ --target vllm-openai \ diff --git a/Dockerfile b/Dockerfile index 123703848749c..0944050f7dfca 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,17 +45,21 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ WORKDIR /workspace # install build and runtime dependencies -COPY requirements-common.txt requirements-common.txt -COPY requirements-cuda.txt requirements-cuda.txt -COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install -r requirements-cuda.txt +# arm64 (GH200) build follows the practice of "use existing pytorch" build, +# we need to install torch and torchvision from the nightly builds first, +# pytorch will not appear as a vLLM dependency in all of the following steps +# after this step RUN --mount=type=cache,target=/root/.cache/pip \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - python3 -m pip install -r requirements-cuda-arm64.txt; \ + python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \ fi +COPY requirements-common.txt requirements-common.txt +COPY requirements-cuda.txt requirements-cuda.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install -r requirements-cuda.txt + # cuda arch list used by torch # can be useful for both `dev` and `test` # explicitly set the list to avoid issues with torch 2.2 @@ -77,11 +81,6 @@ COPY requirements-build.txt requirements-build.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-build.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - python3 -m pip install -r requirements-cuda-arm64.txt; \ - fi - COPY . . 
ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ @@ -157,8 +156,6 @@ WORKDIR /vllm-workspace ENV DEBIAN_FRONTEND=noninteractive ARG TARGETPLATFORM -COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt - RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment @@ -183,17 +180,20 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ # or future versions of triton. RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ +# arm64 (GH200) build follows the practice of "use existing pytorch" build, +# we need to install torch and torchvision from the nightly builds first, +# pytorch will not appear as a vLLM dependency in all of the following steps +# after this step +RUN --mount=type=cache,target=/root/.cache/pip \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \ + fi + # Install vllm wheel first, so that torch etc will be installed. RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install dist/*.whl --verbose -RUN --mount=type=cache,target=/root/.cache/pip \ - if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - pip uninstall -y torch && \ - python3 -m pip install -r requirements-cuda-arm64.txt; \ - fi - RUN --mount=type=cache,target=/root/.cache/pip \ . /etc/environment && \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ @@ -244,6 +244,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ else \ pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \ fi + ENV VLLM_USAGE_SOURCE production-docker-image ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index 56f0020a1011a..b64eef819cd2e 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -3,6 +3,9 @@ Deploying with Docker ============================ +Use vLLM's Official Docker Image +-------------------------------- + vLLM offers an official Docker image for deployment. The image can be used to run OpenAI compatible server and is available on Docker Hub as `vllm/vllm-openai `_. @@ -24,12 +27,15 @@ The image can be used to run OpenAI compatible server and is available on Docker memory to share data between processes under the hood, particularly for tensor parallel inference. +Building vLLM's Docker Image from Source +---------------------------------------- + You can build and run vLLM from source via the provided `Dockerfile `_. To build vLLM: .. code-block:: console - $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 - + $ # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 + $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai .. note:: @@ -41,18 +47,19 @@ Building for Arm64/aarch64 -------------------------- A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use -of PyTorch Nightly and should be considered **experimental**. 
Using the flag `--platform "linux/arm64"` will attempt to build for arm64. +of PyTorch Nightly and should be considered **experimental**. Using the flag ``--platform "linux/arm64"`` will attempt to build for arm64. .. note:: - Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=` - flags to speed up build process. However, ensure your 'max_jobs' is substantially larger than 'nvcc_threads' to get the most benefits. + Multiple modules must be compiled, so this process can take a while. Recommend using ``--build-arg max_jobs=`` & ``--build-arg nvcc_threads=`` + flags to speed up build process. However, ensure your ``max_jobs`` is substantially larger than ``nvcc_threads`` to get the most benefits. Keep an eye on memory usage with parallel jobs as it can be substantial (see example below). .. code-block:: console - # Example of building on Nvidia GH200 server. (Memory usage: ~12GB, Build time: ~1475s / ~25 min, Image size: 7.26GB) - $ DOCKER_BUILDKIT=1 sudo docker build . \ + # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) + $ python3 use_existing_torch.py + $ DOCKER_BUILDKIT=1 docker build . \ --target vllm-openai \ --platform "linux/arm64" \ -t vllm/vllm-gh200-openai:latest \ @@ -61,7 +68,10 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- --build-arg torch_cuda_arch_list="9.0+PTX" \ --build-arg vllm_fa_cmake_gpu_arches="90-real" -To run vLLM: +Use the custom-built vLLM Docker image +-------------------------------------- + +To run vLLM with the custom-built Docker image: .. code-block:: console @@ -71,6 +81,8 @@ To run vLLM: --env "HUGGING_FACE_HUB_TOKEN=" \ vllm/vllm-openai +The argument ``vllm/vllm-openai`` specifies the image to run, and should be replaced with the name of the custom-built image (the ``-t`` tag from the build command). + .. note:: - **For `v0.4.1` and `v0.4.2` only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. ``/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable ``VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` . + **For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. ``/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable ``VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` . 
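For completeness, once a container started as described above is serving, the OpenAI-compatible endpoint can be exercised from Python. This is a minimal sketch, not part of the patch itself: it assumes the server is reachable on ``localhost:8000`` (i.e. the container was started with the port published), that the ``openai`` client package is installed on the host, and the model name is a placeholder for whatever model the container serves.

.. code-block:: python

    from openai import OpenAI

    # The vllm/vllm-openai container exposes an OpenAI-compatible API;
    # point the client at it instead of api.openai.com.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    completion = client.completions.create(
        model="mistralai/Mistral-7B-v0.1",  # placeholder: use the model the container serves
        prompt="San Francisco is a",
        max_tokens=32,
    )
    print(completion.choices[0].text)
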
diff --git a/requirements-build.txt b/requirements-build.txt index 388b193403e88..fec01caaf25ef 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -4,6 +4,6 @@ ninja packaging setuptools>=61 setuptools-scm>=8 -torch==2.5.1; platform_machine != 'aarch64' +torch==2.5.1 wheel jinja2 diff --git a/requirements-common.txt b/requirements-common.txt index 250e2b17ffc23..6c390bcfd18e6 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -19,7 +19,7 @@ pillow # Required for image processing prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer >= 0.10.9, < 0.11 -outlines == 0.1.11 +outlines == 0.1.11 # Requires pytorch lark == 1.2.2 xgrammar >= 0.1.6; platform_machine == "x86_64" typing_extensions >= 4.10 @@ -34,5 +34,6 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. -compressed-tensors == 0.8.1 # required for compressed-tensors -depyf==0.18.0 # required for profiling and debugging torch.compile +compressed-tensors == 0.8.1 # required for compressed-tensors, requires pytorch +depyf==0.18.0 # required for profiling and debugging with compilation config +cloudpickle # allows pickling lambda functions in model_executor/models/registry.py diff --git a/requirements-cuda-arm64.txt b/requirements-cuda-arm64.txt deleted file mode 100644 index bbcb5cb7012ce..0000000000000 --- a/requirements-cuda-arm64.txt +++ /dev/null @@ -1,3 +0,0 @@ ---index-url https://download.pytorch.org/whl/nightly/cu124 -torchvision==0.22.0.dev20241215; platform_machine == 'aarch64' -torch==2.6.0.dev20241210+cu124; platform_machine == 'aarch64' diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 5d4dee8c7129a..058ab7c1ee9df 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -4,7 +4,7 @@ # Dependencies for NVIDIA GPUs ray >= 2.9 nvidia-ml-py >= 12.560.30 # for pynvml package -torch == 2.5.1; platform_machine != 'aarch64' +torch == 2.5.1 # These must be updated alongside torch -torchvision == 0.20.1; platform_machine != 'aarch64' # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +torchvision == 0.20.1 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1 From b880ffb87e0bcde5e3693203b480df49e46d67bc Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 19 Dec 2024 23:35:18 -0500 Subject: [PATCH 05/14] [Misc] Add tqdm progress bar during graph capture (#11349) Signed-off-by: mgoin --- vllm/worker/model_runner.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 6ff98a8f1bab2..2b545d1b28bd2 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -13,6 +13,7 @@ import torch import torch.distributed import torch.nn as nn +from tqdm import tqdm import vllm.envs as envs from vllm.attention import AttentionMetadata, get_attn_backend @@ -21,7 +22,8 @@ from vllm.config import CompilationLevel, VllmConfig from vllm.core.scheduler import SchedulerOutputs from vllm.distributed import get_kv_transfer_group, get_pp_group -from vllm.distributed.parallel_state import graph_capture +from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank, + graph_capture) from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger @@ -1413,8 +1415,8 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: logger.info("Capturing cudagraphs for decoding. This may lead to " "unexpected consequences if the model is not static. To " "run the model in eager mode, set 'enforce_eager=True' or " - "use '--enforce-eager' in the CLI.") - logger.info("If out-of-memory error occurs during cudagraph capture," + "use '--enforce-eager' in the CLI. " + "If out-of-memory error occurs during cudagraph capture," " consider decreasing `gpu_memory_utilization` or " "switching to eager mode. You can also reduce the " "`max_num_seqs` as needed to decrease memory usage.") @@ -1451,8 +1453,14 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: # memory usage of CUDA graph. 
for virtual_engine in range( self.parallel_config.pipeline_parallel_size): - for batch_size in \ - self.vllm_config.compilation_config.capture_sizes: + # Only rank 0 should print progress bar during capture + capture_sizes = ( + tqdm( + self.vllm_config.compilation_config.capture_sizes, + desc="Capturing CUDA graph shapes", + ) if get_tensor_model_parallel_rank() == 0 else + self.vllm_config.compilation_config.capture_sizes) + for batch_size in capture_sizes: attn_metadata = ( self.attn_state.graph_capture_get_metadata_for_batch( batch_size, From 86c2d8fd1cb27e607928ca8c92fa20d9694d2e4b Mon Sep 17 00:00:00 2001 From: Wallas Henrique Date: Fri, 20 Dec 2024 02:15:31 -0300 Subject: [PATCH 06/14] [Bugfix] Fix spec decoding when seed is none in a batch (#10863) Signed-off-by: Wallas Santos --- tests/samplers/test_rejection_sampler.py | 63 +++++++++++++++++++ .../layers/rejection_sampler.py | 10 +-- 2 files changed, 66 insertions(+), 7 deletions(-) diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index f5497976faf7a..397fa2cc85821 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -200,6 +200,69 @@ def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int, assert torch.equal(results[j][i], results[0][i]) +@pytest.mark.parametrize("k", [1, 3, 6]) +@pytest.mark.parametrize("vocab_size", [30_000, 50_000]) +@pytest.mark.parametrize("batch_size", [3, 8, 32, 128]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("use_flashinfer", [True, False]) +@torch.inference_mode() +def test_mixed_seeded_batch(k: int, vocab_size: int, batch_size: int, + device: str, use_flashinfer: bool): + torch.set_default_device(device) + set_random_seed(0) + draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + target_probs = torch.rand(batch_size, + k + 1, + vocab_size, + dtype=torch.float32) + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + draft_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64) + + single_batches = [] + for i in range(batch_size): + single_batches.append((draft_probs[i].clone().unsqueeze(0), + draft_token_ids[i].clone().unsqueeze(0), + target_probs[i].clone().unsqueeze(0), + bonus_token_ids[i].clone().unsqueeze(0), + draft_token_ids[i].clone().unsqueeze(0))) + + set_random_seed(0) + rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer) + rejection_sampler.init_gpu_tensors(device=device) + + results = [] + seeded_seqs = { + i: torch.Generator(device=device).manual_seed(i) + for i in range(1, batch_size) # 0 is seed None + } + batch_result = rejection_sampler(target_probs.clone(), + bonus_token_ids.clone(), + draft_probs.clone(), + draft_token_ids.clone(), seeded_seqs) + + set_random_seed(0) + + rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer) + rejection_sampler.init_gpu_tensors(device=device) + for i in range(batch_size): + request_seeded_seqs = { + 0: torch.Generator(device=device).manual_seed(i) + } if seeded_seqs.get(i) is not None else None + (draft_probs, draft_token_ids, target_probs, bonus_token_ids, + draft_token_ids) = single_batches[i] + results.append( + rejection_sampler(target_probs, bonus_token_ids, draft_probs, + draft_token_ids, request_seeded_seqs)) + for i in range(batch_size): + assert torch.equal(batch_result[i], results[i].squeeze(0)) + + @pytest.mark.parametrize("k", [1, 3, 6]) 
@pytest.mark.parametrize("vocab_size", [30_000, 50_000]) @pytest.mark.parametrize("batch_size", [1, 8, 32, 128]) diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 3ab0ba9e9f5c2..97a1b0c9603bd 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -1,6 +1,6 @@ from functools import cached_property from importlib.util import find_spec -from typing import Dict, List, Optional, Tuple +from typing import Dict, Optional, Tuple import torch import torch.jit @@ -386,16 +386,12 @@ def _multinomial( if not seeded_seqs: q.exponential_(1.0) else: - non_seeded_indices: List[int] = [] start = 0 for idx in range(len(q) // k): end = start + k generator = seeded_seqs.get(idx) - if generator is None: - non_seeded_indices.extend(list(range(start, end))) - else: - q[start:end].exponential_(1.0, generator=generator) + # Note: generator might be None for non seeded + q[start:end].exponential_(1.0, generator=generator) start = end - q[non_seeded_indices].exponential_(1.0) return probs.div_(q).argmax(dim=1).view(-1, num_samples) From c954f21ac05642c416cbd87861ddebe9af2ae1b4 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 19 Dec 2024 21:18:25 -0800 Subject: [PATCH 07/14] [misc] add early error message for custom ops (#11355) Signed-off-by: youkaichao --- vllm/utils.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/utils.py b/vllm/utils.py index 3934903385ad4..1b90eca1cd6cc 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1577,8 +1577,18 @@ def direct_register_custom_op( library object. If you want to bind the operator to a different library, make sure the library object is alive when the operator is used. """ - if is_in_doc_build() or not supports_custom_op(): + if is_in_doc_build(): return + + if not supports_custom_op(): + assert not current_platform.is_cuda_alike(), ( + "cuda platform needs torch>=2.4 to support custom op, " + "chances are you are using an old version of pytorch " + "or a custom build of pytorch. It is recommended to " + "use vLLM in a fresh new environment and let it install " + "the required dependencies.") + return + import torch.library if hasattr(torch.library, "infer_schema"): schema_str = torch.library.infer_schema(op_func, From 1ecc645b8f5431f1404551ad24721a63f01aea4e Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 19 Dec 2024 21:33:53 -0800 Subject: [PATCH 08/14] [doc] backward compatibility for 0.6.4 (#11359) Signed-off-by: youkaichao --- docs/source/getting_started/debugging.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index d6c83014dc69f..7f36d65a227f0 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -86,6 +86,11 @@ If GPU/CPU communication cannot be established, you can use the following Python from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank) + # pynccl is enabled by default for 0.6.5+, + # but for 0.6.4 and below, we need to enable it manually. + # keep the code for backward compatibility when because people + # prefer to read the latest documentation. 
+ pynccl.disabled = False s = torch.cuda.Stream() with torch.cuda.stream(s): From 04139ade599eedd493ce8effcda7ceabb57f2fb5 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Fri, 20 Dec 2024 04:04:21 -0800 Subject: [PATCH 09/14] [V1] Fix profiling for models with merged input processor (#11370) Signed-off-by: ywang96 --- vllm/v1/worker/gpu_model_runner.py | 44 ++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index cb89246db0cc9..ace62d8978bea 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -635,17 +635,6 @@ def profile_run(self) -> None: ) dummy_mm_data = dummy_request_data.multi_modal_data - # Compute MM hashes (if enabled) - mm_hashes = None - if self.use_hash: - mm_hashes = self.mm_hasher.hash_dummy_mm_data(dummy_mm_data) - - dummy_mm_kwargs = self.mm_input_mapper_client.process_inputs( - mm_data=dummy_mm_data, - mm_hashes=mm_hashes, - mm_processor_kwargs=None, - precomputed_mm_inputs=None) - # NOTE: Currently model is profiled with a single non-text # modality even when it supports multiple. max_tokens_per_mm_item = max( @@ -660,8 +649,39 @@ def profile_run(self) -> None: # (e.g, multiple images) for a single request, therefore here we # always replicate first item by max_num_mm_items times since in V1 # they are scheduled to be processed separately. + + # Case when models have a merged processor, their dummy data is + # already batched `MultiModalKwargs`, therefore we need to "unbatch" + # and take the first item in each batched tensor. + # TODO (ywang96): This is somewhat hacky. Refactor this to be + # consistent with the other case. + if isinstance(dummy_mm_data, MultiModalKwargs): + dummy_mm_kwargs = { + k: v[0].unsqueeze(0) + for k, v in dummy_mm_data.items() + } + + # Case when models have dummy data explicitly defined as + # `MultiModalDataDict`, so they need to be processed through input + # mapper. 
+ else: + # Compute MM hashes (if enabled) + mm_hashes = None + if self.use_hash: + mm_hashes = self.mm_hasher.hash_dummy_mm_data( + dummy_mm_data) + + mm_kwargs_list = self.mm_input_mapper_client.process_inputs( + mm_data=dummy_mm_data, + mm_hashes=mm_hashes, + mm_processor_kwargs=None, + precomputed_mm_inputs=None) + + # Take the first `MultiModalKwargs` + dummy_mm_kwargs = mm_kwargs_list[0] + batched_dummy_mm_inputs = MultiModalKwargs.batch( - [dummy_mm_kwargs[0]] * max_num_mm_items) + [dummy_mm_kwargs] * max_num_mm_items) batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs( batched_dummy_mm_inputs, device=self.device) From 7c7aa37c6933c40a94da0789d0f330a8d89f091b Mon Sep 17 00:00:00 2001 From: Daniele <36171005+dtrifiro@users.noreply.github.com> Date: Fri, 20 Dec 2024 17:14:40 +0100 Subject: [PATCH 10/14] [CI/Build] fix pre-compiled wheel install for exact tag (#11373) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Daniele Trifirò --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fcfaa207c176a..a860093fe5f35 100644 --- a/setup.py +++ b/setup.py @@ -466,7 +466,7 @@ def get_vllm_version() -> str: version += f"{sep}empty" elif _is_cuda(): if envs.VLLM_USE_PRECOMPILED: - version += ".precompiled" + version += f"{sep}precompiled" else: cuda_version = str(get_nvcc_cuda_version()) if cuda_version != MAIN_CUDA_VERSION: From 995f56236bc08300ea11fc8cd3d66029ffec8678 Mon Sep 17 00:00:00 2001 From: omer-dayan Date: Fri, 20 Dec 2024 18:46:24 +0200 Subject: [PATCH 11/14] [Core] Loading model from S3 using RunAI Model Streamer as optional loader (#10192) Signed-off-by: OmerD --- Dockerfile | 4 +- docs/source/index.rst | 1 + docs/source/serving/runai_model_streamer.rst | 53 +++++++ setup.py | 1 + tests/runai_model_streamer/__init__.py | 0 .../test_runai_model_streamer_loader.py | 31 ++++ .../runai_model_streamer/test_weight_utils.py | 39 +++++ vllm/config.py | 37 +++++ vllm/engine/arg_utils.py | 2 + vllm/model_executor/model_loader/loader.py | 118 +++++++++++++- .../model_loader/weight_utils.py | 24 +++ vllm/transformers_utils/s3_utils.py | 146 ++++++++++++++++++ vllm/transformers_utils/utils.py | 4 + 13 files changed, 457 insertions(+), 3 deletions(-) create mode 100644 docs/source/serving/runai_model_streamer.rst create mode 100644 tests/runai_model_streamer/__init__.py create mode 100644 tests/runai_model_streamer/test_runai_model_streamer_loader.py create mode 100644 tests/runai_model_streamer/test_weight_utils.py create mode 100644 vllm/transformers_utils/s3_utils.py diff --git a/Dockerfile b/Dockerfile index 0944050f7dfca..84350cde59bfb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -240,9 +240,9 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10'; \ + pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ else \ - pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \ + pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ fi ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/docs/source/index.rst b/docs/source/index.rst 
index fd741ea5e9766..d812885aafea9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -88,6 +88,7 @@ Documentation serving/metrics serving/integrations serving/tensorizer + serving/runai_model_streamer .. toctree:: :maxdepth: 1 diff --git a/docs/source/serving/runai_model_streamer.rst b/docs/source/serving/runai_model_streamer.rst new file mode 100644 index 0000000000000..459eb8677fb95 --- /dev/null +++ b/docs/source/serving/runai_model_streamer.rst @@ -0,0 +1,53 @@ +.. _runai_model_streamer: + +Loading Models with Run:ai Model Streamer +========================================= +Run:ai Model Streamer is a library to read tensors in concurrency, while streaming it to GPU memory. +Further reading can be found in `Run:ai Model Streamer Documentation `_. + +vLLM supports loading weights in Safetensors format using the Run:ai Model Streamer. +You first need to install vLLM RunAI optional dependency: + +.. code-block:: console + + $ pip3 install vllm[runai] + +To run it as an OpenAI-compatible server, add the `--load-format runai_streamer` flag: + +.. code-block:: console + + $ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer + +To run model from AWS S3 object store run: + +.. code-block:: console + + $ vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer + + +To run model from a S3 compatible object store run: + +.. code-block:: console + + $ RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer + +Tunable parameters +------------------ +You can tune parameters using `--model-loader-extra-config`: + +You can tune `concurrency` that controls the level of concurrency and number of OS threads reading tensors from the file to the CPU buffer. +For reading from S3, it will be the number of client instances the host is opening to the S3 server. + + .. code-block:: console + + $ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' + +You can controls the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. +You can read further about CPU buffer memory limiting `here `_. + + .. code-block:: console + + $ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' + +.. note:: + For further instructions about tunable parameters and additional parameters configurable through environment variables, read the `Environment Variables Documentation `_. 
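For offline (non-server) use, the same load format documented above can be selected through the ``LLM`` constructor. This is a minimal sketch rather than part of the patch: it assumes the ``vllm[runai]`` extra introduced by this commit is installed and, for the S3 case, that AWS credentials are configured; the model path is the placeholder bucket used in the documentation above.

.. code-block:: python

    from vllm import LLM, SamplingParams

    # load_format="runai_streamer" streams Safetensors weights from local disk
    # or S3, mirroring the --load-format runai_streamer server flag above.
    llm = LLM(
        model="s3://core-llm/Llama-3-8b",  # placeholder S3 path or a local directory
        load_format="runai_streamer",
        model_loader_extra_config={"concurrency": 16},  # optional tuning, see above
    )

    outputs = llm.generate(["Hello, my name is"], SamplingParams(temperature=0.8))
    print(outputs[0].outputs[0].text)
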
diff --git a/setup.py b/setup.py index a860093fe5f35..73407b64edf22 100644 --- a/setup.py +++ b/setup.py @@ -630,6 +630,7 @@ def _read_requirements(filename: str) -> List[str]: ext_modules=ext_modules, extras_require={ "tensorizer": ["tensorizer>=2.9.0"], + "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"], "audio": ["librosa", "soundfile"], # Required for audio processing "video": ["decord"] # Required for video processing }, diff --git a/tests/runai_model_streamer/__init__.py b/tests/runai_model_streamer/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/runai_model_streamer/test_runai_model_streamer_loader.py b/tests/runai_model_streamer/test_runai_model_streamer_loader.py new file mode 100644 index 0000000000000..c5722fbae5c8a --- /dev/null +++ b/tests/runai_model_streamer/test_runai_model_streamer_loader.py @@ -0,0 +1,31 @@ +from vllm import SamplingParams +from vllm.config import LoadConfig, LoadFormat +from vllm.model_executor.model_loader.loader import (RunaiModelStreamerLoader, + get_model_loader) + +test_model = "openai-community/gpt2" + +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0) + + +def get_runai_model_loader(): + load_config = LoadConfig(load_format=LoadFormat.RUNAI_STREAMER) + return get_model_loader(load_config) + + +def test_get_model_loader_with_runai_flag(): + model_loader = get_runai_model_loader() + assert isinstance(model_loader, RunaiModelStreamerLoader) + + +def test_runai_model_loader_download_files(vllm_runner): + with vllm_runner(test_model, load_format=LoadFormat.RUNAI_STREAMER) as llm: + deserialized_outputs = llm.generate(prompts, sampling_params) + assert deserialized_outputs diff --git a/tests/runai_model_streamer/test_weight_utils.py b/tests/runai_model_streamer/test_weight_utils.py new file mode 100644 index 0000000000000..5c89bd78ad81d --- /dev/null +++ b/tests/runai_model_streamer/test_weight_utils.py @@ -0,0 +1,39 @@ +import glob +import tempfile + +import huggingface_hub.constants +import torch + +from vllm.model_executor.model_loader.weight_utils import ( + download_weights_from_hf, runai_safetensors_weights_iterator, + safetensors_weights_iterator) + + +def test_runai_model_loader(): + with tempfile.TemporaryDirectory() as tmpdir: + huggingface_hub.constants.HF_HUB_OFFLINE = False + download_weights_from_hf("openai-community/gpt2", + allow_patterns=["*.safetensors"], + cache_dir=tmpdir) + safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True) + assert len(safetensors) > 0 + + runai_model_streamer_tensors = {} + hf_safetensors_tensors = {} + + for name, tensor in runai_safetensors_weights_iterator(safetensors): + runai_model_streamer_tensors[name] = tensor + + for name, tensor in safetensors_weights_iterator(safetensors): + hf_safetensors_tensors[name] = tensor + + assert len(runai_model_streamer_tensors) == len(hf_safetensors_tensors) + + for name, runai_tensor in runai_model_streamer_tensors.items(): + assert runai_tensor.dtype == hf_safetensors_tensors[name].dtype + assert runai_tensor.shape == hf_safetensors_tensors[name].shape + assert torch.all(runai_tensor.eq(hf_safetensors_tensors[name])) + + +if __name__ == "__main__": + test_runai_model_loader() diff --git a/vllm/config.py b/vllm/config.py index 6badae24d9d7d..643698f8bbec3 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ 
-29,6 +29,7 @@ get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, try_get_generation_config, uses_mrope) +from vllm.transformers_utils.utils import is_s3 from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless, get_cpu_memory, print_warning_once, random_uuid, resolve_obj_by_qualname) @@ -256,6 +257,8 @@ def __init__(self, f"'Please instead use `--hf-overrides '{hf_override!r}'`") warnings.warn(DeprecationWarning(msg), stacklevel=2) + self.maybe_pull_model_tokenizer_for_s3(model, tokenizer) + # The tokenizer version is consistent with the model version by default. if tokenizer_revision is None: self.tokenizer_revision = revision @@ -357,6 +360,39 @@ def __init__(self, self._verify_cuda_graph() self._verify_bnb_config() + def maybe_pull_model_tokenizer_for_s3(self, model: str, + tokenizer: str) -> None: + """ + Pull the model config or tokenizer to a temporary + directory in case of S3. + + Args: + model: The model name or path. + tokenizer: The tokenizer name or path. + + """ + if is_s3(model) or is_s3(tokenizer): + try: + from vllm.transformers_utils.s3_utils import S3Model + except ImportError as err: + raise ImportError( + "Please install Run:ai optional dependency " + "to use the S3 capabilities. " + "You can install it with: pip install vllm[runai]" + ) from err + + if is_s3(model): + self.s3_model = S3Model() + self.s3_model.pull_files(model, allow_pattern=["*config.json"]) + self.model_weights = self.model + self.model = self.s3_model.dir + + if is_s3(tokenizer): + self.s3_tokenizer = S3Model() + self.s3_tokenizer.pull_files( + model, ignore_pattern=["*.pt", "*.safetensors", "*.bin"]) + self.tokenizer = self.s3_tokenizer.dir + def _init_multimodal_config( self, limit_mm_per_prompt: Optional[Mapping[str, int]] ) -> Optional["MultiModalConfig"]: @@ -1099,6 +1135,7 @@ class LoadFormat(str, enum.Enum): GGUF = "gguf" BITSANDBYTES = "bitsandbytes" MISTRAL = "mistral" + RUNAI_STREAMER = "runai_streamer" @dataclass diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 912a8b2f54adb..7aa45b7958e26 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -316,6 +316,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: '* "tensorizer" will load the weights using tensorizer from ' 'CoreWeave. 
See the Tensorize vLLM Model script in the Examples ' 'section for more information.\n' + '* "runai_streamer" will load the Safetensors weights using Run:ai' + 'Model Streamer \n' '* "bitsandbytes" will load the weights using bitsandbytes ' 'quantization.\n') parser.add_argument( diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index fdc4c6305bd5e..24e554e6060ab 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -45,9 +45,10 @@ filter_duplicate_safetensors_files, filter_files_not_needed_for_inference, get_gguf_extra_tensor_names, gguf_quant_weights_iterator, initialize_dummy_weights, np_cache_weights_iterator, pt_weights_iterator, - safetensors_weights_iterator) + runai_safetensors_weights_iterator, safetensors_weights_iterator) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform +from vllm.transformers_utils.utils import is_s3 from vllm.utils import is_pin_memory_available @@ -1234,6 +1235,118 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module: return model +class RunaiModelStreamerLoader(BaseModelLoader): + """ + Model loader that can load safetensors + files from local FS or S3 bucket. + """ + + def __init__(self, load_config: LoadConfig): + super().__init__(load_config) + if load_config.model_loader_extra_config: + extra_config = load_config.model_loader_extra_config + + if ("concurrency" in extra_config + and isinstance(extra_config.get("concurrency"), int)): + os.environ["RUNAI_STREAMER_CONCURRENCY"] = str( + extra_config.get("concurrency")) + + if ("memory_limit" in extra_config + and isinstance(extra_config.get("memory_limit"), int)): + os.environ["RUNAI_STREAMER_MEMORY_LIMIT"] = str( + extra_config.get("memory_limit")) + + runai_streamer_s3_endpoint = os.getenv( + 'RUNAI_STREAMER_S3_ENDPOINT') + aws_endpoint_url = os.getenv('AWS_ENDPOINT_URL') + if (runai_streamer_s3_endpoint is None + and aws_endpoint_url is not None): + os.environ["RUNAI_STREAMER_S3_ENDPOINT"] = aws_endpoint_url + + def _prepare_weights(self, model_name_or_path: str, + revision: Optional[str]) -> List[str]: + """Prepare weights for the model. + + If the model is not local, it will be downloaded.""" + is_s3_path = is_s3(model_name_or_path) + if is_s3_path: + try: + from vllm.transformers_utils.s3_utils import glob as s3_glob + except ImportError as err: + raise ImportError( + "Please install Run:ai optional dependency " + "to use the S3 capabilities. 
" + "You can install it with: pip install vllm[runai]" + ) from err + + is_local = os.path.isdir(model_name_or_path) + safetensors_pattern = "*.safetensors" + index_file = SAFE_WEIGHTS_INDEX_NAME + + hf_folder = (model_name_or_path if + (is_local or is_s3_path) else download_weights_from_hf( + model_name_or_path, + self.load_config.download_dir, + [safetensors_pattern], + revision, + ignore_patterns=self.load_config.ignore_patterns, + )) + + if is_s3_path: + hf_weights_files = s3_glob(path=hf_folder, + allow_pattern=[safetensors_pattern]) + else: + hf_weights_files = glob.glob( + os.path.join(hf_folder, safetensors_pattern)) + + if not is_local and not is_s3_path: + download_safetensors_index_file_from_hf( + model_name_or_path, index_file, self.load_config.download_dir, + revision) + + if not hf_weights_files: + raise RuntimeError( + f"Cannot find any safetensors model weights with " + f"`{model_name_or_path}`") + + return hf_weights_files + + def _get_weights_iterator( + self, model_or_path: str, + revision: str) -> Generator[Tuple[str, torch.Tensor], None, None]: + """Get an iterator for the model weights based on the load format.""" + hf_weights_files = self._prepare_weights(model_or_path, revision) + return runai_safetensors_weights_iterator(hf_weights_files) + + def download_model(self, model_config: ModelConfig) -> None: + """Download model if necessary""" + self._prepare_weights(model_config.model, model_config.revision) + + def load_model(self, vllm_config: VllmConfig) -> nn.Module: + """Perform streaming of the model to destination""" + device_config = vllm_config.device_config + model_config = vllm_config.model_config + + target_device = torch.device(device_config.device) + with set_default_torch_dtype(model_config.dtype): + with target_device: + model = _initialize_model(vllm_config=vllm_config) + + model_weights = model_config.model + if hasattr(model_config, "model_weights"): + model_weights = model_config.model_weights + model.load_weights( + self._get_weights_iterator(model_weights, + model_config.revision)) + + for _, module in model.named_modules(): + quant_method = getattr(module, "quant_method", None) + if quant_method is not None: + with device_loading_context(module, target_device): + quant_method.process_weights_after_loading(module) + return model.eval() + + def get_model_loader(load_config: LoadConfig) -> BaseModelLoader: """Get a model loader based on the load format.""" @@ -1255,4 +1368,7 @@ def get_model_loader(load_config: LoadConfig) -> BaseModelLoader: if load_config.load_format == LoadFormat.GGUF: return GGUFModelLoader(load_config) + if load_config.load_format == LoadFormat.RUNAI_STREAMER: + return RunaiModelStreamerLoader(load_config) + return DefaultModelLoader(load_config) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 9488d54edf365..f2a9e7e2687cb 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -410,6 +410,30 @@ def safetensors_weights_iterator( yield name, param +def runai_safetensors_weights_iterator( + hf_weights_files: List[str] +) -> Generator[Tuple[str, torch.Tensor], None, None]: + """Iterate over the weights in the model safetensor files.""" + try: + from runai_model_streamer import SafetensorsStreamer + except ImportError as err: + raise ImportError( + "Please install Run:ai optional dependency." 
+ "You can install it with: pip install vllm[runai]") from err + + enable_tqdm = not torch.distributed.is_initialized( + ) or torch.distributed.get_rank() == 0 + with SafetensorsStreamer() as streamer: + for st_file in tqdm( + hf_weights_files, + desc="Loading safetensors using Runai Model Streamer", + disable=not enable_tqdm, + bar_format=_BAR_FORMAT, + ): + streamer.stream_file(st_file) + yield from streamer.get_tensors() + + def pt_weights_iterator( hf_weights_files: List[str] ) -> Generator[Tuple[str, torch.Tensor], None, None]: diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py new file mode 100644 index 0000000000000..6f63dab74d696 --- /dev/null +++ b/vllm/transformers_utils/s3_utils.py @@ -0,0 +1,146 @@ +import fnmatch +import os +import shutil +import signal +import tempfile +from pathlib import Path +from typing import Optional + +import boto3 + + +def _filter_allow(paths: list[str], patterns: list[str]) -> list[str]: + return [ + path for path in paths if any( + fnmatch.fnmatch(path, pattern) for pattern in patterns) + ] + + +def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]: + return [ + path for path in paths + if not any(fnmatch.fnmatch(path, pattern) for pattern in patterns) + ] + + +def glob(s3=None, + path: str = "", + allow_pattern: Optional[list[str]] = None) -> list[str]: + """ + List full file names from S3 path and filter by allow pattern. + + Args: + s3: S3 client to use. + path: The S3 path to list from. + allow_pattern: A list of patterns of which files to pull. + + Returns: + list[str]: List of full S3 paths allowed by the pattern + """ + if s3 is None: + s3 = boto3.client("s3") + bucket_name, _, paths = list_files(s3, + path=path, + allow_pattern=allow_pattern) + return [f"s3://{bucket_name}/{path}" for path in paths] + + +def list_files( + s3, + path: str, + allow_pattern: Optional[list[str]] = None, + ignore_pattern: Optional[list[str]] = None +) -> tuple[str, str, list[str]]: + """ + List files from S3 path and filter by pattern. + + Args: + s3: S3 client to use. + path: The S3 path to list from. + allow_pattern: A list of patterns of which files to pull. + ignore_pattern: A list of patterns of which files not to pull. + + Returns: + tuple[str, str, list[str]]: A tuple where: + - The first element is the bucket name + - The second element is string represent the bucket + and the prefix as a dir like string + - The third element is a list of files allowed or + disallowed by pattern + """ + parts = path.removeprefix('s3://').split('/') + prefix = '/'.join(parts[1:]) + bucket_name = parts[0] + + objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix) + paths = [obj['Key'] for obj in objects.get('Contents', [])] + + paths = _filter_ignore(paths, ["*/"]) + if allow_pattern is not None: + paths = _filter_allow(paths, allow_pattern) + + if ignore_pattern is not None: + paths = _filter_ignore(paths, ignore_pattern) + + return bucket_name, prefix, paths + + +class S3Model: + """ + A class representing a S3 model mirrored into a temporary directory. + + Attributes: + s3: S3 client. + dir: The temporary created directory. + + Methods: + pull_files(): Pull model from S3 to the temporary directory. 
+ """ + + def __init__(self) -> None: + self.s3 = boto3.client('s3') + for sig in (signal.SIGINT, signal.SIGTERM): + existing_handler = signal.getsignal(sig) + signal.signal(sig, self._close_by_signal(existing_handler)) + self.dir = tempfile.mkdtemp() + + def __del__(self): + self._close() + + def _close(self) -> None: + if os.path.exists(self.dir): + shutil.rmtree(self.dir) + + def _close_by_signal(self, existing_handler=None): + + def new_handler(signum, frame): + self._close() + if existing_handler: + existing_handler(signum, frame) + + return new_handler + + def pull_files(self, + s3_model_path: str = "", + allow_pattern: Optional[list[str]] = None, + ignore_pattern: Optional[list[str]] = None) -> None: + """ + Pull files from S3 storage into the temporary directory. + + Args: + s3_model_path: The S3 path of the model. + allow_pattern: A list of patterns of which files to pull. + ignore_pattern: A list of patterns of which files not to pull. + + """ + bucket_name, base_dir, files = list_files(self.s3, s3_model_path, + allow_pattern, + ignore_pattern) + if len(files) == 0: + return + + for file in files: + destination_file = self.dir + file.removeprefix(base_dir) + local_dir = Path(destination_file).parent + os.makedirs(local_dir, exist_ok=True) + self.s3.download_file(bucket_name, file, destination_file) diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py index 7a9041b04fbb9..10a09fb4f566c 100644 --- a/vllm/transformers_utils/utils.py +++ b/vllm/transformers_utils/utils.py @@ -3,6 +3,10 @@ from typing import Union +def is_s3(model_or_path: str) -> bool: + return model_or_path.lower().startswith('s3://') + + def check_gguf_file(model: Union[str, PathLike]) -> bool: """Check if the file is a GGUF model.""" model = Path(model) From d573aeadcc891976f09d6d50f1a4f98c8ff809aa Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 20 Dec 2024 14:03:50 -0500 Subject: [PATCH 12/14] [Bugfix] Don't log OpenAI field aliases as ignored (#11378) Signed-off-by: mgoin --- vllm/entrypoints/openai/protocol.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 1314de714215e..1d8b0d19f9516 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -46,7 +46,15 @@ class OpenAIBaseModel(BaseModel): @classmethod def __log_extra_fields__(cls, data): if isinstance(data, dict): - extra_fields = data.keys() - cls.model_fields.keys() + # Get all class field names and their potential aliases + field_names = set() + for field_name, field in cls.model_fields.items(): + field_names.add(field_name) + if hasattr(field, 'alias') and field.alias: + field_names.add(field.alias) + + # Compare against both field names and aliases + extra_fields = data.keys() - field_names if extra_fields: logger.warning( "The following fields were present in the request " From 5d2248d81ab1f83a2874bfa726f0a1933ef2d048 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 20 Dec 2024 13:00:56 -0800 Subject: [PATCH 13/14] [doc] explain nccl requirements for rlhf (#11381) Signed-off-by: youkaichao --- docs/source/getting_started/debugging.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 7f36d65a227f0..b123960533816 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -200,3 +200,4 @@ try this instead: Known Issues 
---------------------------------------- - In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq `_ , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of ``vllm`` to include the `fix `_. +- To circumvent a NCCL `bug `__ , all vLLM processes will set an environment variable ``NCCL_CUMEM_ENABLE=0`` to disable NCCL's ``cuMem`` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up a NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in `the RLHF integration `__ and the `discussion `__ . From 47a0b615b45efd0a9ed57049d8ca6eff1c249844 Mon Sep 17 00:00:00 2001 From: Jiaxin Shan Date: Fri, 20 Dec 2024 13:54:55 -0800 Subject: [PATCH 14/14] Add ray[default] to wget to run distributed inference out of box (#11265) Signed-off-by: Jiaxin Shan --- Dockerfile | 2 +- requirements-cuda.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 84350cde59bfb..6226569e9d3b4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -163,7 +163,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && apt-get update -y \ - && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \ + && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \ && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ && add-apt-repository ppa:deadsnakes/ppa \ && apt-get update -y \ diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 058ab7c1ee9df..8002fbd8ee5b9 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -2,7 +2,7 @@ -r requirements-common.txt # Dependencies for NVIDIA GPUs -ray >= 2.9 +ray[default] >= 2.9 nvidia-ml-py >= 12.560.30 # for pynvml package torch == 2.5.1 # These must be updated alongside torch