From adac58e10fc2e2a217bdef7eb21d88d36d5bfee1 Mon Sep 17 00:00:00 2001
From: Agata Dobrzyniewicz
Date: Tue, 17 Dec 2024 11:26:19 +0200
Subject: [PATCH 01/67] multimodality fix
---
vllm/attention/layer.py | 2 +-
vllm/worker/hpu_model_runner.py | 35 +++++++++++++++++++++++++++++----
2 files changed, 32 insertions(+), 5 deletions(-)
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 05d997279893b..614cbd799249e 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -228,7 +228,7 @@ def forward(
value,
scale=self.scale)
out = out.transpose(1, 2)
- return out.view(bsz, q_len, -1)
+ return out.contiguous().view(bsz, q_len, -1)
def unified_attention(
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 7c3679d40546d..83d9af5896725 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -43,7 +43,7 @@
from vllm.model_executor.models import supports_multimodal
from vllm.model_executor.sampling_metadata import SequenceGroupToSample
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
- MultiModalKwargs, MultiModalRegistry)
+ MultiModalKwargs, MultiModalRegistry, MultiModalPlaceholderMap)
from vllm.sampling_params import SamplingParams
from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors,
Logprob, SequenceData, SequenceGroupMetadata,
@@ -793,7 +793,10 @@ def _prepare_prompt(
query_lens: List[int] = []
prefix_block_tables: List[List[int]] = []
multi_modal_kwargs_list: List[MultiModalKwargs] = []
-
+ multi_modal_placeholder_maps: Dict[
+ str,
+ MultiModalPlaceholderMap] = collections.defaultdict(MultiModalPlaceholderMap)
+
if len(seq_group_metadata_list) == 0:
return PreparePromptMetadata.empty()
@@ -850,11 +853,35 @@ def _prepare_prompt(
# is always the first token in the sequence.
input_positions.append(list(range(context_len, seq_len)))
- mm_data = seq_group_metadata.multi_modal_data
- if mm_data:
+ mm_data = seq_group_metadata.multi_modal_data
+ if mm_data:
mm_kwargs = self.multi_modal_input_mapper(mm_data)
multi_modal_kwargs_list.append(mm_kwargs)
+ computed_len = seq_data.get_num_computed_tokens()
+ positions_range = range(computed_len, seq_len)
+
+ if seq_group_metadata.multi_modal_data:
+ mm_data, placeholder_maps = MultiModalPlaceholderMap \
+ .from_seq_group(seq_group_metadata, positions_range)
+
+ print(mm_data)
+
+ if self.mm_registry.has_processor(
+ self.model_config):
+ mm_kwargs = mm_data
+ else:
+ mm_kwargs = self.multi_modal_input_mapper(
+ mm_data,
+ seq_group_metadata.mm_processor_kwargs,
+ )
+
+ multi_modal_kwargs_list.append(mm_kwargs)
+
+ for modality, placeholder_map in placeholder_maps.items():
+ multi_modal_placeholder_maps[modality].extend(
+ placeholder_map)
+
if seq_group_metadata.block_tables is None:
# During memory profiling, the block tables are not initialized
# yet. In this case, we just use a dummy slot mapping.
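
Editor's note on the `attention/layer.py` hunk above: it replaces `out.view(...)` with `out.contiguous().view(...)` because `transpose(1, 2)` returns a non-contiguous view, and `Tensor.view` cannot merge the head and hidden dimensions of such a tensor, so it raises a RuntimeError. Below is a minimal standalone PyTorch sketch of the failure and the fix; the shapes are illustrative only, and note that patch 03 later undoes this `layer.py` change.

```python
import torch

bsz, num_heads, q_len, head_dim = 2, 4, 8, 16
out = torch.randn(bsz, num_heads, q_len, head_dim)

out = out.transpose(1, 2)  # (bsz, q_len, num_heads, head_dim), no longer contiguous
try:
    out.view(bsz, q_len, -1)  # fails: view needs strides compatible with the merge
except RuntimeError as err:
    print("view() on the transposed tensor fails:", err)

merged = out.contiguous().view(bsz, q_len, -1)  # what the patch does
print(merged.shape)  # torch.Size([2, 8, 64])
# out.reshape(bsz, q_len, -1) would also work, copying only when necessary.
```
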
From e8ce81e8cd3f29f6c74792db3e780b414240697a Mon Sep 17 00:00:00 2001
From: Agata Dobrzyniewicz
Date: Tue, 17 Dec 2024 15:17:41 +0200
Subject: [PATCH 02/67] formatting
---
vllm/worker/hpu_model_runner.py | 18 ++++++++----------
1 file changed, 8 insertions(+), 10 deletions(-)
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 83d9af5896725..2eb54ceabf46f 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -43,7 +43,8 @@
from vllm.model_executor.models import supports_multimodal
from vllm.model_executor.sampling_metadata import SequenceGroupToSample
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
- MultiModalKwargs, MultiModalRegistry, MultiModalPlaceholderMap)
+ MultiModalKwargs, MultiModalPlaceholderMap,
+ MultiModalRegistry)
from vllm.sampling_params import SamplingParams
from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors,
Logprob, SequenceData, SequenceGroupMetadata,
@@ -794,9 +795,9 @@ def _prepare_prompt(
prefix_block_tables: List[List[int]] = []
multi_modal_kwargs_list: List[MultiModalKwargs] = []
multi_modal_placeholder_maps: Dict[
- str,
- MultiModalPlaceholderMap] = collections.defaultdict(MultiModalPlaceholderMap)
-
+ str, MultiModalPlaceholderMap] = collections.defaultdict(
+ MultiModalPlaceholderMap)
+
if len(seq_group_metadata_list) == 0:
return PreparePromptMetadata.empty()
@@ -853,8 +854,8 @@ def _prepare_prompt(
# is always the first token in the sequence.
input_positions.append(list(range(context_len, seq_len)))
- mm_data = seq_group_metadata.multi_modal_data
- if mm_data:
+ mm_data = seq_group_metadata.multi_modal_data
+ if mm_data:
mm_kwargs = self.multi_modal_input_mapper(mm_data)
multi_modal_kwargs_list.append(mm_kwargs)
@@ -865,10 +866,7 @@ def _prepare_prompt(
mm_data, placeholder_maps = MultiModalPlaceholderMap \
.from_seq_group(seq_group_metadata, positions_range)
- print(mm_data)
-
- if self.mm_registry.has_processor(
- self.model_config):
+ if self.mm_registry.has_processor(self.model_config):
mm_kwargs = mm_data
else:
mm_kwargs = self.multi_modal_input_mapper(
From 67df809851750200514999ff489c35bda6b504a2 Mon Sep 17 00:00:00 2001
From: Agata Dobrzyniewicz
Date: Wed, 18 Dec 2024 17:32:03 +0200
Subject: [PATCH 03/67] undo changes in layer.py
---
vllm/attention/layer.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 614cbd799249e..05d997279893b 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -228,7 +228,7 @@ def forward(
value,
scale=self.scale)
out = out.transpose(1, 2)
- return out.contiguous().view(bsz, q_len, -1)
+ return out.view(bsz, q_len, -1)
def unified_attention(
From 1259d8db566f7e1d4b1453358d3c4c310a9c332d Mon Sep 17 00:00:00 2001
From: Agata Dobrzyniewicz
Date: Fri, 20 Dec 2024 11:01:03 +0200
Subject: [PATCH 04/67] remove past code
---
vllm/worker/hpu_model_runner.py | 5 -----
1 file changed, 5 deletions(-)
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 1aaa8cd108d6e..673536cc4abe2 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -865,11 +865,6 @@ def _prepare_prompt(
# is always the first token in the sequence.
input_positions.append(list(range(context_len, seq_len)))
- mm_data = seq_group_metadata.multi_modal_data
- if mm_data:
- mm_kwargs = self.multi_modal_input_mapper(mm_data)
- multi_modal_kwargs_list.append(mm_kwargs)
-
computed_len = seq_data.get_num_computed_tokens()
positions_range = range(computed_len, seq_len)
From cbfb022a9a3fb5b8f2455c0596f1e69ddf0ef8e6 Mon Sep 17 00:00:00 2001
From: Agata Dobrzyniewicz
Date: Thu, 9 Jan 2025 09:53:09 +0200
Subject: [PATCH 05/67] send placeholder_index_maps
---
vllm/worker/hpu_model_runner.py | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 673536cc4abe2..40c8433afb6af 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -865,12 +865,10 @@ def _prepare_prompt(
# is always the first token in the sequence.
input_positions.append(list(range(context_len, seq_len)))
- computed_len = seq_data.get_num_computed_tokens()
- positions_range = range(computed_len, seq_len)
-
if seq_group_metadata.multi_modal_data:
+ positions = input_positions[0]
mm_data, placeholder_maps = MultiModalPlaceholderMap \
- .from_seq_group(seq_group_metadata, positions_range)
+ .from_seq_group(seq_group_metadata, range(positions[0], positions[0] + len(positions)))
if self.mm_registry.has_processor(self.model_config):
mm_kwargs = mm_data
@@ -988,6 +986,12 @@ def _prepare_prompt(
dtype=torch.long,
device='cpu')
+ placeholder_index_maps = {
+ modality: placeholder_map.index_map()
+ for modality, placeholder_map in
+ multi_modal_placeholder_maps.items()
+ }
+
# Note: num_prefill_tokens is calculated using the length of
# input_tokens after padding.
num_prefill_tokens = input_tokens_tensor.numel()
@@ -1021,8 +1025,7 @@ def _prepare_prompt(
num_prefill_tokens=num_prefill_tokens,
num_decode_tokens=0,
slot_mapping=slot_mapping,
- multi_modal_placeholder_index_maps=
- None # FIXME(kzawora): mutli-modality will not work here
+ multi_modal_placeholder_index_maps=placeholder_index_maps
)
multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)
for t in multi_modal_kwargs:
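
Editor's note: taken together, patches 01-05 leave `_prepare_prompt` with the following flow. For each sequence group, `MultiModalPlaceholderMap.from_seq_group` returns the multimodal data plus a per-modality placeholder map for the scheduled position range; the maps are merged across the batch in a `defaultdict`; and `index_map()` is called once at the end to build the `multi_modal_placeholder_index_maps` passed into the attention metadata. The toy sketch below mirrors only that accumulation pattern; `ToyPlaceholderMap` is a made-up stand-in that deliberately ignores the offset bookkeeping the real `MultiModalPlaceholderMap` performs.

```python
import collections
from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class ToyPlaceholderMap:
    """Toy stand-in for MultiModalPlaceholderMap (illustrative only)."""
    positions: List[int] = field(default_factory=list)

    def extend(self, other: "ToyPlaceholderMap") -> None:
        # The real class also tracks source/destination slices; omitted here.
        self.positions.extend(other.positions)

    def index_map(self) -> List[int]:
        return list(self.positions)

# Batch-level accumulation, mirroring _prepare_prompt in the patches above.
batch_maps: Dict[str, ToyPlaceholderMap] = collections.defaultdict(ToyPlaceholderMap)

per_sequence_maps = [
    {"image": ToyPlaceholderMap([3, 4, 5, 6])},
    {"image": ToyPlaceholderMap([1, 2]), "audio": ToyPlaceholderMap([10, 11])},
]
for seq_maps in per_sequence_maps:
    for modality, placeholder_map in seq_maps.items():
        batch_maps[modality].extend(placeholder_map)

# Analogous to the placeholder_index_maps dict comprehension in patch 05.
placeholder_index_maps = {
    modality: placeholder_map.index_map()
    for modality, placeholder_map in batch_maps.items()
}
print(placeholder_index_maps)
# {'image': [3, 4, 5, 6, 1, 2], 'audio': [10, 11]}
```
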
From bb354e6b2dd4f8154c39c33b7eee77fa452b7703 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Tue, 14 Jan 2025 20:16:11 +0800
Subject: [PATCH 06/67] [Bugfix] Fix various bugs in multi-modal processor
(#12031)
Signed-off-by: DarkLight1337
---
tests/multimodal/test_processing.py | 19 ++++++
vllm/multimodal/processing.py | 89 +++++++++++++----------------
vllm/multimodal/registry.py | 5 +-
3 files changed, 62 insertions(+), 51 deletions(-)
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index 54269c3ef7ce0..9e58ed4cfde93 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -421,6 +421,8 @@ def test_find_replace_tokens(
"pattern_1": [32000, 32000],
"pattern_2": [],
"pattern_3": [1550, 918, 1550],
+ # Test different modalities having the same tokens (32000)
+ "pattern_4": [32000],
},
],
)
@@ -438,6 +440,14 @@ def test_find_replace_tokens(
replacement=[32000, 32000],
),
],
+ "pattern_4": [
+ PlaceholderInfo(
+ modality="pattern_4",
+ item_idx=0,
+ start_idx=3,
+ replacement=[32000],
+ ),
+ ],
}
),
@@ -466,6 +476,7 @@ def test_find_replace_tokens(
replacement=[1550, 918, 1550],
),
],
+ # No match for pattern_4 as it has lower priority than pattern_1
}
),
(
@@ -485,6 +496,14 @@ def test_find_replace_tokens(
replacement=[32000, 32000],
),
],
+ "pattern_4": [
+ PlaceholderInfo(
+ modality="pattern_4",
+ item_idx=0,
+ start_idx=5,
+ replacement=[32000],
+ ),
+ ],
"pattern_3": [
PlaceholderInfo(
modality="pattern_3",
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 8b47dfb07387f..fa199a07b4cf8 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -404,73 +404,62 @@ def replace_text_matches(
return "".join(texts)
-def _iter_modality_placeholders(
+def _iter_placeholders(
+ mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
prompt: list[int],
- modality: str,
- modality_repls: Sequence[BoundPromptReplacement],
- modal_item_count: int,
+ mm_item_counts: Mapping[str, int],
) -> Iterable[PlaceholderInfo]:
- if modal_item_count == 0:
- return
+ """
+ Yield each set of placeholder tokens found in :code:`prompt`.
+
+ Matches are exclusive even when multiple modalities share
+ the same placeholder tokens. In that case, the modality that
+ appears earlier in `mm_prompt_repls` takes priority.
+ Note that empty matches are ignored.
+ """
prompt_len = len(prompt)
- item_idx = 0
+ item_idx_by_modality = defaultdict[str, int](lambda: 0)
start_idx = 0
while start_idx < prompt_len:
found = False
- for repl_info in modality_repls:
- replacement = repl_info.get_replacement(item_idx)
- repl_tokens = replacement.token_ids
- repl_len = len(repl_tokens)
- end_idx = start_idx + repl_len
-
- if repl_len == 0 or end_idx > prompt_len:
+ for modality, modality_repls in mm_prompt_repls.items():
+ item_idx = item_idx_by_modality[modality]
+ if item_idx >= mm_item_counts.get(modality, 0):
continue
- if prompt[start_idx:end_idx] == repl_tokens:
- yield PlaceholderInfo(
- modality=modality,
- item_idx=item_idx,
- start_idx=start_idx,
- replacement=repl_tokens,
- )
+ for repl_info in modality_repls:
+ replacement = repl_info.get_replacement(item_idx)
+ repl_tokens = replacement.token_ids
+ repl_len = len(repl_tokens)
+ end_idx = start_idx + repl_len
+
+ if repl_len == 0 or end_idx > prompt_len:
+ continue
+
+ if prompt[start_idx:end_idx] == repl_tokens:
+ yield PlaceholderInfo(
+ modality=modality,
+ item_idx=item_idx,
+ start_idx=start_idx,
+ replacement=repl_tokens,
+ )
- item_idx += 1
- if item_idx >= modal_item_count:
- return
+ # Exclude overlapping matches
+ start_idx = end_idx
+ item_idx_by_modality[modality] += 1
+ found = True
+ break
- # Exclude overlapping matches
- start_idx = end_idx
- found = True
- break
+ if found:
+ break # Go back to the outer while loop
if not found:
start_idx += 1
-def _iter_placeholders(
- mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
- prompt: list[int],
- mm_item_counts: Mapping[str, int],
-) -> Iterable[PlaceholderInfo]:
- """
- For each modality, yield each set of placeholder tokens found in
- :code:`prompt`.
-
- Note that empty matches are ignored.
- """
- for modality, modal_item_count in mm_item_counts.items():
- if modality in mm_prompt_repls:
- yield from _iter_modality_placeholders(
- prompt,
- modality,
- mm_prompt_repls[modality],
- modal_item_count,
- )
-
-
def find_mm_placeholders(
mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
prompt: list[int],
@@ -1156,7 +1145,7 @@ def apply(
# If HF processor already inserts placeholder tokens,
# there is no need for us to insert them
- if all(len(repls) == 0 for repls in mm_missing_repls.items()):
+ if all(len(repls) == 0 for repls in mm_missing_repls.values()):
tokenizer = self.info.get_tokenizer()
prompt = decode_tokens(tokenizer, prompt_ids)
mm_placeholders = hf_mm_placeholders
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 804a91da8c889..2961f7c76ca12 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -259,7 +259,10 @@ def get_max_tokens_per_item_by_modality(
This is currently directly used only in V1.
"""
if self.has_processor(model_config):
- tokenizer = cached_get_tokenizer(model_config.tokenizer)
+ tokenizer = cached_get_tokenizer(
+ model_config.tokenizer,
+ trust_remote_code=model_config.trust_remote_code,
+ )
processor = self.create_processor(model_config, tokenizer)
seq_len = model_config.max_model_len
return processor.info.get_mm_max_tokens_per_item(seq_len)
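
Editor's note: the rewritten `_iter_placeholders` above performs a single left-to-right scan of the prompt and, at every position, tries each modality's replacement token sequences in the order the modalities appear in `mm_prompt_repls`, so when two modalities share the same placeholder tokens the earlier one wins; matched spans are skipped so matches never overlap, and empty replacements are ignored. A stripped-down sketch of the same greedy scan over plain token lists (the helper name `find_placeholders` and the simplified types are mine, not vLLM's):

```python
from collections import defaultdict
from typing import Dict, Iterable, List, Mapping, NamedTuple, Sequence

class Placeholder(NamedTuple):
    modality: str
    item_idx: int
    start_idx: int
    replacement: List[int]

def find_placeholders(
    repls_by_modality: Mapping[str, Sequence[List[int]]],
    prompt: List[int],
    item_counts: Mapping[str, int],
) -> Iterable[Placeholder]:
    """Greedy left-to-right scan; earlier modalities win ties, matches never overlap."""
    item_idx_by_modality: Dict[str, int] = defaultdict(int)
    start_idx = 0
    while start_idx < len(prompt):
        found = False
        for modality, repls in repls_by_modality.items():
            item_idx = item_idx_by_modality[modality]
            if item_idx >= item_counts.get(modality, 0):
                continue
            for repl_tokens in repls:
                end_idx = start_idx + len(repl_tokens)
                if repl_tokens and prompt[start_idx:end_idx] == repl_tokens:
                    yield Placeholder(modality, item_idx, start_idx, repl_tokens)
                    start_idx = end_idx          # exclude overlapping matches
                    item_idx_by_modality[modality] += 1
                    found = True
                    break
            if found:
                break
        if not found:
            start_idx += 1

# Two modalities sharing token 32000: "image" is listed first, so it wins the tie.
prompt = [1, 32000, 32000, 32000, 9, 9]
repls = {"image": [[32000, 32000]], "audio": [[32000]]}
print(list(find_placeholders(repls, prompt, {"image": 1, "audio": 1})))
# image item 0 matches at index 1..2, then audio item 0 matches at index 3
```
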
From 1f18adb2451e9b45048d17023169bd8cbb39747e Mon Sep 17 00:00:00 2001
From: Chen Zhang
Date: Tue, 14 Jan 2025 20:59:32 +0800
Subject: [PATCH 07/67] [Kernel] Revert the API change of Attention.forward
(#12038)
Signed-off-by: Chen Zhang
---
vllm/attention/layer.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index c7e7a4d52e5a7..a06db075f334d 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -134,8 +134,8 @@ def forward(
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
- _kv_cache: torch.Tensor,
- _attn_metadata: AttentionMetadata,
+ kv_cache: torch.Tensor,
+ attn_metadata: AttentionMetadata,
) -> torch.Tensor:
if self.use_output:
output = torch.empty_like(query)
From 2e0e0176104965c9c8c090609f331f9b70e492f3 Mon Sep 17 00:00:00 2001
From: wangxiyuan
Date: Tue, 14 Jan 2025 21:27:04 +0800
Subject: [PATCH 08/67] [Platform] Add output for Attention Backend (#11981)
Signed-off-by: wangxiyuan
---
vllm/attention/backends/abstract.py | 4 ++++
vllm/attention/backends/flash_attn.py | 2 ++
vllm/attention/layer.py | 6 +-----
vllm/v1/attention/backends/flash_attn.py | 2 ++
4 files changed, 9 insertions(+), 5 deletions(-)
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index f5dcaea79af93..737559bfe70ca 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -31,6 +31,10 @@ class AttentionType:
class AttentionBackend(ABC):
"""Abstract class for attention backends."""
+ # For some attention backends, we allocate an output tensor before
+ # calling the custom op. When piecewise cudagraph is enabled, this
+ # makes sure the output tensor is allocated inside the cudagraph.
+ accept_output_buffer: bool = False
@staticmethod
@abstractmethod
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 23ea244f07dfe..48b3e8d177ec9 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -29,6 +29,8 @@
class FlashAttentionBackend(AttentionBackend):
+ accept_output_buffer: bool = True
+
@staticmethod
def get_supported_head_sizes() -> List[int]:
return [32, 64, 96, 128, 160, 192, 224, 256]
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index a06db075f334d..a283e87d84070 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -110,11 +110,7 @@ def __init__(
self.use_direct_call = not current_platform.is_cuda_alike(
) and not current_platform.is_cpu()
- # For some attention backends, we allocate an output tensor before
- # calling the custom op. When piecewise cudagraph is enabled, this
- # makes sure the output tensor is allocated inside the cudagraph.
- self.use_output = self.backend == _Backend.FLASH_ATTN or \
- self.backend == _Backend.FLASH_ATTN_VLLM_V1
+ self.use_output = attn_backend.accept_output_buffer
compilation_config = get_current_vllm_config().compilation_config
if prefix in compilation_config.static_forward_context:
raise ValueError(f"Duplicate layer name: {prefix}")
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index b02bc9ffde538..7b0786261a6a6 100644
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -15,6 +15,8 @@
class FlashAttentionBackend(AttentionBackend):
+ accept_output_buffer: bool = True
+
@staticmethod
def get_supported_head_sizes() -> List[int]:
return [32, 64, 96, 128, 160, 192, 224, 256]
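
Editor's note: patch 08 replaces the hard-coded `_Backend.FLASH_ATTN` check in `Attention.__init__` with an `accept_output_buffer` class attribute that each backend declares for itself, so platforms can add output-buffer-capable backends without touching the layer. A small sketch of the pattern with made-up class names (not the real vLLM classes):

```python
from abc import ABC
import torch

class ToyAttentionBackend(ABC):
    # Backends that can write into a caller-provided tensor opt in here, so the
    # layer can allocate the output inside a piecewise CUDA graph.
    accept_output_buffer: bool = False

class ToyFlashAttnBackend(ToyAttentionBackend):
    accept_output_buffer = True

class ToyAttentionLayer:
    def __init__(self, backend: type[ToyAttentionBackend]):
        # Replaces: use_output = backend is FLASH_ATTN or FLASH_ATTN_VLLM_V1
        self.use_output = backend.accept_output_buffer

    def forward(self, query: torch.Tensor) -> torch.Tensor:
        if self.use_output:
            output = torch.empty_like(query)  # allocated up front, filled by the kernel
            output.copy_(query)               # stand-in for the real attention op
            return output
        return query                          # stand-in for backends that return a new tensor

layer = ToyAttentionLayer(ToyFlashAttnBackend)
print(layer.use_output)  # True
```
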
From a2d2acb4c8d240b1e5946afe2736e497ce5b71a2 Mon Sep 17 00:00:00 2001
From: Chen Zhang
Date: Tue, 14 Jan 2025 23:45:05 +0800
Subject: [PATCH 09/67] [Bugfix][Kernel] Give unique name to
BlockSparseFlashAttention (#12040)
Signed-off-by: Chen Zhang
---
vllm/attention/backends/blocksparse_attn.py | 3 +--
vllm/platforms/interface.py | 1 +
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py
index 7089d59392c36..77cfa8490172b 100644
--- a/vllm/attention/backends/blocksparse_attn.py
+++ b/vllm/attention/backends/blocksparse_attn.py
@@ -89,8 +89,7 @@ class BlocksparseFlashAttentionBackend(AttentionBackend):
@staticmethod
def get_name() -> str:
- # For attention layer compatibility
- return "FLASH_ATTN"
+ return "BLOCK_SPARSE_FLASH_ATTN"
@staticmethod
def get_impl_cls() -> Type["BlocksparseFlashAttentionImpl"]:
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 3c2ec9636df91..85fde76796901 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -33,6 +33,7 @@ class _Backend(enum.Enum):
HPU_ATTN = enum.auto()
PALLAS = enum.auto()
IPEX = enum.auto()
+ BLOCK_SPARSE_FLASH_ATTN = enum.auto()
NO_ATTENTION = enum.auto()
From c9d6ff530b32c526bedda3105dcbab3d2f6ce992 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Tue, 14 Jan 2025 16:05:50 +0000
Subject: [PATCH 10/67] Explain where the engine args go when using Docker
(#12041)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
docs/source/deployment/docker.md | 2 ++
1 file changed, 2 insertions(+)
diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md
index 9e301483ef7f9..2606e2765c1ae 100644
--- a/docs/source/deployment/docker.md
+++ b/docs/source/deployment/docker.md
@@ -19,6 +19,8 @@ $ docker run --runtime nvidia --gpus all \
--model mistralai/Mistral-7B-v0.1
```
+You can add any other engine arguments you need after the image tag (`vllm/vllm-openai:latest`).
+
```{note}
You can either use the `ipc=host` flag or `--shm-size` flag to allow the
container to access the host's shared memory. vLLM uses PyTorch, which uses shared
From 87054a57ab39bad6c7fe8999e7d93566ded713e3 Mon Sep 17 00:00:00 2001
From: maang-h <55082429+maang-h@users.noreply.github.com>
Date: Wed, 15 Jan 2025 01:03:04 +0800
Subject: [PATCH 11/67] [Doc]: Update the Json Example of the `Engine
Arguments` document (#12045)
---
vllm/engine/arg_utils.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 0850bab6bb7e1..c31b206d6f60e 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -538,7 +538,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
default=None,
type=json.loads,
help='RoPE scaling configuration in JSON format. '
- 'For example, {"rope_type":"dynamic","factor":2.0}')
+ 'For example, ``{"rope_type":"dynamic","factor":2.0}``')
parser.add_argument('--rope-theta',
default=None,
type=float,
@@ -607,7 +607,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
default=None,
type=json.loads,
help=('Overrides for the multimodal input mapping/processing, '
- 'e.g., image processor. For example: {"num_crops": 4}.'))
+ 'e.g., image processor. For example: ``{"num_crops": 4}``.'))
parser.add_argument(
'--disable-mm-preprocessor-cache',
action='store_true',
@@ -908,13 +908,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
type=json.loads,
default=None,
help="Override or set neuron device configuration. "
- "e.g. {\"cast_logits_dtype\": \"bloat16\"}.'")
+ "e.g. ``{\"cast_logits_dtype\": \"bloat16\"}``.")
parser.add_argument(
'--override-pooler-config',
type=PoolerConfig.from_json,
default=None,
help="Override or set the pooling method for pooling models. "
- "e.g. {\"pooling_type\": \"mean\", \"normalize\": false}.'")
+ "e.g. ``{\"pooling_type\": \"mean\", \"normalize\": false}``.")
parser.add_argument('--compilation-config',
'-O',
From a3a3ee4e6febe8c270fdec0765c844186a728079 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Wed, 15 Jan 2025 07:49:49 +0800
Subject: [PATCH 12/67] [Misc] Merge bitsandbytes_stacked_params_mapping and
packed_modules_mapping (#11924)
Signed-off-by: Jee Jee Li
---
vllm/model_executor/model_loader/loader.py | 27 +++++++++-------------
vllm/model_executor/model_loader/utils.py | 26 ++++++++++++++++++++-
vllm/model_executor/models/baichuan.py | 7 ------
vllm/model_executor/models/exaone.py | 8 -------
vllm/model_executor/models/falcon.py | 6 ++---
vllm/model_executor/models/gemma.py | 9 --------
vllm/model_executor/models/gemma2.py | 10 --------
vllm/model_executor/models/granite.py | 8 -------
vllm/model_executor/models/idefics3.py | 10 --------
vllm/model_executor/models/llama.py | 10 --------
vllm/model_executor/models/llava.py | 12 ++++------
vllm/model_executor/models/minicpm.py | 10 --------
vllm/model_executor/models/minicpm3.py | 6 -----
vllm/model_executor/models/minicpmv.py | 20 ----------------
vllm/model_executor/models/mllama.py | 11 +++------
vllm/model_executor/models/molmo.py | 6 -----
vllm/model_executor/models/nemotron.py | 6 -----
vllm/model_executor/models/opt.py | 10 +++-----
vllm/model_executor/models/phi.py | 8 -------
vllm/model_executor/models/phi3.py | 4 ----
vllm/model_executor/models/qwen.py | 7 ------
vllm/model_executor/models/qwen2.py | 10 --------
vllm/model_executor/models/qwen2_vl.py | 10 --------
vllm/model_executor/models/solar.py | 8 -------
24 files changed, 49 insertions(+), 200 deletions(-)
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 0033fbff0e9ac..9fe0db62435a0 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -39,7 +39,8 @@
from vllm.model_executor.model_loader.tensorizer import (
TensorizerConfig, is_vllm_tensorized, load_with_tensorizer,
serialize_vllm_model, tensorizer_weights_iterator)
-from vllm.model_executor.model_loader.utils import (get_model_architecture,
+from vllm.model_executor.model_loader.utils import (ParamMapping,
+ get_model_architecture,
set_default_torch_dtype)
from vllm.model_executor.model_loader.weight_utils import (
download_safetensors_index_file_from_hf, download_weights_from_hf,
@@ -983,21 +984,11 @@ def _unquantized_generator(self, hf_weights_files, use_safetensors,
def _get_bnb_target_modules(self, model: nn.Module) -> None:
- # TODO: Maybe we can replace bitsandbytes_stacked_params_mapping with
- # packed_modules_mapping.
- inverse_stacked_mapping: Dict[str, List[str]] = {}
- for orig, (
- packed,
- idx,
- ) in model.bitsandbytes_stacked_params_mapping.items():
- if packed not in inverse_stacked_mapping:
- inverse_stacked_mapping[packed] = []
- inverse_stacked_mapping[packed].insert(idx, orig)
-
for name, module in model.named_modules():
if isinstance(module, (LinearBase, )):
last_name = name.split(".")[-1]
- if sub_modules := inverse_stacked_mapping.get(last_name, []):
+ if sub_modules := self.modules_mapping.packed_mapping.get(
+ last_name, []):
# Map vllm's names to transformers's names.
for sub_name in sub_modules:
self.target_modules.append(
@@ -1018,15 +1009,19 @@ def _load_weights(self, model_config: ModelConfig,
"The required method 'load_weights' is not defined in class"
f" {type(model).__name__}.")
- if not hasattr(model, "bitsandbytes_stacked_params_mapping"):
+ if not hasattr(model, "packed_modules_mapping"):
raise AttributeError(
f"Model {type(model).__name__} does not support BitsAndBytes "
- "quantization yet.")
+ "quantization yet. No 'packed_modules_mapping' found.")
+
+ self.modules_mapping = ParamMapping(
+ copy.deepcopy(model.packed_modules_mapping))
# For some models like Molmo, we need to use hf_to_vllm_mapper
# to ensure correct loading of weights.
if hf_to_vllm_mapper := getattr(model, "hf_to_vllm_mapper", None):
self.weight_mapper = lambda name: hf_to_vllm_mapper._map_name(name)
+
# Modules whose weights might have fused on disk
# we need their output_sizes to make shard in flight correctly with TP
self.maybe_fused_weights_modules: Dict[str, List[int]] = {}
@@ -1109,7 +1104,7 @@ def _load_weights(self, model_config: ModelConfig,
for shard_name, (
weight_name,
index,
- ) in model.bitsandbytes_stacked_params_mapping.items():
+ ) in self.modules_mapping.inverse_packed_mapping.items():
shard_pos = quant_param_name.find(shard_name)
# Some models, such as MiniCPM V2.5/2.6, contain both
# module names 'kv_proj' and 'qkv_proj'. To prevent 'kv_proj'
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index 44978a55e072d..3f923d2f6632a 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -1,6 +1,7 @@
"""Utilities for selecting and loading models."""
import contextlib
-from typing import Tuple, Type
+from dataclasses import dataclass, field
+from typing import Dict, List, Tuple, Type
import torch
from torch import nn
@@ -49,3 +50,26 @@ def get_model_architecture(
def get_architecture_class_name(model_config: ModelConfig) -> str:
return get_model_architecture(model_config)[1]
+
+
+@dataclass
+class ParamMapping:
+ """
+ A class to handle parameter mapping for model weight loading.
+ It creates a bidirectional mapping between packed parameters and their
+ constituent parts.
+ """
+ packed_mapping: Dict[str, List[str]]
+ inverse_packed_mapping: Dict[str, Tuple[str,
+ int]] = field(default_factory=dict)
+
+ def __post_init__(self):
+ for packed_name, sub_params in self.packed_mapping.items():
+ # Skip self-contained cases (e.g., {"W_pack": ["W_pack"]})
+ if len(sub_params) == 1 and sub_params[0] == packed_name:
+ continue
+ for index, param_name in enumerate(sub_params):
+ self.inverse_packed_mapping[param_name] = (
+ packed_name,
+ index,
+ )
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index 5e68b7f165bf4..a923ed36a9db2 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -350,13 +350,6 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
embedding_modules = {}
embedding_padding_modules = []
- # BitandBytes specific attributes
- bitsandbytes_stacked_params_mapping = {
- # shard_name, weight_name, index
- "gate_proj": ("gate_up_proj", 0),
- "up_proj": ("gate_up_proj", 1),
- }
-
def __init__(
self,
*,
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index 8324a563edd64..ad15f835b1609 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -430,14 +430,6 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
"lm_head": "output_embeddings",
}
embedding_padding_modules = ["lm_head"]
- bitsandbytes_stacked_params_mapping = {
- # shard_name, weight_name, index
- "q_proj": ("qkv_proj", 0),
- "k_proj": ("qkv_proj", 1),
- "v_proj": ("qkv_proj", 2),
- "c_fc_0": ("gate_up_proj", 0),
- "c_fc_1": ("gate_up_proj", 1),
- }
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 8660cf79b9cdb..c503a368e8244 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -409,9 +409,9 @@ def forward(
class FalconForCausalLM(nn.Module, SupportsPP):
-
- # BitandBytes specific attributes
- bitsandbytes_stacked_params_mapping = {}
+ packed_modules_mapping = {
+ "query_key_value": ["query_key_value"],
+ }
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index b28715c48adfb..6de0c866bc2f0 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -349,15 +349,6 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
"gate_up_proj",
"down_proj",
]
- # BitandBytes specific attributes
- bitsandbytes_stacked_params_mapping = {
- # shard_name, weight_name, index
- "q_proj": ("qkv_proj", 0),
- "k_proj": ("qkv_proj", 1),
- "v_proj": ("qkv_proj", 2),
- "gate_proj": ("gate_up_proj", 0),
- "up_proj": ("gate_up_proj", 1),
- }
# Gemma does not apply LoRA to the embedding layer.
embedding_modules = {}
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index f4530e4771960..698b9a5b6b1d6 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -399,16 +399,6 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
embedding_modules = {}
embedding_padding_modules = []
- # BitandBytes specific attributes
- bitsandbytes_stacked_params_mapping = {
- # shard_name, weight_name, index
- "q_proj": ("qkv_proj", 0),
- "k_proj": ("qkv_proj", 1),
- "v_proj": ("qkv_proj", 2),
- "gate_proj": ("gate_up_proj", 0),
- "up_proj": ("gate_up_proj", 1),
- }
-
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index a91ed4158a73f..3e95926fd1e22 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -362,14 +362,6 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
"lm_head": "output_embeddings",
}
embedding_padding_modules = ["lm_head"]
- bitsandbytes_stacked_params_mapping = {
- # shard_name, weight_name, index
- "q_proj": ("qkv_proj", 0),
- "k_proj": ("qkv_proj", 1),
- "v_proj": ("qkv_proj", 2),
- "gate_proj": ("gate_up_proj", 0),
- "up_proj": ("gate_up_proj", 1),
- }
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index 17e772e7faa32..d16a77f862d98 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -662,16 +662,6 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
"down_proj",
]
- # BitandBytes specific attributes
- bitsandbytes_stacked_params_mapping = {
- # shard_name, weight_name, index
- "q_proj": ("qkv_proj", 0),
- "k_proj": ("qkv_proj", 1),
- "v_proj": ("qkv_proj", 2),
- "gate_proj": ("gate_up_proj", 0),
- "up_proj": ("gate_up_proj", 1),
- }
-
embedding_modules = {}
embedding_padding_modules = []
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 17b0fbb777e8e..16fa7acf54fdc 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -478,16 +478,6 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
}
embedding_padding_modules = ["lm_head"]
- # BitandBytes specific attributes
- bitsandbytes_stacked_params_mapping = {
- # shard_name, weight_name, index
- "q_proj": ("qkv_proj", 0),
- "k_proj": ("qkv_proj", 1),
- "v_proj": ("qkv_proj", 2),
- "gate_proj": ("gate_up_proj", 0),
- "up_proj": ("gate_up_proj", 1),
- }
-
# Mistral/Llama models can also be loaded with --load-format mistral
# from consolidated.safetensors checkpoints
mistral_mapping = {
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index bb3db60c7d8ed..722fff98d5c19 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -463,14 +463,10 @@ def init_vision_tower_for_llava(
info=_build_llava_or_pixtral_hf_info,
dummy_inputs=LlavaDummyInputsBuilder)
class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
- # BitandBytes specific attributes
- bitsandbytes_stacked_params_mapping = {
- # shard_name, weight_name, index
- "q_proj": ("qkv_proj", 0),
- "k_proj": ("qkv_proj", 1),
- "v_proj": ("qkv_proj", 2),
- "gate_proj": ("gate_up_proj", 0),
- "up_proj": ("gate_up_proj", 1),
+
+ packed_modules_mapping = {
+ "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+ "gate_up_proj": ["gate_proj", "up_proj"]
}
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 5a0f202364f26..6254d26c7060d 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -534,16 +534,6 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
}
embedding_padding_modules = ["lm_head"]
- # BitandBytes specific attributes
- bitsandbytes_stacked_params_mapping = {
- # shard_name, weight_name, index
- "q_proj": ("qkv_proj", 0),
- "k_proj": ("qkv_proj", 1),
- "v_proj": ("qkv_proj", 2),
- "gate_proj": ("gate_up_proj", 0),
- "up_proj": ("gate_up_proj", 1),
- }
-
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py
index e9d7eada1d16c..5e1e6c6fa6141 100644
--- a/vllm/model_executor/models/minicpm3.py
+++ b/vllm/model_executor/models/minicpm3.py
@@ -241,11 +241,5 @@ class MiniCPM3ForCausalLM(MiniCPMForCausalLM):
# `embedding_modules` and `embedding_padding_modules`
# are inherited from MiniCPMForCausalLM
- bitsandbytes_stacked_params_mapping = {
- # shard_name, weight_name, index
- "gate_proj": ("gate_up_proj", 0),
- "up_proj": ("gate_up_proj", 1),
- }
-
def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""):
return MiniCPM3Model(vllm_config=vllm_config, prefix=prefix)
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index ff7dab89e4da8..1aa529056893b 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -761,16 +761,6 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA):
"kv_proj",
]
- # BitandBytes specific attributes
- bitsandbytes_stacked_params_mapping = {
- # shard_name, weight_name, index
- "q_proj": ("qkv_proj", 0),
- "k_proj": ("qkv_proj", 1),
- "v_proj": ("qkv_proj", 2),
- "gate_proj": ("gate_up_proj", 0),
- "up_proj": ("gate_up_proj", 1),
- }
-
embedding_modules = {}
embedding_padding_modules = []
@@ -881,16 +871,6 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA):
"kv_proj",
]
- # BitandBytes specific attributes
- bitsandbytes_stacked_params_mapping = {
- # shard_name, weight_name, index
- "q_proj": ("qkv_proj", 0),
- "k_proj": ("qkv_proj", 1),
- "v_proj": ("qkv_proj", 2),
- "gate_proj": ("gate_up_proj", 0),
- "up_proj": ("gate_up_proj", 1),
- }
-
embedding_modules = {}
embedding_padding_modules = []
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index 593a4d3fb6940..b2368ffff5412 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -1107,14 +1107,9 @@ def forward(
@INPUT_REGISTRY.register_dummy_encoder_data(dummy_encoder_data_for_mllama)
@INPUT_REGISTRY.register_input_processor(input_processor_for_mllama)
class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal):
- # BitandBytes specific attributes
- bitsandbytes_stacked_params_mapping = {
- # shard_name, weight_name, index
- "q_proj": ("qkv_proj", 0),
- "k_proj": ("qkv_proj", 1),
- "v_proj": ("qkv_proj", 2),
- "gate_proj": ("gate_up_proj", 0),
- "up_proj": ("gate_up_proj", 1),
+ packed_modules_mapping = {
+ "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+ "gate_up_proj": ["gate_proj", "up_proj"]
}
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index c45ee9b921c9e..a2fd1701316f2 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -1193,12 +1193,6 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
embedding_modules = {}
embedding_padding_modules = []
- # BitandBytes specific attributes
- bitsandbytes_stacked_params_mapping = {
- "gate_proj": ("merged_linear", 0),
- "up_proj": ("merged_linear", 1),
- }
-
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index 34cb9981c167b..8cc62d5c803cc 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -395,12 +395,6 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
"lm_head": "output_embeddings",
}
embedding_padding_modules = ["lm_head"]
- bitsandbytes_stacked_params_mapping = {
- # shard_name, weight_name, index
- "q_proj": ("qkv_proj", 0),
- "k_proj": ("qkv_proj", 1),
- "v_proj": ("qkv_proj", 2),
- }
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index 7edafcd20b5db..ea1185aa80dc6 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -329,13 +329,9 @@ def forward(
class OPTForCausalLM(nn.Module, SupportsPP):
-
- # BitandBytes specific attributes
- bitsandbytes_stacked_params_mapping = {
- # shard_name, weight_name, index
- "q_proj": ("qkv_proj", 0),
- "k_proj": ("qkv_proj", 1),
- "v_proj": ("qkv_proj", 2),
+ packed_modules_mapping = {
+ "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+ "gate_up_proj": ["gate_proj", "up_proj"]
}
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index f9e972688ddd1..59b7508a370f8 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -279,14 +279,6 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
"fc2",
]
- # BitandBytes specific attributes
- bitsandbytes_stacked_params_mapping = {
- # shard_name, weight_name, index
- "q_proj": ("qkv_proj", 0),
- "k_proj": ("qkv_proj", 1),
- "v_proj": ("qkv_proj", 2),
- }
-
embedding_modules = {}
embedding_padding_modules = []
diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py
index 937858ee3b8c2..34141511ea791 100644
--- a/vllm/model_executor/models/phi3.py
+++ b/vllm/model_executor/models/phi3.py
@@ -14,7 +14,3 @@ class Phi3ForCausalLM(LlamaForCausalLM):
"gate_up_proj",
],
}
-
- # BitandBytes specific attributes
- # Initialize an empty dict when there is no stacked parameter mapping.
- bitsandbytes_stacked_params_mapping = {}
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index baf955f6b515d..1345b381f0a99 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -1028,13 +1028,6 @@ class QWenLLM(QWenBaseModel):
embedding_modules = {}
embedding_padding_modules = []
- # BitandBytes specific attributes
- bitsandbytes_stacked_params_mapping = {
- # shard_name, weight_name, index
- "w2": ("gate_up_proj", 0),
- "w1": ("gate_up_proj", 1),
- }
-
class QWenVL(QWenBaseModel, SupportsMultiModal):
packed_modules_mapping = {
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index d20fb150f7e39..0a99c87470850 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -418,16 +418,6 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
embedding_modules = {}
embedding_padding_modules = []
- # BitandBytes specific attributes
- bitsandbytes_stacked_params_mapping = {
- # shard_name, weight_name, index
- "q_proj": ("qkv_proj", 0),
- "k_proj": ("qkv_proj", 1),
- "v_proj": ("qkv_proj", 2),
- "gate_proj": ("gate_up_proj", 0),
- "up_proj": ("gate_up_proj", 1),
- }
-
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 76a810e8f0c20..d00e5d362c8bc 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1038,16 +1038,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
embedding_modules = {}
embedding_padding_modules = []
- # BitandBytes specific attributes
- bitsandbytes_stacked_params_mapping = {
- # shard_name, weight_name, index
- "q_proj": ("qkv_proj", 0),
- "k_proj": ("qkv_proj", 1),
- "v_proj": ("qkv_proj", 2),
- "gate_proj": ("gate_up_proj", 0),
- "up_proj": ("gate_up_proj", 1),
- }
-
# To ensure correct weight loading and mapping.
hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
"lm_head.": "language_model.lm_head.",
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index a7cf65a0e36e4..e83d316f74de2 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -401,14 +401,6 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
"lm_head": "output_embeddings",
}
embedding_padding_modules = ["lm_head"]
- bitsandbytes_stacked_params_mapping = {
- # shard_name, weight_name, index
- "q_proj": ("qkv_proj", 0),
- "k_proj": ("qkv_proj", 1),
- "v_proj": ("qkv_proj", 2),
- "gate_proj": ("gate_up_proj", 0),
- "up_proj": ("gate_up_proj", 1),
- }
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
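
Editor's note: the new `ParamMapping` dataclass (see the `model_loader/utils.py` hunk in this patch) derives the shard-name to `(packed_name, index)` mapping that every model previously spelled out by hand as `bitsandbytes_stacked_params_mapping`. A self-contained usage sketch, with the dataclass copied from the diff and example mappings mirroring the Llava/Mllama entries above:

```python
from dataclasses import dataclass, field
from typing import Dict, List, Tuple

@dataclass
class ParamMapping:
    """Bidirectional mapping between packed parameters and their parts."""
    packed_mapping: Dict[str, List[str]]
    inverse_packed_mapping: Dict[str, Tuple[str, int]] = field(default_factory=dict)

    def __post_init__(self):
        for packed_name, sub_params in self.packed_mapping.items():
            # Skip self-contained cases (e.g., {"W_pack": ["W_pack"]})
            if len(sub_params) == 1 and sub_params[0] == packed_name:
                continue
            for index, param_name in enumerate(sub_params):
                self.inverse_packed_mapping[param_name] = (packed_name, index)

mapping = ParamMapping({
    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
    "gate_up_proj": ["gate_proj", "up_proj"],
    "W_pack": ["W_pack"],  # self-contained: not inverted
})
print(mapping.inverse_packed_mapping["k_proj"])    # ('qkv_proj', 1)
print(mapping.inverse_packed_mapping["up_proj"])   # ('gate_up_proj', 1)
print("W_pack" in mapping.inverse_packed_mapping)  # False
```
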
From 42f5e7c52a5852e20937001332572c8cb8115af0 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Wed, 15 Jan 2025 10:29:53 +0800
Subject: [PATCH 13/67] [Kernel] Support MulAndSilu (#11624)
Signed-off-by: Jee Jee Li
---
csrc/activation_kernels.cu | 32 +++++++++++++++++-----
csrc/ops.h | 2 ++
csrc/torch_bindings.cpp | 3 ++
tests/kernels/test_activation.py | 20 +++++++++-----
vllm/model_executor/layers/activation.py | 35 ++++++++++++++++++++++++
vllm/model_executor/models/molmo.py | 14 ++--------
vllm/model_executor/models/ultravox.py | 13 ++-------
7 files changed, 83 insertions(+), 36 deletions(-)
diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu
index 839dc36ba4e29..88275dbdd83a1 100644
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -9,8 +9,16 @@
namespace vllm {
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&), bool act_first>
+__device__ __forceinline__ scalar_t compute(const scalar_t& x,
+ const scalar_t& y) {
+ return act_first ? ACT_FN(x) * y : x * ACT_FN(y);
+}
// Activation and gating kernel template.
-template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
+
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&), bool act_first>
__global__ void act_and_mul_kernel(
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., 2, d]
@@ -19,7 +27,7 @@ __global__ void act_and_mul_kernel(
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
- out[token_idx * d + idx] = ACT_FN(x) * y;
+ out[token_idx * d + idx] = compute<scalar_t, ACT_FN, act_first>(x, y);
}
}
@@ -55,7 +63,9 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
} // namespace vllm
// Launch activation and gating kernel.
-#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \
+// Use ACT_FIRST (bool) indicating whether to apply the activation function
+// first.
+#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, ACT_FIRST) \
int d = input.size(-1) / 2; \
int64_t num_tokens = input.numel() / input.size(-1); \
dim3 grid(num_tokens); \
@@ -64,7 +74,7 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
VLLM_DISPATCH_FLOATING_TYPES( \
input.scalar_type(), "act_and_mul_kernel", [&] { \
- vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>> \
+ vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>, ACT_FIRST> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d); \
});
@@ -72,19 +82,27 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
void silu_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
- LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
+ LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, true);
+}
+
+void mul_and_silu(torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., 2 * d]
+{
+ // The difference between mul_and_silu and silu_and_mul is that mul_and_silu
+ // applies the silu to the latter half of the input.
+ LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, false);
}
void gelu_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
- LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
+ LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel, true);
}
void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
- LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);
+ LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel, true);
}
namespace vllm {
diff --git a/csrc/ops.h b/csrc/ops.h
index 9efd9b0c24700..5a194a0dd3654 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -86,6 +86,8 @@ void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
void silu_and_mul(torch::Tensor& out, torch::Tensor& input);
+void mul_and_silu(torch::Tensor& out, torch::Tensor& input);
+
void gelu_and_mul(torch::Tensor& out, torch::Tensor& input);
void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input);
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 956258c1001d3..fb53d122487d3 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -55,6 +55,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
ops.impl("silu_and_mul", torch::kCUDA, &silu_and_mul);
+ ops.def("mul_and_silu(Tensor! out, Tensor input) -> ()");
+ ops.impl("mul_and_silu", torch::kCUDA, &mul_and_silu);
+
// Activation function used in GeGLU with `none` approximation.
ops.def("gelu_and_mul(Tensor! out, Tensor input) -> ()");
ops.impl("gelu_and_mul", torch::kCUDA, &gelu_and_mul);
diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py
index a84501f9c303f..dac26efe866b8 100644
--- a/tests/kernels/test_activation.py
+++ b/tests/kernels/test_activation.py
@@ -6,8 +6,9 @@
from tests.kernels.utils import opcheck
from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul,
- GeluAndMul, NewGELU,
- QuickGELU, SiluAndMul)
+ GeluAndMul, MulAndSilu,
+ NewGELU, QuickGELU,
+ SiluAndMul)
from vllm.platforms import current_platform
from .allclose_default import get_default_atol, get_default_rtol
@@ -21,8 +22,9 @@
]
-@pytest.mark.parametrize("activation",
- ["silu", "gelu", "gelu_tanh", "fatrelu"])
+@pytest.mark.parametrize(
+ "activation",
+ ["silu_and_mul", "mul_and_silu", "gelu", "gelu_tanh", "fatrelu"])
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@@ -40,9 +42,12 @@ def test_act_and_mul(
current_platform.seed_everything(seed)
torch.set_default_device(device)
x = torch.randn(num_tokens, 2 * d, dtype=dtype)
- if activation == "silu":
+ if activation == "silu_and_mul":
layer = SiluAndMul()
fn = torch.ops._C.silu_and_mul
+ if activation == "mul_and_silu":
+ layer = MulAndSilu()
+ fn = torch.ops._C.mul_and_silu
elif activation == "gelu":
layer = GeluAndMul(approximate="none")
fn = torch.ops._C.gelu_and_mul
@@ -55,8 +60,9 @@ def test_act_and_mul(
fn = torch.ops._C.fatrelu_and_mul
out = layer(x)
ref_out = layer.forward_native(x)
- # The SiLU, GELU and FatReLU implementations are equivalent to the native
- # PyTorch implementations, so we can do exact comparison.
+ # The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are
+ # equivalent to the native PyTorch implementations, so we can do exact
+ # comparison.
torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)
d = x.shape[-1] // 2
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index 2475190d197d3..af7894b42c560 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -87,6 +87,41 @@ def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
return out
+@CustomOp.register("mul_and_silu")
+class MulAndSilu(CustomOp):
+ """An activation function for SwiGLU.
+
+ The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2.
+
+ Shapes:
+ x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
+ return: (num_tokens, d) or (batch_size, seq_len, d)
+ """
+
+ def __init__(self):
+ super().__init__()
+ if current_platform.is_cuda_alike() or current_platform.is_cpu():
+ self.op = torch.ops._C.mul_and_silu
+ elif current_platform.is_xpu():
+ from vllm._ipex_ops import ipex_ops
+ self.op = ipex_ops.silu_and_mul
+
+ def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+ """PyTorch-native implementation equivalent to forward()."""
+ d = x.shape[-1] // 2
+ return x[..., :d] * F.silu(x[..., d:])
+
+ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+ d = x.shape[-1] // 2
+ output_shape = (x.shape[:-1] + (d, ))
+ out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+ self.op(out, x)
+ return out
+
+ # TODO implement forward_xpu for MulAndSilu
+ # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+
+
@CustomOp.register("gelu_and_mul")
class GeluAndMul(CustomOp):
"""An activation function for GeGLU.
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index a2fd1701316f2..5c7ae0deefcd8 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -23,7 +23,8 @@
from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
InputContext, token_inputs)
from vllm.model_executor import SamplingMetadata
-from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul
+from vllm.model_executor.layers.activation import (MulAndSilu, QuickGELU,
+ SiluAndMul)
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
MergedColumnParallelLinear,
@@ -462,15 +463,6 @@ def forward(
return output
-class SwiGLU(nn.Module):
-
- def forward(self, x: torch.Tensor) -> torch.Tensor:
- x, gate = x.chunk(2, dim=-1)
- # Note that the order is reversed compared to
- # SiluAndMul.
- return x * F.silu(gate)
-
-
class LanuageModelMLP(nn.Module):
"""Molmo's LLM mlp."""
@@ -489,7 +481,7 @@ def __init__(self,
quant_config=quant_config,
)
# Activation function.
- self.act_fn = SwiGLU()
+ self.act_fn = MulAndSilu()
# Feed-forward output projection.
self.down_proj = RowParallelLinear(
self.intermediate_size,
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 3edfb5107683a..587f18ccaf98f 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -16,7 +16,7 @@
from vllm import envs
from vllm.attention import AttentionMetadata
from vllm.config import VllmConfig
-from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
+from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.model_loader.loader import DefaultModelLoader
@@ -248,15 +248,6 @@ def forward(self, audio_embeds: torch.Tensor) -> torch.Tensor:
return audio_embeds
-class FlippedSiluAndMul(SiluAndMul):
- """Ultravox is trained with SwiGLU with flipped halves."""
-
- def forward(self, x: torch.Tensor):
- a, b = x.chunk(2, dim=-1)
- flipped = torch.cat((b, a), dim=-1)
- return super().forward(flipped)
-
-
class UltravoxProjector(nn.Module):
def __init__(self, config: UltravoxConfig):
@@ -269,7 +260,7 @@ def __init__(self, config: UltravoxConfig):
dim = self.hidden_dim
if config.projector_act == "swiglu":
- self.act = FlippedSiluAndMul()
+ self.act = MulAndSilu()
dim = dim // 2
else:
self.act = get_act_fn(config.projector_act)
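
Editor's note: `MulAndSilu` standardizes the "flipped" SwiGLU previously implemented ad hoc as Molmo's `SwiGLU` and Ultravox's `FlippedSiluAndMul`: `SiluAndMul` computes `silu(x[..., :d]) * x[..., d:]`, while `MulAndSilu` computes `x[..., :d] * silu(x[..., d:])`. A PyTorch-native sketch matching the `forward_native` implementations (no custom CUDA op involved):

```python
import torch
import torch.nn.functional as F

def silu_and_mul_native(x: torch.Tensor) -> torch.Tensor:
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

def mul_and_silu_native(x: torch.Tensor) -> torch.Tensor:
    d = x.shape[-1] // 2
    return x[..., :d] * F.silu(x[..., d:])

x = torch.randn(4, 2 * 8)
a, b = x.chunk(2, dim=-1)

# MulAndSilu is exactly the old Molmo SwiGLU / Ultravox FlippedSiluAndMul:
assert torch.equal(mul_and_silu_native(x), a * F.silu(b))
# ...and equals SiluAndMul applied to the halves swapped:
assert torch.equal(silu_and_mul_native(torch.cat((b, a), dim=-1)), a * F.silu(b))
```
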
From 1a51b9f87226b2290c78c65c1de0f585d31f17ce Mon Sep 17 00:00:00 2001
From: Konrad Zawora
Date: Wed, 15 Jan 2025 03:59:18 +0100
Subject: [PATCH 14/67] [HPU][Bugfix] Don't use /dev/accel/accel0 for HPU
autodetection in setup.py (#12046)
Signed-off-by: Konrad Zawora
---
setup.py | 19 ++++++++++++-------
1 file changed, 12 insertions(+), 7 deletions(-)
diff --git a/setup.py b/setup.py
index b6c1f5bc8ac3f..7dfcec7f9f0c5 100644
--- a/setup.py
+++ b/setup.py
@@ -324,21 +324,26 @@ def run(self) -> None:
def _is_hpu() -> bool:
- is_hpu_available = True
+ # if VLLM_TARGET_DEVICE env var was set explicitly, skip HPU autodetection
+ if os.getenv("VLLM_TARGET_DEVICE", None) == VLLM_TARGET_DEVICE:
+ return VLLM_TARGET_DEVICE == "hpu"
+
+ # if VLLM_TARGET_DEVICE was not set explicitly, check if hl-smi succeeds,
+ # and if it doesn't, check if habanalabs driver is loaded
+ is_hpu_available = False
try:
- subprocess.run(["hl-smi"], capture_output=True, check=True)
+ out = subprocess.run(["hl-smi"], capture_output=True, check=True)
+ is_hpu_available = out.returncode == 0
except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
- if not os.path.exists('/dev/accel/accel0') and not os.path.exists(
- '/dev/accel/accel_controlD0'):
- # last resort...
+ if sys.platform.startswith("linux"):
try:
output = subprocess.check_output(
'lsmod | grep habanalabs | wc -l', shell=True)
is_hpu_available = int(output) > 0
except (ValueError, FileNotFoundError, PermissionError,
subprocess.CalledProcessError):
- is_hpu_available = False
- return is_hpu_available or VLLM_TARGET_DEVICE == "hpu"
+ pass
+ return is_hpu_available
def _no_device() -> bool:
From 9ddac56311b28f08e40a941296eb66fbb1be0a7a Mon Sep 17 00:00:00 2001
From: Shanshan Shen <467638484@qq.com>
Date: Wed, 15 Jan 2025 11:38:25 +0800
Subject: [PATCH 15/67] [Platform] move current_memory_usage() into platform
(#11369)
Signed-off-by: Shanshan Shen <467638484@qq.com>
---
vllm/platforms/cuda.py | 7 +++++++
vllm/platforms/interface.py | 9 +++++++++
vllm/platforms/rocm.py | 7 +++++++
vllm/platforms/xpu.py | 7 +++++++
vllm/utils.py | 8 +-------
5 files changed, 31 insertions(+), 7 deletions(-)
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 80cefcb492531..2587e3a11dde3 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -143,6 +143,13 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
if cache_config and cache_config.block_size is None:
cache_config.block_size = 16
+ @classmethod
+ def get_current_memory_usage(cls,
+ device: Optional[torch.types.Device] = None
+ ) -> float:
+ torch.cuda.reset_peak_memory_stats(device)
+ return torch.cuda.max_memory_allocated(device)
+
@classmethod
def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
kv_cache_dtype, block_size, use_v1) -> str:
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 85fde76796901..f2ecec3203fb7 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -277,6 +277,15 @@ def is_pin_memory_available(cls) -> bool:
return False
return True
+ @classmethod
+ def get_current_memory_usage(cls,
+ device: Optional[torch.types.Device] = None
+ ) -> float:
+ """
+ Return the memory usage in bytes.
+ """
+ raise NotImplementedError
+
@classmethod
def get_punica_wrapper(cls) -> str:
"""
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 43105d7855e79..67a9e816cb658 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -157,3 +157,10 @@ def verify_quantization(cls, quant: str) -> None:
@classmethod
def get_punica_wrapper(cls) -> str:
return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"
+
+ @classmethod
+ def get_current_memory_usage(cls,
+ device: Optional[torch.types.Device] = None
+ ) -> float:
+ torch.cuda.reset_peak_memory_stats(device)
+ return torch.cuda.max_memory_allocated(device)
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index f34376b44e689..031abdc05d517 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -94,3 +94,10 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
def is_pin_memory_available(cls):
logger.warning("Pin memory is not supported on XPU.")
return False
+
+ @classmethod
+ def get_current_memory_usage(cls,
+ device: Optional[torch.types.Device] = None
+ ) -> float:
+ torch.xpu.reset_peak_memory_stats(device)
+ return torch.xpu.max_memory_allocated(device)
diff --git a/vllm/utils.py b/vllm/utils.py
index 9a509da3c1ef1..7477e7028f5ef 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -710,13 +710,7 @@ def __init__(self, device: Optional[torch.types.Device] = None):
def current_memory_usage(self) -> float:
# Return the memory usage in bytes.
from vllm.platforms import current_platform
- if current_platform.is_cuda_alike():
- torch.cuda.reset_peak_memory_stats(self.device)
- mem = torch.cuda.max_memory_allocated(self.device)
- elif current_platform.is_xpu():
- torch.xpu.reset_peak_memory_stats(self.device) # type: ignore
- mem = torch.xpu.max_memory_allocated(self.device) # type: ignore
- return mem
+ return current_platform.get_current_memory_usage(self.device)
def __enter__(self):
self.initial_memory = self.current_memory_usage()
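A hedged sketch of how the new hook is consumed: the utility in vllm/utils.py now just calls current_platform.get_current_memory_usage(), and each platform supplies its own override. DemoPlatform and DemoMemoryProfiler below are stand-ins for illustration, not vLLM classes; the platform method mirrors the CUDA/ROCm overrides above.
from typing import Optional
import torch
class DemoPlatform:
    # Stand-in platform implementing the interface added in this patch.
    @classmethod
    def get_current_memory_usage(cls,
                                 device: Optional[torch.types.Device] = None
                                 ) -> float:
        torch.cuda.reset_peak_memory_stats(device)
        return torch.cuda.max_memory_allocated(device)
class DemoMemoryProfiler:
    # Minimal analogue of the context manager in vllm/utils.py: it no longer
    # branches on the platform, it simply delegates to the platform hook.
    def __init__(self, platform, device=None):
        self.platform, self.device = platform, device
    def __enter__(self):
        self.initial_memory = self.platform.get_current_memory_usage(
            self.device)
        return self
    def __exit__(self, exc_type, exc, tb):
        self.consumed_memory = (
            self.platform.get_current_memory_usage(self.device) -
            self.initial_memory)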
From b7ee940a828de9d339345e28eee8b13d60d97f26 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Tue, 14 Jan 2025 20:21:28 -0800
Subject: [PATCH 16/67] [V1][BugFix] Fix edge case in VLM scheduling (#12065)
Signed-off-by: Woosuk Kwon
---
vllm/v1/core/scheduler.py | 26 +++++++++++++++-----------
1 file changed, 15 insertions(+), 11 deletions(-)
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index f04e529891287..2503d136aea7e 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -373,18 +373,22 @@ def _try_schedule_encoder_inputs(
if self.encoder_cache_manager.has_cache(request, i):
# The encoder input is already computed and cached.
continue
- if not self.encoder_cache_manager.can_allocate(request, i):
- # The encoder cache is full. We can only schedule the decoder
- # tokens just before the encoder input.
- num_new_tokens = start_pos - num_computed_tokens
- break
- if num_encoder_tokens > encoder_budget:
- # The encoder budget is exhausted. We can only schedule the
- # decoder tokens up until the encoder input.
- # NOTE(woosuk): We assume that the encoder tokens should be
- # processed altogether, as the encoder usually uses
+ if (not self.encoder_cache_manager.can_allocate(request, i)
+ or num_encoder_tokens > encoder_budget):
+ # The encoder cache is full or the encoder budget is exhausted.
+ # NOTE(woosuk): We assume that the encoder input tokens should
+ # be processed altogether, as the encoder usually uses
# bidirectional attention.
- num_new_tokens = start_pos - num_computed_tokens
+ if num_computed_tokens < start_pos:
+ # We only schedule the decoder tokens just before the
+ # encoder input.
+ num_new_tokens = start_pos - num_computed_tokens
+ else:
+ # Because of prefix caching, num_computed_tokens is greater
+ # than start_pos even though its encoder input is not
+ # available. In this case, we can't schedule any token for
+ # the request in this step.
+ num_new_tokens = 0
break
encoder_budget -= num_encoder_tokens
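A small sketch of the clamping rule this hunk introduces, with an illustrative helper name: when the encoder cache or budget blocks an encoder input, the decoder may only run up to the placeholder start, and if prefix caching has already advanced past it, nothing is schedulable this step.
def clamp_decoder_tokens(num_computed_tokens: int, start_pos: int,
                         num_new_tokens: int,
                         can_schedule_encoder: bool) -> int:
    if can_schedule_encoder:
        return num_new_tokens
    if num_computed_tokens < start_pos:
        # Decode only up to (not including) the encoder placeholder.
        return start_pos - num_computed_tokens
    # Prefix caching already covers tokens past start_pos, but the encoder
    # output is unavailable, so no tokens can be scheduled this step.
    return 0
assert clamp_decoder_tokens(4, 10, 20, False) == 6
assert clamp_decoder_tokens(12, 10, 20, False) == 0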
From 0794e7446efca1fd7b8ea1cde96777897660cdea Mon Sep 17 00:00:00 2001
From: Elfie Guo <164945471+elfiegg@users.noreply.github.com>
Date: Tue, 14 Jan 2025 20:47:49 -0800
Subject: [PATCH 17/67] [Misc] Add multi-step chunked-prefill support for
FlashInfer (#10467)
---
csrc/prepare_inputs/advance_step.cu | 10 ++
tests/multi_step/test_correctness_llm.py | 17 +-
vllm/attention/backends/flashinfer.py | 29 ++-
vllm/worker/model_runner.py | 220 ++++++++++++-----------
vllm/worker/multi_step_model_runner.py | 2 +-
5 files changed, 169 insertions(+), 109 deletions(-)
diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu
index bd184ee22682e..c3902f4c2a163 100644
--- a/csrc/prepare_inputs/advance_step.cu
+++ b/csrc/prepare_inputs/advance_step.cu
@@ -95,6 +95,16 @@ __global__ void advance_step_flashinfer_kernel(
long* input_positions_ptr, int* seq_lens_ptr, long* slot_mapping_ptr,
int const* block_tables_ptr, int64_t const block_tables_stride,
int* paged_kv_last_page_len_ptr, int* block_table_bound_ptr) {
+ int const n_pad = num_seqs - num_queries;
+ if (n_pad && blockIdx.x == 0) {
+ // Handle cuda graph padding
+ int const offset = num_queries;
+ for (int i = threadIdx.x; i < n_pad; i += blockDim.x) {
+ input_tokens_ptr[offset + i] = 0;
+ input_positions_ptr[offset + i] = 0;
+ slot_mapping_ptr[offset + i] = -1;
+ }
+ }
int num_query_blocks = div_ceil(num_queries, num_threads);
if (blockIdx.x < num_query_blocks) {
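A Python rendering of what the kernel's new padding branch does, offered as a readable sketch rather than the CUDA implementation: entries in [num_queries, num_seqs) are CUDA-graph padding and receive neutral tokens and positions plus a slot of -1 so downstream kernels skip them. The helper name is illustrative.
def pad_cudagraph_tail(input_tokens, input_positions, slot_mapping,
                       num_queries, num_seqs):
    # Mirrors the loop over [num_queries, num_seqs) in the kernel above.
    for i in range(num_queries, num_seqs):
        input_tokens[i] = 0
        input_positions[i] = 0
        slot_mapping[i] = -1
tokens, positions, slots = [7, 8, 9, 9], [3, 4, 5, 5], [11, 12, 13, 13]
pad_cudagraph_tail(tokens, positions, slots, num_queries=3, num_seqs=4)
assert slots == [11, 12, 13, -1]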
diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py
index cc1fd19252019..34030d9d6ac60 100644
--- a/tests/multi_step/test_correctness_llm.py
+++ b/tests/multi_step/test_correctness_llm.py
@@ -5,6 +5,8 @@
import pytest
+from tests.kernels.utils import override_backend_env_variable
+
from ..models.utils import check_logprobs_close, check_outputs_equal
MODELS = [
@@ -19,10 +21,11 @@
@pytest.mark.parametrize("tp_size", [1])
@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
@pytest.mark.parametrize("max_tokens", [5])
-@pytest.mark.parametrize("enforce_eager", [True])
+@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.parametrize("num_logprobs", [None, 5])
+@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN", "FLASHINFER"])
def test_multi_step_llm(
hf_runner,
vllm_runner,
@@ -36,6 +39,8 @@ def test_multi_step_llm(
num_scheduler_steps: int,
num_prompts: int,
num_logprobs: Optional[int],
+ attention_backend: str,
+ monkeypatch,
) -> None:
"""Test vLLM engine with multi-step scheduling via sync LLM Engine.
@@ -63,6 +68,7 @@ def test_multi_step_llm(
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
completions endpoint; `None` -> 1 logprob returned.
"""
+ override_backend_env_variable(monkeypatch, attention_backend)
prompts = example_prompts
if len(prompts) < num_prompts:
@@ -114,6 +120,7 @@ def test_multi_step_llm(
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.parametrize("num_logprobs,num_prompt_logprobs", [(5, 5)])
+@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"])
def test_multi_step_llm_w_prompt_logprobs(
vllm_runner,
example_prompts,
@@ -126,6 +133,8 @@ def test_multi_step_llm_w_prompt_logprobs(
num_prompts: int,
num_logprobs: Optional[int],
num_prompt_logprobs: Optional[int],
+ attention_backend: str,
+ monkeypatch,
) -> None:
"""Test prompt logprobs with multi-step scheduling via sync LLM Engine.
@@ -155,6 +164,7 @@ def test_multi_step_llm_w_prompt_logprobs(
note that this argument is not supported by the
OpenAI completions endpoint.
"""
+ override_backend_env_variable(monkeypatch, attention_backend)
prompts = example_prompts
if len(prompts) < num_prompts:
@@ -205,6 +215,7 @@ def test_multi_step_llm_w_prompt_logprobs(
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.parametrize("num_logprobs", [None, 5])
+@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"])
def test_multi_step_llm_chunked_prefill_prefix_cache(
vllm_runner,
example_prompts,
@@ -216,6 +227,8 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
num_scheduler_steps: int,
num_prompts: int,
num_logprobs: Optional[int],
+ attention_backend: str,
+ monkeypatch,
) -> None:
"""Test vLLM engine with multi-step+"single-step chunked prefill"+APC.
@@ -278,6 +291,8 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
#
# The Incorrect scheduling behavior - if it occurs - will cause an exception
# in the model runner resulting from `do_sample=False`.
+ override_backend_env_variable(monkeypatch, attention_backend)
+
assert len(example_prompts) >= 2
challenge_prompts = copy.deepcopy(example_prompts)
challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient '
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index a11462b2068a5..6ca75fabdfc38 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -256,7 +256,12 @@ def prepare_graph_input_buffers(self,
def begin_forward(self, model_input):
assert not self._is_graph_capturing
state = self
- if model_input.attn_metadata.use_cuda_graph:
+ use_cuda_graph = model_input.attn_metadata.use_cuda_graph
+ is_decode = model_input.attn_metadata.num_prefills == 0
+ # In case of multistep chunked-prefill, there might be prefill requests
+ # scheduled while CUDA graph mode is enabled. We don't run graph in that
+ # case.
+ if use_cuda_graph and is_decode:
batch_size = model_input.input_tokens.shape[0]
state = (self.runner.graph_runners[model_input.virtual_engine]
[batch_size].attn_state)
@@ -429,10 +434,24 @@ def advance_step(self,
Update metadata in-place to advance one decode step.
"""
- assert not turn_prefills_into_decodes, \
- ("Chunked prefill is not supported with flashinfer yet."
- "turn_prefills_into_decodes is a Multi-Step + Chunked-Prefill "
- "specific parameter.")
+ if turn_prefills_into_decodes:
+ # When Multi-Step is enabled with Chunked-Prefill, prefills and
+ # decodes are scheduled together. In the first step, all the
+ # prefills turn into decodes. This update reflects that
+ # conversion.
+ assert self.num_decode_tokens + self.num_prefills == num_seqs
+ # Flashinfer doesn't support speculative decoding + chunked-prefill
+ # + multi-step scheduling yet.
+ assert self.decode_query_len == 1
+ self.num_decode_tokens += self.num_prefills
+ self.num_prefills = 0
+ self.num_prefill_tokens = 0
+ self.max_prefill_seq_len = 0
+ self.max_query_len = 1
+
+ self.slot_mapping = self.slot_mapping[:num_seqs]
+ else:
+ assert self.seq_lens_tensor is not None
assert num_seqs > 0
assert num_queries > 0
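A hedged, pared-down sketch of the metadata transition that advance_step now performs for FlashInfer when turn_prefills_into_decodes is set; DemoAttnMetadata is a stand-in holding only the fields touched above.
from dataclasses import dataclass
from typing import List
@dataclass
class DemoAttnMetadata:
    num_prefills: int
    num_prefill_tokens: int
    num_decode_tokens: int
    max_prefill_seq_len: int
    max_query_len: int
    decode_query_len: int
    slot_mapping: List[int]
def turn_prefills_into_decodes(meta: DemoAttnMetadata, num_seqs: int) -> None:
    # On the first multi-step iteration with chunked prefill, every scheduled
    # prefill becomes a decode; reflect that in the counters and trim padding.
    assert meta.num_decode_tokens + meta.num_prefills == num_seqs
    assert meta.decode_query_len == 1
    meta.num_decode_tokens += meta.num_prefills
    meta.num_prefills = 0
    meta.num_prefill_tokens = 0
    meta.max_prefill_seq_len = 0
    meta.max_query_len = 1
    meta.slot_mapping = meta.slot_mapping[:num_seqs]
meta = DemoAttnMetadata(num_prefills=2, num_prefill_tokens=64,
                        num_decode_tokens=3, max_prefill_seq_len=32,
                        max_query_len=32, decode_query_len=1,
                        slot_mapping=list(range(8)))
turn_prefills_into_decodes(meta, num_seqs=5)
assert meta.num_prefills == 0 and meta.num_decode_tokens == 5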
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 2b918483d3675..ae8b7f97c827d 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -5,6 +5,7 @@
import time
import warnings
import weakref
+from contextlib import contextmanager
from dataclasses import dataclass
from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set,
Tuple, Type, TypeVar, Union)
@@ -1028,6 +1029,8 @@ def __init__(
self.has_inner_state = model_config.has_inner_state
+ self.in_profile_run = False
+
# When using CUDA graph, the input block tables must be padded to
# max_seq_len_to_capture. However, creating the block table in
# Python can be expensive. To optimize this, we cache the block table
@@ -1228,110 +1231,123 @@ def _prepare_model_input_tensors(
return builder.build() # type: ignore
+ @contextmanager
+ def set_in_profile_run(self):
+ self.in_profile_run = True
+ try:
+ yield
+ finally:
+ self.in_profile_run = False
+
@torch.inference_mode()
def profile_run(self) -> None:
- # Enable top-k sampling to reflect the accurate memory usage.
- sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
- max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
- max_num_seqs = self.scheduler_config.max_num_seqs
- # This represents the maximum number of different requests
- # that will have unique loras, an therefore the max amount of memory
- # consumption create dummy lora request copies from the lora request
- # passed in, which contains a lora from the lora warmup path.
- dummy_lora_requests: List[LoRARequest] = []
- dummy_lora_requests_per_seq: List[LoRARequest] = []
- if self.lora_config:
- assert self.lora_manager is not None
- with self.lora_manager.dummy_lora_cache():
- for idx in range(self.lora_config.max_loras):
- lora_id = idx + 1
- dummy_lora_request = LoRARequest(
- lora_name=f"warmup_{lora_id}",
- lora_int_id=lora_id,
- lora_path="/not/a/real/path",
- )
- self.lora_manager.add_dummy_lora(dummy_lora_request,
- rank=LORA_WARMUP_RANK)
- dummy_lora_requests.append(dummy_lora_request)
- dummy_lora_requests_per_seq = [
- dummy_lora_requests[idx % len(dummy_lora_requests)]
- for idx in range(max_num_seqs)
- ]
-
- # Profile memory usage with max_num_sequences sequences and the total
- # number of tokens equal to max_num_batched_tokens.
- seqs: List[SequenceGroupMetadata] = []
- # Additional GPU memory may be needed for multi-modal encoding, which
- # needs to be accounted for when calculating the GPU blocks for
- # vLLM blocker manager.
- # To exercise the worst scenario for GPU memory consumption,
- # the number of seqs (batch_size) is chosen to maximize the number
- # of images processed.
-
- max_mm_tokens = self.mm_registry.get_max_multimodal_tokens(
- self.model_config)
- if max_mm_tokens > 0:
- max_num_seqs_orig = max_num_seqs
- max_num_seqs = min(max_num_seqs,
- max_num_batched_tokens // max_mm_tokens)
- if max_num_seqs < 1:
- expr = (f"min({max_num_seqs_orig}, "
- f"{max_num_batched_tokens} // {max_mm_tokens})")
- logger.warning(
- "Computed max_num_seqs (%s) to be less than 1. "
- "Setting it to the minimum value of 1.", expr)
- max_num_seqs = 1
-
- batch_size = 0
- for group_id in range(max_num_seqs):
- seq_len = (max_num_batched_tokens // max_num_seqs +
- (group_id < max_num_batched_tokens % max_num_seqs))
- batch_size += seq_len
-
- dummy_data = self.input_registry \
- .dummy_data_for_profiling(self.model_config,
- seq_len,
- self.mm_registry)
-
- seq = SequenceGroupMetadata(
- request_id=str(group_id),
- is_prompt=True,
- seq_data={group_id: dummy_data.seq_data},
- sampling_params=sampling_params,
- block_tables=None,
- lora_request=dummy_lora_requests_per_seq[group_id]
- if dummy_lora_requests_per_seq else None,
- multi_modal_data=dummy_data.multi_modal_data,
- multi_modal_placeholders=dummy_data.multi_modal_placeholders,
- )
- seqs.append(seq)
-
- # Run the model with the dummy inputs.
- num_layers = self.model_config.get_num_layers(self.parallel_config)
- # use an empty tensor instead of `None`` to force Dynamo to pass
- # it by reference, rather by specializing on the value ``None``.
- # the `dtype` argument does not matter, and we use `float32` as
- # a placeholder (it has wide hardware support).
- # it is important to create tensors inside the loop, rather than
- # multiplying the list, to avoid Dynamo from treating them as
- # tensor aliasing.
- kv_caches = [
- torch.tensor([], dtype=torch.float32, device=self.device)
- for _ in range(num_layers)
- ]
- finished_requests_ids = [seq.request_id for seq in seqs]
- model_input = self.prepare_model_input(
- seqs, finished_requests_ids=finished_requests_ids)
- intermediate_tensors = None
- if not get_pp_group().is_first_rank:
- intermediate_tensors = self.model.make_empty_intermediate_tensors(
- batch_size=batch_size,
- dtype=self.model_config.dtype,
- device=self.device)
-
- self.execute_model(model_input, kv_caches, intermediate_tensors)
- torch.cuda.synchronize()
- return
+ with self.set_in_profile_run():
+ # Enable top-k sampling to reflect the accurate memory usage.
+ sampling_params = \
+ SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
+ max_num_batched_tokens = \
+ self.scheduler_config.max_num_batched_tokens
+ max_num_seqs = self.scheduler_config.max_num_seqs
+ # This represents the maximum number of different requests
+            # that will have unique LoRAs, and therefore the max amount of
+            # memory consumption. Create dummy LoRA request copies from the
+            # LoRA request passed in (it contains a LoRA from the warmup path).
+ dummy_lora_requests: List[LoRARequest] = []
+ dummy_lora_requests_per_seq: List[LoRARequest] = []
+ if self.lora_config:
+ assert self.lora_manager is not None
+ with self.lora_manager.dummy_lora_cache():
+ for idx in range(self.lora_config.max_loras):
+ lora_id = idx + 1
+ dummy_lora_request = LoRARequest(
+ lora_name=f"warmup_{lora_id}",
+ lora_int_id=lora_id,
+ lora_path="/not/a/real/path",
+ )
+ self.lora_manager.add_dummy_lora(dummy_lora_request,
+ rank=LORA_WARMUP_RANK)
+ dummy_lora_requests.append(dummy_lora_request)
+ dummy_lora_requests_per_seq = [
+ dummy_lora_requests[idx % len(dummy_lora_requests)]
+ for idx in range(max_num_seqs)
+ ]
+
+ # Profile memory usage with max_num_sequences sequences and the
+ # total number of tokens equal to max_num_batched_tokens.
+ seqs: List[SequenceGroupMetadata] = []
+ # Additional GPU memory may be needed for multi-modal encoding,
+ # which needs to be accounted for when calculating the GPU blocks
+            # for the vLLM block manager.
+ # To exercise the worst scenario for GPU memory consumption,
+ # the number of seqs (batch_size) is chosen to maximize the number
+ # of images processed.
+
+ max_mm_tokens = self.mm_registry.get_max_multimodal_tokens(
+ self.model_config)
+ if max_mm_tokens > 0:
+ max_num_seqs_orig = max_num_seqs
+ max_num_seqs = min(max_num_seqs,
+ max_num_batched_tokens // max_mm_tokens)
+ if max_num_seqs < 1:
+ expr = (f"min({max_num_seqs_orig}, "
+ f"{max_num_batched_tokens} // {max_mm_tokens})")
+ logger.warning(
+ "Computed max_num_seqs (%s) to be less than 1. "
+ "Setting it to the minimum value of 1.", expr)
+ max_num_seqs = 1
+
+ batch_size = 0
+ for group_id in range(max_num_seqs):
+ seq_len = (max_num_batched_tokens // max_num_seqs +
+ (group_id < max_num_batched_tokens % max_num_seqs))
+ batch_size += seq_len
+
+ dummy_data = self.input_registry \
+ .dummy_data_for_profiling(self.model_config,
+ seq_len,
+ self.mm_registry)
+
+ seq = SequenceGroupMetadata(
+ request_id=str(group_id),
+ is_prompt=True,
+ seq_data={group_id: dummy_data.seq_data},
+ sampling_params=sampling_params,
+ block_tables=None,
+ lora_request=dummy_lora_requests_per_seq[group_id]
+ if dummy_lora_requests_per_seq else None,
+ multi_modal_data=dummy_data.multi_modal_data,
+ multi_modal_placeholders=dummy_data.
+ multi_modal_placeholders,
+ )
+ seqs.append(seq)
+
+ # Run the model with the dummy inputs.
+ num_layers = self.model_config.get_num_layers(self.parallel_config)
+            # use an empty tensor instead of `None` to force Dynamo to pass
+            # it by reference rather than specializing on the value `None`.
+            # the `dtype` argument does not matter, and we use `float32` as
+            # a placeholder (it has wide hardware support).
+            # it is important to create tensors inside the loop, rather than
+            # multiplying the list, to keep Dynamo from treating them as
+            # tensor aliasing.
+ kv_caches = [
+ torch.tensor([], dtype=torch.float32, device=self.device)
+ for _ in range(num_layers)
+ ]
+ finished_requests_ids = [seq.request_id for seq in seqs]
+ model_input = self.prepare_model_input(
+ seqs, finished_requests_ids=finished_requests_ids)
+ intermediate_tensors = None
+ if not get_pp_group().is_first_rank:
+ intermediate_tensors = \
+ self.model.make_empty_intermediate_tensors(
+ batch_size=batch_size,
+ dtype=self.model_config.dtype,
+ device=self.device)
+
+ self.execute_model(model_input, kv_caches, intermediate_tensors)
+ torch.cuda.synchronize()
+ return
def remove_all_loras(self):
if not self.lora_manager:
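A minimal sketch of the set_in_profile_run pattern added above: a contextmanager flips a flag for the duration of profile_run and is guaranteed to reset it, so code invoked during profiling can branch on it. DemoRunner is a stand-in, not the model runner.
from contextlib import contextmanager
class DemoRunner:
    def __init__(self):
        self.in_profile_run = False
    @contextmanager
    def set_in_profile_run(self):
        self.in_profile_run = True
        try:
            yield
        finally:
            # Reset even if profiling raises.
            self.in_profile_run = False
    def profile_run(self):
        with self.set_in_profile_run():
            assert self.in_profile_run  # downstream code may branch on this
runner = DemoRunner()
runner.profile_run()
assert not runner.in_profile_run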
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index acce923498d7e..4aab09c80826b 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -32,7 +32,7 @@
MULTI_STEP_ATTENTION_BACKENDS = [
"FLASH_ATTN", "ROCM_FLASH", "FLASHINFER", "NO_ATTENTION"
]
-MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["FLASH_ATTN"]
+MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["FLASH_ATTN", "FLASHINFER"]
def _get_supported_attention_backends(chunked_prefill_enabled: bool) \
-> List[str]:
From f218f9c24d224800e0ea4488aa71bd8215c8bdcd Mon Sep 17 00:00:00 2001
From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Date: Tue, 14 Jan 2025 21:19:55 -0800
Subject: [PATCH 18/67] [core] Turn off GPU communication overlap for Ray
executor (#12051)
Signed-off-by: Rui Qiao
---
vllm/envs.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/vllm/envs.py b/vllm/envs.py
index c4a568c680db0..b7b597ea15af3 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -45,7 +45,7 @@
VLLM_USE_RAY_SPMD_WORKER: bool = False
VLLM_USE_RAY_COMPILED_DAG: bool = False
VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True
- VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = True
+ VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
VLLM_IMAGE_FETCH_TIMEOUT: int = 5
@@ -340,11 +340,11 @@ def get_default_config_root():
lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", "1"))
),
- # If the env var is set, it enables GPU communication overlap in
- # Ray's compiled DAG. This flag is ignored if
+ # If the env var is set, it enables GPU communication overlap
+ # (experimental feature) in Ray's compiled DAG. This flag is ignored if
# VLLM_USE_RAY_COMPILED_DAG is not set.
"VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM":
- lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "1"))
+ lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0"))
),
# Use dedicated multiprocess context for workers.
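A hedged sketch of how the flipped default behaves when the flag is read, mirroring the envs.py lambda above: unset now means disabled, while setting the variable to "1" re-enables the experimental overlap path. The helper name is illustrative.
import os
def read_overlap_comm() -> bool:
    # Same parsing as the envs.py lambda: "0"/"1" string -> int -> bool.
    return bool(
        int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0")))
os.environ.pop("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", None)
assert read_overlap_comm() is False
os.environ["VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM"] = "1"
assert read_overlap_comm() is True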
From ad34c0df0f1b26b303a590133685b29e3daad20e Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Wed, 15 Jan 2025 13:45:21 +0800
Subject: [PATCH 19/67] [core] platform agnostic executor via collective_rpc
(#11256)
Signed-off-by: youkaichao
---
tests/engine/test_custom_executor.py | 28 +-
tests/engine/test_multiproc_workers.py | 12 +-
tests/test_utils.py | 6 +-
vllm/config.py | 12 +-
vllm/distributed/parallel_state.py | 6 +-
vllm/engine/async_llm_engine.py | 88 +--
vllm/engine/llm_engine.py | 94 +---
vllm/engine/multiprocessing/engine.py | 11 +-
vllm/executor/cpu_executor.py | 299 ----------
vllm/executor/distributed_gpu_executor.py | 212 -------
vllm/executor/executor_base.py | 267 +++++++--
vllm/executor/gpu_executor.py | 145 -----
vllm/executor/hpu_executor.py | 202 -------
...executor.py => mp_distributed_executor.py} | 81 +--
vllm/executor/multiproc_worker_utils.py | 12 +-
vllm/executor/multiproc_xpu_executor.py | 26 -
vllm/executor/neuron_executor.py | 114 ----
vllm/executor/openvino_executor.py | 125 -----
...xecutor.py => ray_distributed_executor.py} | 240 ++++----
vllm/executor/ray_hpu_executor.py | 515 ------------------
vllm/executor/ray_tpu_executor.py | 343 ------------
vllm/executor/ray_utils.py | 26 +-
vllm/executor/ray_xpu_executor.py | 40 --
vllm/executor/tpu_executor.py | 142 -----
vllm/executor/uniproc_executor.py | 57 ++
vllm/executor/xpu_executor.py | 39 --
vllm/platforms/cpu.py | 27 +
vllm/platforms/cuda.py | 22 +
vllm/platforms/neuron.py | 8 +
vllm/platforms/openvino.py | 12 +-
vllm/platforms/tpu.py | 10 +
vllm/platforms/xpu.py | 22 +-
vllm/spec_decode/medusa_worker.py | 8 +-
vllm/spec_decode/multi_step_worker.py | 21 +-
vllm/spec_decode/spec_decode_worker.py | 9 +-
vllm/v1/executor/abstract.py | 5 +-
vllm/v1/executor/multiproc_executor.py | 17 +-
vllm/v1/executor/uniproc_executor.py | 2 +-
vllm/v1/worker/gpu_worker.py | 3 +-
vllm/worker/hpu_worker.py | 66 +++
vllm/worker/neuron_worker.py | 28 +-
vllm/worker/openvino_worker.py | 6 +-
vllm/worker/worker_base.py | 86 ++-
43 files changed, 852 insertions(+), 2642 deletions(-)
delete mode 100644 vllm/executor/cpu_executor.py
delete mode 100644 vllm/executor/distributed_gpu_executor.py
delete mode 100644 vllm/executor/gpu_executor.py
delete mode 100644 vllm/executor/hpu_executor.py
rename vllm/executor/{multiproc_gpu_executor.py => mp_distributed_executor.py} (75%)
delete mode 100644 vllm/executor/multiproc_xpu_executor.py
delete mode 100644 vllm/executor/neuron_executor.py
delete mode 100644 vllm/executor/openvino_executor.py
rename vllm/executor/{ray_gpu_executor.py => ray_distributed_executor.py} (78%)
delete mode 100644 vllm/executor/ray_hpu_executor.py
delete mode 100644 vllm/executor/ray_tpu_executor.py
delete mode 100644 vllm/executor/ray_xpu_executor.py
delete mode 100644 vllm/executor/tpu_executor.py
create mode 100644 vllm/executor/uniproc_executor.py
delete mode 100644 vllm/executor/xpu_executor.py
diff --git a/tests/engine/test_custom_executor.py b/tests/engine/test_custom_executor.py
index bbabb936e92ba..2a057ca488a50 100644
--- a/tests/engine/test_custom_executor.py
+++ b/tests/engine/test_custom_executor.py
@@ -1,12 +1,13 @@
import asyncio
import os
+from typing import Any, Dict, List, Optional, Tuple
import pytest
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine
-from vllm.executor.gpu_executor import GPUExecutor, GPUExecutorAsync
+from vllm.executor.uniproc_executor import UniProcExecutor
from vllm.sampling_params import SamplingParams
@@ -14,21 +15,20 @@ class Mock:
...
-class CustomGPUExecutor(GPUExecutor):
+class CustomUniExecutor(UniProcExecutor):
- def execute_model(self, *args, **kwargs):
+ def collective_rpc(self,
+ method: str,
+ timeout: Optional[float] = None,
+ args: Tuple = (),
+ kwargs: Optional[Dict] = None) -> List[Any]:
# Drop marker to show that this was ran
with open(".marker", "w"):
...
- return super().execute_model(*args, **kwargs)
+ return super().collective_rpc(method, timeout, args, kwargs)
-class CustomGPUExecutorAsync(GPUExecutorAsync):
-
- async def execute_model_async(self, *args, **kwargs):
- with open(".marker", "w"):
- ...
- return await super().execute_model_async(*args, **kwargs)
+CustomUniExecutorAsync = CustomUniExecutor
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@@ -41,10 +41,6 @@ def test_custom_executor_type_checking(model):
engine_args = AsyncEngineArgs(model=model,
distributed_executor_backend=Mock)
AsyncLLMEngine.from_engine_args(engine_args)
- with pytest.raises(TypeError):
- engine_args = AsyncEngineArgs(
- model=model, distributed_executor_backend=CustomGPUExecutor)
- AsyncLLMEngine.from_engine_args(engine_args)
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@@ -55,7 +51,7 @@ def test_custom_executor(model, tmp_path):
assert not os.path.exists(".marker")
engine_args = EngineArgs(
- model=model, distributed_executor_backend=CustomGPUExecutor)
+ model=model, distributed_executor_backend=CustomUniExecutor)
engine = LLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=1)
@@ -75,7 +71,7 @@ def test_custom_executor_async(model, tmp_path):
assert not os.path.exists(".marker")
engine_args = AsyncEngineArgs(
- model=model, distributed_executor_backend=CustomGPUExecutorAsync)
+ model=model, distributed_executor_backend=CustomUniExecutorAsync)
engine = AsyncLLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=1)
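A hedged sketch of the extension point this test now exercises: subclass an executor and intercept collective_rpc, through which every worker call (execute_model, start_profile, and so on) is funneled. StandInUniProcExecutor and TracingExecutor are illustrative; in vLLM the parent would be UniProcExecutor from vllm.executor.uniproc_executor.
from typing import Any, Dict, List, Optional, Tuple
class StandInUniProcExecutor:
    # Stand-in base: the real executor dispatches `method` to its worker.
    def collective_rpc(self,
                       method: str,
                       timeout: Optional[float] = None,
                       args: Tuple = (),
                       kwargs: Optional[Dict] = None) -> List[Any]:
        return [(method, args, kwargs or {})]
class TracingExecutor(StandInUniProcExecutor):
    def collective_rpc(self,
                       method: str,
                       timeout: Optional[float] = None,
                       args: Tuple = (),
                       kwargs: Optional[Dict] = None) -> List[Any]:
        # A single override observes every worker invocation.
        print(f"rpc -> {method}")
        return super().collective_rpc(method, timeout, args, kwargs)
TracingExecutor().collective_rpc("execute_model")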
diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py
index e07dd6deef5bf..db70a808c008b 100644
--- a/tests/engine/test_multiproc_workers.py
+++ b/tests/engine/test_multiproc_workers.py
@@ -6,16 +6,15 @@
import pytest
+from vllm.config import VllmConfig
from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
ResultHandler, WorkerMonitor)
+from vllm.worker.worker_base import WorkerWrapperBase
-class DummyWorker:
+class DummyWorkerWrapper(WorkerWrapperBase):
"""Dummy version of vllm.worker.worker.Worker"""
- def __init__(self, rank: int):
- self.rank = rank
-
def worker_method(self, worker_input: Any) -> Tuple[int, Any]:
sleep(0.05)
@@ -28,9 +27,10 @@ def worker_method(self, worker_input: Any) -> Tuple[int, Any]:
def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]:
result_handler = ResultHandler()
+ vllm_config = VllmConfig()
workers = [
- ProcessWorkerWrapper(result_handler, partial(DummyWorker, rank=rank))
- for rank in range(8)
+ ProcessWorkerWrapper(result_handler, DummyWorkerWrapper, vllm_config,
+ rank) for rank in range(8)
]
worker_monitor = WorkerMonitor(workers, result_handler)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 6810e0302f897..c68d730af7f8a 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -2,6 +2,7 @@
import os
import socket
from typing import AsyncIterator, Tuple
+from unittest.mock import patch
import pytest
import torch
@@ -390,7 +391,10 @@ def test_bind_kv_cache_encoder_decoder():
def test_bind_kv_cache_pp():
- cfg = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=2))
+ with patch("vllm.utils.cuda_device_count_stateless", lambda: 2):
+ # this test runs with 1 GPU, but we simulate 2 GPUs
+ cfg = VllmConfig(
+ parallel_config=ParallelConfig(pipeline_parallel_size=2))
with set_current_vllm_config(cfg):
from vllm.attention import Attention
diff --git a/vllm/config.py b/vllm/config.py
index 59b509d5a961e..4a42aefb75026 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1294,8 +1294,11 @@ def __post_init__(self) -> None:
from vllm.executor import ray_utils
backend = "mp"
ray_found = ray_utils.ray_is_available()
- if (current_platform.is_cuda()
- and cuda_device_count_stateless() < self.world_size):
+ if current_platform.is_neuron():
+ # neuron uses single process to control multiple devices
+ backend = "uni"
+ elif (current_platform.is_cuda()
+ and cuda_device_count_stateless() < self.world_size):
if not ray_found:
raise ValueError("Unable to load Ray which is "
"required for multi-node inference, "
@@ -1328,13 +1331,14 @@ def _verify_args(self) -> None:
from vllm.executor.executor_base import ExecutorBase
from vllm.platforms import current_platform
if self.distributed_executor_backend not in (
- "ray", "mp", None) and not (isinstance(
+ "ray", "mp", "uni", None) and not (isinstance(
self.distributed_executor_backend, type) and issubclass(
self.distributed_executor_backend, ExecutorBase)):
raise ValueError(
"Unrecognized distributed executor backend "
f"{self.distributed_executor_backend}. Supported "
- "values are 'ray', 'mp' or custom ExecutorBase subclass.")
+                "values are 'ray', 'mp', 'uni', or custom ExecutorBase"
+ " subclass.")
if self.use_ray:
from vllm.executor import ray_utils
ray_utils.assert_ray_available()
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index be7f16ef52a47..bf8b30cccd5f6 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -862,12 +862,14 @@ def init_model_parallel_group(
) -> GroupCoordinator:
if use_custom_allreduce is None:
use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
+ from vllm.platforms import current_platform
return GroupCoordinator(
group_ranks=group_ranks,
local_rank=local_rank,
torch_distributed_backend=backend,
- use_pynccl=True,
- use_custom_allreduce=use_custom_allreduce,
+ use_pynccl=current_platform.is_cuda_alike(),
+ use_custom_allreduce=current_platform.is_cuda_alike()
+ and use_custom_allreduce,
use_tpu_communicator=True,
use_hpu_communicator=True,
use_xpu_communicator=True,
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index da23ed19ef7be..08fef8250d483 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -18,9 +18,7 @@
from vllm.engine.llm_engine import LLMEngine, SchedulerOutputState
from vllm.engine.metrics_types import StatLoggerBase
from vllm.engine.protocol import EngineClient
-from vllm.executor.executor_base import ExecutorAsyncBase
-from vllm.executor.gpu_executor import GPUExecutorAsync
-from vllm.executor.ray_utils import initialize_ray_cluster
+from vllm.executor.executor_base import ExecutorBase
from vllm.inputs import PromptType
from vllm.inputs.preprocess import InputPreprocessor
from vllm.logger import init_logger
@@ -620,69 +618,9 @@ def __del__(self):
rt.new_requests_event.set()
@classmethod
- def _get_executor_cls(
- cls, engine_config: VllmConfig) -> Type[ExecutorAsyncBase]:
- distributed_executor_backend = (
- engine_config.parallel_config.distributed_executor_backend)
- if isinstance(distributed_executor_backend, type):
- if not issubclass(distributed_executor_backend, ExecutorAsyncBase):
- raise TypeError(
- "distributed_executor_backend must be a subclass of "
- f"ExecutorAsyncBase. Got {distributed_executor_backend}.")
- executor_class = distributed_executor_backend
- elif engine_config.device_config.device_type == "neuron":
- from vllm.executor.neuron_executor import NeuronExecutorAsync
- executor_class = NeuronExecutorAsync
- elif engine_config.device_config.device_type == "tpu":
- if distributed_executor_backend == "ray":
- from vllm.executor.ray_tpu_executor import RayTPUExecutorAsync
- executor_class = RayTPUExecutorAsync
- else:
- assert distributed_executor_backend is None
- from vllm.executor.tpu_executor import TPUExecutorAsync
- executor_class = TPUExecutorAsync
- elif engine_config.device_config.device_type == "cpu":
- from vllm.executor.cpu_executor import CPUExecutorAsync
- executor_class = CPUExecutorAsync
- elif engine_config.device_config.device_type == "hpu":
- if distributed_executor_backend == "ray":
- initialize_ray_cluster(engine_config.parallel_config)
- from vllm.executor.ray_hpu_executor import RayHPUExecutorAsync
- executor_class = RayHPUExecutorAsync
- else:
- from vllm.executor.hpu_executor import HPUExecutorAsync
- executor_class = HPUExecutorAsync
- elif engine_config.device_config.device_type == "openvino":
- assert distributed_executor_backend is None, (
- "Distributed execution is not supported with "
- "the OpenVINO backend.")
- from vllm.executor.openvino_executor import OpenVINOExecutorAsync
- executor_class = OpenVINOExecutorAsync
- elif engine_config.device_config.device_type == "xpu":
- if distributed_executor_backend is None:
- from vllm.executor.xpu_executor import XPUExecutorAsync
- executor_class = XPUExecutorAsync
- elif distributed_executor_backend == "ray":
- from vllm.executor.ray_xpu_executor import RayXPUExecutorAsync
- executor_class = RayXPUExecutorAsync
- elif distributed_executor_backend == "mp":
- from vllm.executor.multiproc_xpu_executor import (
- MultiprocessingXPUExecutorAsync)
- executor_class = MultiprocessingXPUExecutorAsync
- else:
- raise RuntimeError(
- "Not supported distributed execution model on XPU device.")
- elif distributed_executor_backend == "ray":
- from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
- executor_class = RayGPUExecutorAsync
- elif distributed_executor_backend == "mp":
- from vllm.executor.multiproc_gpu_executor import (
- MultiprocessingGPUExecutorAsync)
- executor_class = MultiprocessingGPUExecutorAsync
- else:
- from vllm.executor.gpu_executor import GPUExecutorAsync
- executor_class = GPUExecutorAsync
- return executor_class
+ def _get_executor_cls(cls,
+ engine_config: VllmConfig) -> Type[ExecutorBase]:
+ return LLMEngine._get_executor_cls(engine_config)
@classmethod
def from_engine_args(
@@ -700,9 +638,6 @@ def from_engine_args(
executor_class = cls._get_executor_cls(engine_config)
- if executor_class.uses_ray:
- initialize_ray_cluster(engine_config.parallel_config)
-
# Create the async LLM engine.
engine = cls(
vllm_config=engine_config,
@@ -1242,23 +1177,12 @@ def remove_logger(self, logger_name: str) -> None:
self.engine.remove_logger(logger_name=logger_name)
async def start_profile(self) -> None:
- # using type instead of isinstance to check to avoid capturing
- # inherited classes
- if type(self.engine.model_executor) == GPUExecutorAsync: # noqa: E721
- self.engine.model_executor.start_profile()
- else:
- self.engine.model_executor._run_workers("start_profile")
+ self.engine.start_profile()
async def stop_profile(self) -> None:
- # using type instead of isinstance to check to avoid capturing
- # inherited classes
- if type(self.engine.model_executor) == GPUExecutorAsync: # noqa: E721
- self.engine.model_executor.stop_profile()
- else:
- self.engine.model_executor._run_workers("stop_profile")
+ self.engine.stop_profile()
async def add_lora(self, lora_request: LoRARequest) -> None:
- """Load a new LoRA adapter into the engine for future requests."""
self.engine.add_lora(lora_request)
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 1db3e59ff3bae..49a1e9f505d9f 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -28,8 +28,6 @@
from vllm.entrypoints.openai.logits_processors import (
get_logits_processors as get_openai_logits_processors)
from vllm.executor.executor_base import ExecutorBase
-from vllm.executor.gpu_executor import GPUExecutor
-from vllm.executor.ray_utils import initialize_ray_cluster
from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs,
PromptType, SingletonInputsAdapter)
from vllm.inputs.parse import is_encoder_decoder_inputs, is_token_prompt
@@ -442,64 +440,26 @@ def _get_executor_cls(cls,
raise TypeError(
"distributed_executor_backend must be a subclass of "
f"ExecutorBase. Got {distributed_executor_backend}.")
- if distributed_executor_backend.uses_ray: # type: ignore
- initialize_ray_cluster(engine_config.parallel_config)
executor_class = distributed_executor_backend
- elif engine_config.device_config.device_type == "neuron":
- from vllm.executor.neuron_executor import NeuronExecutor
- executor_class = NeuronExecutor
- elif engine_config.device_config.device_type == "tpu":
+ elif engine_config.parallel_config.world_size > 1:
if distributed_executor_backend == "ray":
- initialize_ray_cluster(engine_config.parallel_config)
- from vllm.executor.ray_tpu_executor import RayTPUExecutor
- executor_class = RayTPUExecutor
- else:
- assert distributed_executor_backend is None
- from vllm.executor.tpu_executor import TPUExecutor
- executor_class = TPUExecutor
- elif engine_config.device_config.device_type == "cpu":
- from vllm.executor.cpu_executor import CPUExecutor
- executor_class = CPUExecutor
- elif engine_config.device_config.device_type == "hpu":
- if distributed_executor_backend == "ray":
- initialize_ray_cluster(engine_config.parallel_config)
- from vllm.executor.ray_hpu_executor import RayHPUExecutor
- executor_class = RayHPUExecutor
- else:
- from vllm.executor.hpu_executor import HPUExecutor
- executor_class = HPUExecutor
- elif engine_config.device_config.device_type == "openvino":
- from vllm.executor.openvino_executor import OpenVINOExecutor
- executor_class = OpenVINOExecutor
- elif engine_config.device_config.device_type == "xpu":
- if distributed_executor_backend == "ray":
- initialize_ray_cluster(engine_config.parallel_config)
- from vllm.executor.ray_xpu_executor import RayXPUExecutor
- executor_class = RayXPUExecutor
+ from vllm.executor.ray_distributed_executor import (
+ RayDistributedExecutor)
+ executor_class = RayDistributedExecutor
elif distributed_executor_backend == "mp":
- # FIXME(kunshang):
- # spawn needs calling `if __name__ == '__main__':``
- # fork is not supported for xpu start new process.
- logger.error(
- "Both start methods (spawn and fork) have issue "
- "on XPU if you use mp backend, Please try ray instead.")
- else:
- from vllm.executor.xpu_executor import XPUExecutor
- executor_class = XPUExecutor
- elif distributed_executor_backend == "ray":
- initialize_ray_cluster(engine_config.parallel_config)
- from vllm.executor.ray_gpu_executor import RayGPUExecutor
- executor_class = RayGPUExecutor
- elif distributed_executor_backend == "mp":
- from vllm.executor.multiproc_gpu_executor import (
- MultiprocessingGPUExecutor)
- assert not envs.VLLM_USE_RAY_SPMD_WORKER, (
- "multiprocessing distributed executor backend does not "
- "support VLLM_USE_RAY_SPMD_WORKER=1")
- executor_class = MultiprocessingGPUExecutor
+ from vllm.executor.mp_distributed_executor import (
+ MultiprocessingDistributedExecutor)
+ assert not envs.VLLM_USE_RAY_SPMD_WORKER, (
+ "multiprocessing distributed executor backend does not "
+ "support VLLM_USE_RAY_SPMD_WORKER=1")
+ executor_class = MultiprocessingDistributedExecutor
+ elif distributed_executor_backend == "uni":
+ # JAX-style, single-process, multi-device executor.
+ from vllm.executor.uniproc_executor import UniProcExecutor
+ executor_class = UniProcExecutor
else:
- from vllm.executor.gpu_executor import GPUExecutor
- executor_class = GPUExecutor
+ from vllm.executor.uniproc_executor import UniProcExecutor
+ executor_class = UniProcExecutor
return executor_class
@classmethod
@@ -1845,27 +1805,17 @@ def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
def list_prompt_adapters(self) -> List[int]:
return self.model_executor.list_prompt_adapters()
+ def start_profile(self) -> None:
+ self.model_executor.start_profile()
+
+ def stop_profile(self) -> None:
+ self.model_executor.stop_profile()
+
def check_health(self) -> None:
if self.tokenizer:
self.tokenizer.check_health()
self.model_executor.check_health()
- def start_profile(self) -> None:
- # using type instead of isinstance to check to avoid capturing
- # inherited classes (MultiprocessingGPUExecutor)
- if type(self.model_executor) == GPUExecutor: # noqa: E721
- self.model_executor.start_profile()
- else:
- self.model_executor._run_workers("start_profile")
-
- def stop_profile(self) -> None:
- # using type instead of isinstance to check to avoid capturing
- # inherited classes (MultiprocessingGPUExecutor)
- if type(self.model_executor) == GPUExecutor: # noqa: E721
- self.model_executor.stop_profile()
- else:
- self.model_executor._run_workers("stop_profile")
-
def is_tracing_enabled(self) -> bool:
return self.tracer is not None
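A compact sketch of the flattened selection logic above, returning backend names instead of the executor classes vLLM imports; the function name is illustrative, and the unmatched multi-worker case, which upstream config validation rejects earlier, is shown here as a plain error.
from typing import Union
def select_executor(world_size: int,
                    backend: Union[str, type, None] = None) -> str:
    if isinstance(backend, type):
        # A custom ExecutorBase subclass always wins.
        return backend.__name__
    if world_size > 1:
        if backend == "ray":
            return "RayDistributedExecutor"
        if backend == "mp":
            return "MultiprocessingDistributedExecutor"
        if backend == "uni":
            # Single-process, multi-device (JAX-style) execution.
            return "UniProcExecutor"
        raise ValueError(f"unsupported backend for world_size > 1: {backend}")
    return "UniProcExecutor"
assert select_executor(1) == "UniProcExecutor"
assert select_executor(4, "ray") == "RayDistributedExecutor"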
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index 36f4df4b02731..8f231de912c95 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -20,7 +20,6 @@
RPCStartupResponse,
RPCUProfileRequest)
# yapf: enable
-from vllm.executor.gpu_executor import GPUExecutor
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.usage.usage_lib import UsageContext
@@ -356,16 +355,10 @@ def _set_errored(self, e: BaseException):
self._errored_with = e
def start_profile(self) -> None:
- if type(self.engine.model_executor) is GPUExecutor:
- self.engine.model_executor.start_profile()
- else:
- self.engine.model_executor._run_workers("start_profile")
+ self.engine.start_profile()
def stop_profile(self) -> None:
- if type(self.engine.model_executor) is GPUExecutor:
- self.engine.model_executor.stop_profile()
- else:
- self.engine.model_executor._run_workers("stop_profile")
+ self.engine.stop_profile()
def signal_handler(*_) -> None:
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
deleted file mode 100644
index b9a6bee5720fd..0000000000000
--- a/vllm/executor/cpu_executor.py
+++ /dev/null
@@ -1,299 +0,0 @@
-import os
-from functools import partial
-from typing import Any, Awaitable, List, Optional, Set, Tuple, Union
-
-from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
-from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
- ResultHandler, WorkerMonitor)
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import get_distributed_init_method, get_open_port, make_async
-from vllm.worker.worker_base import WorkerWrapperBase
-
-logger = init_logger(__name__)
-
-
-class CPUExecutor(ExecutorBase):
-
- uses_ray: bool = False
-
- def _init_executor(self) -> None:
- assert self.device_config.device_type == "cpu"
-
- #
- # Environment variables for CPU executor
- #
-
- # Disable torch async compiling which won't work with daemonic processes
- os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
-
- # Intel OpenMP setting
- ld_prealod_str = os.getenv("LD_PRELOAD", "")
- if "libiomp5.so" in ld_prealod_str:
- # The time(milliseconds) that a thread should wait after
- # completing the execution of a parallel region, before sleeping.
- os.environ['KMP_BLOCKTIME'] = "1"
- # Prevents the CPU to run into low performance state
- os.environ['KMP_TPAUSE'] = "0"
- # Provides fine granularity parallelism
- os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist"
- os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist"
- os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"
-
- # To hint IPEX uses shared memory based AllReduce
- os.environ["LOCAL_WORLD_SIZE"] = str(
- self.parallel_config.tensor_parallel_size)
-
- # Multiprocessing-based executor does not support multi-node setting.
- # Since it only works for single node, we can use the loopback address
- # 127.0.0.1 for communication.
- ip = "127.0.0.1"
- port = get_open_port()
- self.distributed_init_method = get_distributed_init_method(ip, port)
-
- is_async = isinstance(self, CPUExecutorAsync)
-
- world_size = self.parallel_config.tensor_parallel_size
- result_handler = ResultHandler()
- self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None
- self.workers = []
-
- if is_async:
- self.workers = [
- ProcessWorkerWrapper(
- result_handler,
- partial(
- self._create_worker,
- rank=rank,
- local_rank=rank,
- )) for rank in range(0, world_size)
- ]
- self.driver_worker = self.workers[0]
- self.workers = self.workers[1:]
- self.driver_method_invoker = _async_driver_method_invoker
- else:
- self.driver_worker = self._create_worker()
- self.driver_method_invoker = _driver_method_invoker
-
- if world_size != 1:
- self.workers = [
- ProcessWorkerWrapper(
- result_handler,
- partial(
- self._create_worker,
- rank=rank,
- local_rank=rank,
- )) for rank in range(1, world_size)
- ]
-
- self.worker_monitor = None
- if world_size != 1 or is_async:
- if is_async:
- async_worker_list = self.workers + [self.driver_worker]
- else:
- async_worker_list = self.workers
- self.worker_monitor = WorkerMonitor(async_worker_list,
- result_handler)
- result_handler.start()
- self.worker_monitor.start()
-
- self._run_workers("init_device")
- self._run_workers("load_model")
-
- def _create_worker(
- self,
- local_rank: int = 0,
- rank: int = 0,
- ):
-
- wrapper = WorkerWrapperBase(vllm_config=self.vllm_config)
-
- assert self.distributed_init_method is not None
-
- kwargs = dict(
- vllm_config=self.vllm_config,
- local_rank=local_rank,
- rank=rank,
- distributed_init_method=self.distributed_init_method,
- kv_cache_dtype=self.cache_config.cache_dtype,
- is_driver_worker=rank == 0,
- )
- wrapper.init_worker(**kwargs)
-
- return wrapper.worker
-
- def _run_workers(
- self,
- method: str,
- *args,
- async_run_remote_workers_only: bool = False,
- max_concurrent_workers: Optional[int] = None,
- **kwargs,
- ) -> Any:
- """Runs the given method on all workers.
-
- Args:
- async_run_remote_workers_only: If True the method will be run only
- in the remote workers, not the driver worker. It will also be
- run asynchronously and return a list of futures rather than
- blocking on the results.
- """
-
- if max_concurrent_workers:
- raise NotImplementedError(
- "max_concurrent_workers is not supported yet.")
-
- # Start the workers first.
- worker_outputs = [
- worker.execute_method(method, *args, **kwargs)
- for worker in self.workers
- ]
-
- if async_run_remote_workers_only:
- # Just return futures
- return worker_outputs
-
- driver_worker_output = self.driver_method_invoker(
- self.driver_worker, method, *args, **kwargs)
-
- # Get the results of the workers.
- return [driver_worker_output
- ] + [output.get() for output in worker_outputs]
-
- def determine_num_available_blocks(self) -> Tuple[int, int]:
- """Determine the number of available KV blocks by invoking the
- underlying worker.
- """
- return self.driver_method_invoker(self.driver_worker,
- "determine_num_available_blocks")
-
- def initialize_cache(self, num_gpu_blocks: int,
- num_cpu_blocks: int) -> None:
- """Initialize the KV cache by invoking the underlying worker.
- """
- # NOTE: We log here to avoid multiple logs when number of workers is
- # greater than one. We could log in the engine, but not all executors
- # have GPUs.
- # NOTE: `cpu block` for CPU backend is located on CPU memory but is
- # referred as `gpu block`. Because we want to reuse the existing block
- # management procedure.
- logger.info("# CPU blocks: %d", num_gpu_blocks)
-
- self._run_workers("initialize_cache",
- num_gpu_blocks=num_gpu_blocks,
- num_cpu_blocks=num_cpu_blocks)
-
- def execute_model(
- self,
- execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
- if (self.parallel_config.tensor_parallel_size > 1
- and self.parallel_worker_tasks is None):
- self.parallel_worker_tasks = self._run_workers(
- "start_worker_execution_loop",
- async_run_remote_workers_only=True,
- )
- output = self.driver_method_invoker(self.driver_worker,
- "execute_model", execute_model_req)
- return output
-
- def stop_remote_worker_execution_loop(self) -> None:
- if self.parallel_worker_tasks is None:
- return
- """
- Passing None will cause the driver to stop the model execution
- loop running in each of the remote workers.
- """
- self.driver_method_invoker(self.driver_worker, "execute_model", None)
- parallel_worker_tasks = self.parallel_worker_tasks
- self.parallel_worker_tasks = None
- # Ensure that workers exit model loop cleanly
- # (this will raise otherwise)
- self._wait_for_tasks_completion(parallel_worker_tasks)
-
- def add_lora(self, lora_request: LoRARequest) -> bool:
- return all(self._run_workers("add_lora", lora_request))
-
- def remove_lora(self, lora_id: int) -> bool:
- return all(self._run_workers("remove_lora", lora_id))
-
- def pin_lora(self, lora_id: int) -> bool:
- assert lora_id > 0, "lora_id must be greater than 0."
- return all(self._run_workers(
- "pin_lora",
- lora_id=lora_id,
- ))
-
- def list_loras(self) -> Set[int]:
- return self.driver_method_invoker(self.driver_worker, "list_loras")
-
- def add_prompt_adapter(
- self, prompt_adapter_request: PromptAdapterRequest) -> bool:
- return all(
- self._run_workers(
- "add_prompt_adapter",
- prompt_adapter_request,
- ))
-
- def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- return all(
- self._run_workers(
- "remove_prompt_adapter",
- prompt_adapter_id,
- ))
-
- def list_prompt_adapters(self) -> Set[int]:
- return self.driver_method_invoker(self.driver_worker,
- "list_prompt_adapters")
-
- def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- return all(self._run_workers(
- "pin_prompt_adapter",
- prompt_adapter_id,
- ))
-
- def check_health(self) -> None:
- """Raises an error if engine is unhealthy."""
- if self.worker_monitor is not None and not self.worker_monitor.is_alive(
- ):
- raise RuntimeError("Worker processes are not running")
-
- def shutdown(self):
- if (worker_monitor := getattr(self, "worker_monitor",
- None)) is not None:
- worker_monitor.close()
-
- def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
- """Wait for futures returned from _run_workers() with
- async_run_remote_workers_only to complete."""
- for result in parallel_worker_tasks:
- result.get()
-
- def start_profile(self) -> None:
- self.driver_method_invoker(self.driver_worker, "start_profile")
-
- def stop_profile(self) -> None:
- self.driver_method_invoker(self.driver_worker, "stop_profile")
-
-
-class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase):
-
- async def execute_model_async(
- self,
- execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
- output = await make_async(self.execute_model
- )(execute_model_req=execute_model_req, )
- return output
-
- async def check_health_async(self) -> None:
- self.check_health()
-
-
-def _driver_method_invoker(driver, method: str, *args, **kwargs):
- return getattr(driver, method)(*args, **kwargs)
-
-
-def _async_driver_method_invoker(driver, method: str, *args, **kwargs):
- return driver.execute_method(method, *args, **kwargs).get()
diff --git a/vllm/executor/distributed_gpu_executor.py b/vllm/executor/distributed_gpu_executor.py
deleted file mode 100644
index deb7cb1c97ef5..0000000000000
--- a/vllm/executor/distributed_gpu_executor.py
+++ /dev/null
@@ -1,212 +0,0 @@
-import asyncio
-from abc import abstractmethod
-from typing import Any, Awaitable, Dict, List, Optional, Set, Tuple, Union
-
-from vllm.executor.executor_base import ExecutorAsyncBase
-from vllm.executor.gpu_executor import GPUExecutor
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.sequence import ExecuteModelRequest
-
-logger = init_logger(__name__)
-
-
-class DistributedGPUExecutor(GPUExecutor):
- """Abstract superclass of multi-GPU executor implementations."""
-
- def __init__(self, *args, **kwargs):
- # This is non-None when the execute model loop is running
- # in the parallel workers. It's a coroutine in the AsyncLLMEngine case.
- self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None
- # Updated by implementations that require additional args to be passed
- # to the _run_workers execute_model call
- self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {}
-
- super().__init__(*args, **kwargs)
-
- def determine_num_available_blocks(self) -> Tuple[int, int]:
- """Determine the number of available KV blocks.
-
- This invokes `determine_num_available_blocks` on each worker and takes
- the min of the results, guaranteeing that the selected cache sizes are
- compatible with all workers.
-
- Returns:
- - tuple[num_gpu_blocks, num_cpu_blocks]
- """
- # Get the maximum number of blocks that can be allocated on GPU and CPU.
- num_blocks = self._run_workers("determine_num_available_blocks", )
-
- # Since we use a shared centralized controller, we take the minimum
- # number of blocks across all workers to make sure all the memory
- # operators can be applied to all workers.
- num_gpu_blocks = min(b[0] for b in num_blocks)
- num_cpu_blocks = min(b[1] for b in num_blocks)
-
- return num_gpu_blocks, num_cpu_blocks
-
- def initialize_cache(self, num_gpu_blocks: int,
- num_cpu_blocks: int) -> None:
- """Initialize the KV cache in all workers.
- """
-
- # NOTE: We log here to avoid multiple logs when number of workers is
- # greater than one. We could log in the engine, but not all executors
- # have GPUs.
- logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
- num_cpu_blocks)
- max_concurrency = (num_gpu_blocks * self.cache_config.block_size /
- self.model_config.max_model_len)
- logger.info("Maximum concurrency for %s tokens per request: %.2fx",
- self.model_config.max_model_len, max_concurrency)
-
- self.cache_config.num_gpu_blocks = num_gpu_blocks
- self.cache_config.num_cpu_blocks = num_cpu_blocks
-
- self._run_workers("initialize_cache",
- num_gpu_blocks=num_gpu_blocks,
- num_cpu_blocks=num_cpu_blocks)
-
- def execute_model(
- self,
- execute_model_req: ExecuteModelRequest,
- ) -> List[SamplerOutput]:
- if self.parallel_worker_tasks is None:
- self.parallel_worker_tasks = self._run_workers(
- "start_worker_execution_loop",
- async_run_tensor_parallel_workers_only=True,
- **self.extra_execute_model_run_workers_kwargs)
-
- # Only the driver worker returns the sampling results.
- driver_outputs = self._driver_execute_model(execute_model_req)
- assert driver_outputs is not None
- return driver_outputs
-
- def stop_remote_worker_execution_loop(self) -> None:
- if self.parallel_worker_tasks is None:
- return
-
- self._driver_execute_model(execute_model_req=None)
- parallel_worker_tasks = self.parallel_worker_tasks
- self.parallel_worker_tasks = None
- # Ensure that workers exit model loop cleanly
- # (this will raise otherwise)
- self._wait_for_tasks_completion(parallel_worker_tasks)
-
- def add_lora(self, lora_request: LoRARequest) -> bool:
- assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
- return self._run_workers(
- "add_lora",
- lora_request=lora_request,
- )
-
- def remove_lora(self, lora_id: int) -> bool:
- assert lora_id > 0, "lora_id must be greater than 0."
- return self._run_workers(
- "remove_lora",
- lora_id=lora_id,
- )
-
- def pin_lora(self, lora_id: int) -> bool:
- assert lora_id > 0, "lora_id must be greater than 0."
- return self._run_workers(
- "pin_lora",
- lora_id=lora_id,
- )
-
- def list_loras(self) -> Set[int]:
- return self._run_workers("list_loras")
-
- def save_sharded_state(
- self,
- path: str,
- pattern: Optional[str] = None,
- max_size: Optional[int] = None,
- ) -> None:
- self._run_workers("save_sharded_state",
- path=path,
- pattern=pattern,
- max_size=max_size)
-
- @abstractmethod
- def _driver_execute_model(
- self, execute_model_req: Optional[ExecuteModelRequest]
- ) -> Optional[List[SamplerOutput]]:
- """Run execute_model in the driver worker.
-
- Passing None will cause the driver to stop the model execution loop
- running in each of the remote workers. In this case, this method
- returns None. Otherwise, this method returns the model output.
- """
- raise NotImplementedError
-
- @abstractmethod
- def _run_workers(
- self,
- method: str,
- *args,
- async_run_tensor_parallel_workers_only: bool = False,
- max_concurrent_workers: Optional[int] = None,
- **kwargs,
- ) -> Any:
- """Runs the given method on all workers.
-
- Args:
- async_run_tensor_parallel_workers_only: If True the method will be
- run only in the remote TP workers, not the driver worker.
- It will also be run asynchronously and return a list of futures
- rather than blocking on the results.
- """
- raise NotImplementedError
-
- @abstractmethod
- def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
- """Wait for futures returned from _run_workers() with
- async_run_remote_workers_only to complete."""
- raise NotImplementedError
-
-
-class DistributedGPUExecutorAsync(DistributedGPUExecutor, ExecutorAsyncBase):
-
- async def execute_model_async(
- self,
- execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
- if self.parallel_worker_tasks is None:
- # Start model execution loop running in the parallel workers
- self.parallel_worker_tasks = asyncio.create_task(
- self._start_worker_execution_loop())
-
- # Only the driver worker returns the sampling results.
- return await self._driver_execute_model_async(execute_model_req)
-
- async def stop_remote_worker_execution_loop_async(self) -> None:
- if self.parallel_worker_tasks is None:
- return
-
- await self._driver_execute_model_async()
- parallel_worker_tasks = self.parallel_worker_tasks
- self.parallel_worker_tasks = None
- # Ensure that workers exit model loop cleanly
- # (this will raise otherwise)
- await parallel_worker_tasks
-
- @abstractmethod
- async def _driver_execute_model_async(
- self,
- execute_model_req: Optional[ExecuteModelRequest] = None,
- ) -> List[SamplerOutput]:
- """Execute the model asynchronously in the driver worker.
-
- Passing None will cause the driver to stop the model execution
- loop running in each of the remote workers.
- """
- raise NotImplementedError
-
- @abstractmethod
- async def _start_worker_execution_loop(self):
- """Run execution loop on all workers. It guarantees all workers run
- the loop or None of them is running the loop. Loop can be stopped by
- `stop_remote_worker_execution_loop`.
- The API is idempotent (guarantee only 1 loop run at any moment)."""
- raise NotImplementedError
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 9cba189dd57f9..00ecadcf92667 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -1,18 +1,24 @@
+import asyncio
from abc import ABC, abstractmethod
-from typing import List, Optional, Set, Tuple
+from typing import Any, Awaitable, Dict, List, Optional, Set, Tuple, Union
from vllm.config import VllmConfig
+from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.platforms import current_platform
from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.sequence import ExecuteModelRequest
+from vllm.sequence import ExecuteModelRequest, PoolerOutput
+from vllm.utils import make_async
+
+logger = init_logger(__name__)
class ExecutorBase(ABC):
"""Base class for all executors.
- An executor is responsible for executing the model on a specific device
- type (e.g., CPU, GPU, Neuron, etc.). Or it can be a distributed executor
+    An executor is responsible for executing the model on a single device,
+    or it can be a distributed executor
that can execute the model on multiple devices.
"""
@@ -40,6 +46,20 @@ def _init_executor(self) -> None:
pass
@abstractmethod
+ def collective_rpc(self,
+ method: str,
+ timeout: Optional[float] = None,
+ args: Tuple = (),
+ kwargs: Optional[Dict] = None) -> List[Any]:
+ """
+        The main interface of the executor: run a method on all workers,
+        with homogeneous arguments.
+        If the args are heterogeneous, pack them into a single list and
+        unpack it inside each worker's method; every worker knows its own
+        rank.
+ """
+ pass
+
def determine_num_available_blocks(self) -> Tuple[int, int]:
"""Determine the number of available blocks for the GPU KV cache and
swappable CPU KV cache.
@@ -53,58 +73,113 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
appended to.
"""
- raise NotImplementedError
+ results = self.collective_rpc("determine_num_available_blocks")
+        num_gpu_blocks = min(r[0] for r in results)
+        num_cpu_blocks = min(r[1] for r in results)
+        return num_gpu_blocks, num_cpu_blocks
- @abstractmethod
- def initialize_cache(self, num_gpu_blocks: int,
- num_cpu_blocks: int) -> None:
- """Initialize the KV cache with the given size in blocks.
+ def initialize(self, num_gpu_blocks: int) -> None:
"""
- raise NotImplementedError
+ Initialize the KV caches and begin the model execution loop of the
+ underlying workers.
+ For V1 compatibility.
+ """
+ logger.info("# GPU blocks: %d", num_gpu_blocks)
+ self.collective_rpc("initialize_cache", args=(num_gpu_blocks, ))
+ self.collective_rpc("compile_or_warm_up_model")
+
+ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
+ """Initialize the KV cache by invoking the underlying worker.
+ """
+ # NOTE: This is logged in the executor because there can be >1 workers.
+ logger.info("# %s blocks: %d, # CPU blocks: %d",
+ current_platform.dispatch_key, num_gpu_blocks,
+ num_cpu_blocks)
+ max_concurrency = (num_gpu_blocks * self.cache_config.block_size /
+ self.model_config.max_model_len)
+ logger.info("Maximum concurrency for %s tokens per request: %.2fx",
+ self.model_config.max_model_len, max_concurrency)
+
+ self.cache_config.num_gpu_blocks = num_gpu_blocks
+ self.cache_config.num_cpu_blocks = num_cpu_blocks
+
+ self.collective_rpc("initialize_cache",
+ args=(num_gpu_blocks, num_cpu_blocks))
- @abstractmethod
def execute_model(
self, execute_model_req: ExecuteModelRequest
- ) -> Optional[List[SamplerOutput]]:
- """Executes at least one model step on the given sequences."""
- raise NotImplementedError
+ ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
+ output = self.collective_rpc("execute_model",
+ args=(execute_model_req, ))
+ return output[0]
def stop_remote_worker_execution_loop(self) -> None:
"""Releases parallel workers from model loop."""
return
- @abstractmethod
def add_lora(self, lora_request: LoRARequest) -> bool:
- raise NotImplementedError
+ assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
+ return all(self.collective_rpc("add_lora", args=(lora_request, )))
- @abstractmethod
def remove_lora(self, lora_id: int) -> bool:
- raise NotImplementedError
+ assert lora_id > 0, "lora_id must be greater than 0."
+ return all(self.collective_rpc("remove_lora", args=(lora_id, )))
- @abstractmethod
def pin_lora(self, lora_id: int) -> bool:
- raise NotImplementedError # type: ignore
+ assert lora_id > 0, "lora_id must be greater than 0."
+ return all(self.collective_rpc("pin_lora", args=(lora_id, )))
- @abstractmethod
def list_loras(self) -> Set[int]:
- raise NotImplementedError
+ sets = self.collective_rpc("list_loras")
+ for s in sets:
+            assert s == sets[0], "All workers should have the same LoRAs."
+ return sets[0]
- @abstractmethod
def add_prompt_adapter(
self, prompt_adapter_request: PromptAdapterRequest) -> bool:
- raise NotImplementedError
+ assert prompt_adapter_request.prompt_adapter_id > 0, \
+ "prompt_adapter_id must be greater than 0."
+ return all(
+ self.collective_rpc("add_prompt_adapter",
+ args=(prompt_adapter_request, )))
- @abstractmethod
def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- raise NotImplementedError
+ assert prompt_adapter_id > 0, \
+ "prompt_adapter_id must be greater than 0."
+ return all(
+ self.collective_rpc("remove_prompt_adapter",
+ args=(prompt_adapter_id, )))
- @abstractmethod
def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- raise NotImplementedError # type: ignore
+ assert prompt_adapter_id > 0, \
+ "prompt_adapter_id must be greater than 0."
+ return all(
+ self.collective_rpc("pin_prompt_adapter",
+ args=(prompt_adapter_id, )))
- @abstractmethod
def list_prompt_adapters(self) -> Set[int]:
- raise NotImplementedError
+ sets = self.collective_rpc("list_prompt_adapters")
+ for s in sets:
+ assert (s == sets[0]
+ ), "All workers should have the same prompt adapters."
+ return sets[0]
+
+ def start_profile(self) -> None:
+ self.collective_rpc("start_profile")
+
+ def stop_profile(self) -> None:
+ self.collective_rpc("stop_profile")
+
+ def save_sharded_state(
+ self,
+ path: str,
+ pattern: Optional[str] = None,
+ max_size: Optional[int] = None,
+ ) -> None:
+ self.collective_rpc("save_sharded_state",
+ kwargs=dict(path=path,
+ pattern=pattern,
+ max_size=max_size))
@abstractmethod
def check_health(self) -> None:
@@ -119,15 +194,12 @@ def shutdown(self) -> None:
def __del__(self):
self.shutdown()
-
-class ExecutorAsyncBase(ExecutorBase):
-
- @abstractmethod
async def execute_model_async(
self,
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
"""Executes one model step on the given sequences."""
- raise NotImplementedError
+ output = await make_async(self.execute_model)(execute_model_req)
+ return output
async def stop_remote_worker_execution_loop_async(self) -> None:
"""Releases parallel workers from model loop."""
@@ -137,3 +209,128 @@ async def check_health_async(self) -> None:
"""Checks if the executor is healthy. If not, it should raise an
exception."""
self.check_health()
+
+
+class DistributedExecutorBase(ExecutorBase):
+ """Abstract superclass of distributed executor implementations."""
+
+ def __init__(self, *args, **kwargs):
+ # This is non-None when the execute model loop is running
+ # in the parallel workers. It's a coroutine in the AsyncLLMEngine case.
+ self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None
+
+ super().__init__(*args, **kwargs)
+
+ def execute_model(
+ self,
+ execute_model_req: ExecuteModelRequest,
+ ) -> List[SamplerOutput]:
+ # TODO: unify into collective_rpc
+ if self.parallel_worker_tasks is None:
+ self.parallel_worker_tasks = self._run_workers(
+ "start_worker_execution_loop",
+ async_run_tensor_parallel_workers_only=True)
+
+ # Only the driver worker returns the sampling results.
+ driver_outputs = self._driver_execute_model(execute_model_req)
+ assert driver_outputs is not None
+ return driver_outputs
+
+ def stop_remote_worker_execution_loop(self) -> None:
+ if self.parallel_worker_tasks is None:
+ return
+
+ self._driver_execute_model(execute_model_req=None)
+ parallel_worker_tasks = self.parallel_worker_tasks
+ self.parallel_worker_tasks = None
+ # Ensure that workers exit model loop cleanly
+ # (this will raise otherwise)
+ self._wait_for_tasks_completion(parallel_worker_tasks)
+
+ @abstractmethod
+ def _driver_execute_model(
+ self, execute_model_req: Optional[ExecuteModelRequest]
+ ) -> Optional[List[SamplerOutput]]:
+ """Run execute_model in the driver worker.
+
+ Passing None will cause the driver to stop the model execution loop
+ running in each of the remote workers. In this case, this method
+ returns None. Otherwise, this method returns the model output.
+ """
+ raise NotImplementedError
+
+ def collective_rpc(self,
+ method: str,
+ timeout: Optional[float] = None,
+ args: Tuple = (),
+ kwargs: Optional[Dict] = None) -> List[Any]:
+ return self._run_workers(method, *args, **(kwargs or {}))
+
+ @abstractmethod
+ def _run_workers(
+ self,
+ method: str,
+ *args,
+ async_run_tensor_parallel_workers_only: bool = False,
+ max_concurrent_workers: Optional[int] = None,
+ **kwargs,
+ ) -> Any:
+ """Runs the given method on all workers.
+
+ Args:
+ async_run_tensor_parallel_workers_only: If True the method will be
+ run only in the remote TP workers, not the driver worker.
+ It will also be run asynchronously and return a list of futures
+ rather than blocking on the results.
+
+ # TODO: simplify and merge with collective_rpc
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
+ """Wait for futures returned from _run_workers() with
+        async_run_tensor_parallel_workers_only to complete."""
+ raise NotImplementedError
+
+ async def execute_model_async(
+ self,
+ execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+ if self.parallel_worker_tasks is None:
+ # Start model execution loop running in the parallel workers
+ self.parallel_worker_tasks = asyncio.create_task(
+ self._start_worker_execution_loop())
+
+ # Only the driver worker returns the sampling results.
+ return await self._driver_execute_model_async(execute_model_req)
+
+ async def stop_remote_worker_execution_loop_async(self) -> None:
+ if self.parallel_worker_tasks is None:
+ return
+
+ await self._driver_execute_model_async()
+ parallel_worker_tasks = self.parallel_worker_tasks
+ self.parallel_worker_tasks = None
+ # Ensure that workers exit model loop cleanly
+ # (this will raise otherwise)
+ await parallel_worker_tasks
+
+ @abstractmethod
+ async def _driver_execute_model_async(
+ self,
+ execute_model_req: Optional[ExecuteModelRequest] = None,
+ ) -> List[SamplerOutput]:
+ """Execute the model asynchronously in the driver worker.
+
+ Passing None will cause the driver to stop the model execution
+ loop running in each of the remote workers.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ async def _start_worker_execution_loop(self):
+ """Run execution loop on all workers. It guarantees all workers run
+ the loop or None of them is running the loop. Loop can be stopped by
+ `stop_remote_worker_execution_loop`.
+ The API is idempotent (guarantee only 1 loop run at any moment)."""
+ raise NotImplementedError
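
Taken together, the changes above fold the per-method worker dispatch (determine_num_available_blocks, add_lora, list_loras, and so on) into ExecutorBase, with collective_rpc as the single fan-out primitive and a small reduction (min, all, first element) applied in the base class. A minimal, self-contained sketch of that dispatch pattern follows; ToyExecutor and FakeWorker are hypothetical stand-ins, not vLLM classes.

    from typing import Any, Dict, List, Optional, Tuple

    class FakeWorker:
        """Hypothetical stand-in for a vLLM worker; returns fixed values."""

        def determine_num_available_blocks(self) -> Tuple[int, int]:
            return (1024, 256)

        def add_lora(self, lora_request: Any) -> bool:
            return True

    class ToyExecutor:
        """Mimics the new ExecutorBase defaults: every operation is a
        collective_rpc over the workers plus a reduction of the results."""

        def __init__(self, workers: List[FakeWorker]):
            self.workers = workers

        def collective_rpc(self,
                           method: str,
                           timeout: Optional[float] = None,
                           args: Tuple = (),
                           kwargs: Optional[Dict] = None) -> List[Any]:
            # Homogeneous arguments: every worker gets the same args/kwargs.
            kwargs = kwargs or {}
            return [getattr(w, method)(*args, **kwargs) for w in self.workers]

        def determine_num_available_blocks(self) -> Tuple[int, int]:
            # Take the minimum across workers, as in the base-class default.
            results = self.collective_rpc("determine_num_available_blocks")
            return (min(r[0] for r in results), min(r[1] for r in results))

        def add_lora(self, lora_request: Any) -> bool:
            return all(self.collective_rpc("add_lora", args=(lora_request, )))

    executor = ToyExecutor([FakeWorker(), FakeWorker()])
    print(executor.determine_num_available_blocks())  # (1024, 256)
    print(executor.add_lora(object()))                # True

Under this pattern a concrete executor only has to supply a working collective_rpc; the cache, LoRA, and profiling entry points come from the base class defaults added above.
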
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
deleted file mode 100644
index 7fa34456028dd..0000000000000
--- a/vllm/executor/gpu_executor.py
+++ /dev/null
@@ -1,145 +0,0 @@
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
-
-from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.sequence import ExecuteModelRequest, PoolerOutput
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
- make_async)
-from vllm.worker.worker_base import WorkerWrapperBase
-
-logger = init_logger(__name__)
-
-
-def create_worker(**kwargs):
- vllm_config = kwargs.get("vllm_config")
- wrapper = WorkerWrapperBase(vllm_config=vllm_config)
- wrapper.init_worker(**kwargs)
- return wrapper.worker
-
-
-class GPUExecutor(ExecutorBase):
-
- uses_ray: bool = False
-
- def _init_executor(self) -> None:
- """Initialize the worker and load the model.
- """
- assert self.parallel_config.world_size == 1, (
- "GPUExecutor only supports single GPU.")
-
- self.driver_worker = self._create_worker()
- self.driver_worker.init_device()
- self.driver_worker.load_model()
-
- def _get_worker_kwargs(
- self,
- local_rank: int = 0,
- rank: int = 0,
- distributed_init_method: Optional[str] = None) -> Dict[str, Any]:
- """Return worker init args for a given rank."""
- if distributed_init_method is None:
- distributed_init_method = get_distributed_init_method(
- get_ip(), get_open_port())
- return dict(
- vllm_config=self.vllm_config,
- local_rank=local_rank,
- rank=rank,
- distributed_init_method=distributed_init_method,
- is_driver_worker=(not self.parallel_config)
- or (rank % self.parallel_config.tensor_parallel_size == 0),
- )
-
- def _create_worker(self,
- local_rank: int = 0,
- rank: int = 0,
- distributed_init_method: Optional[str] = None):
- return create_worker(**self._get_worker_kwargs(
- local_rank=local_rank,
- rank=rank,
- distributed_init_method=distributed_init_method))
-
- def determine_num_available_blocks(self) -> Tuple[int, int]:
- """Determine the number of available KV blocks by invoking the
- underlying worker.
- """
- return self.driver_worker.determine_num_available_blocks()
-
- def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
- """Initialize the KV cache by invoking the underlying worker.
- """
- # NOTE: This is logged in the executor because there can be >1 worker
- # with other executors. We could log in the engine level, but work
- # remains to abstract away the device for non-GPU configurations.
- logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
- num_cpu_blocks)
- max_concurrency = (num_gpu_blocks * self.cache_config.block_size /
- self.model_config.max_model_len)
- logger.info("Maximum concurrency for %s tokens per request: %.2fx",
- self.model_config.max_model_len, max_concurrency)
-
- self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
-
- def execute_model(
- self, execute_model_req: ExecuteModelRequest
- ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
- output = self.driver_worker.execute_model(execute_model_req)
- return output
-
- def add_lora(self, lora_request: LoRARequest) -> bool:
- assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
- return self.driver_worker.add_lora(lora_request)
-
- def remove_lora(self, lora_id: int) -> bool:
- assert lora_id > 0, "lora_id must be greater than 0."
- return self.driver_worker.remove_lora(lora_id)
-
- def pin_lora(self, lora_id: int) -> bool:
- assert lora_id > 0, "lora_id must be greater than 0."
- return self.driver_worker.pin_lora(lora_id)
-
- def list_loras(self) -> Set[int]:
- return self.driver_worker.list_loras()
-
- def add_prompt_adapter(
- self, prompt_adapter_request: PromptAdapterRequest) -> bool:
- assert prompt_adapter_request.prompt_adapter_id > 0, \
- "prompt_adapter_id must be greater than 0."
- return self.driver_worker.add_prompt_adapter(prompt_adapter_request)
-
- def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- assert prompt_adapter_id > 0, \
- "prompt_adapter_id must be greater than 0."
- return self.driver_worker.remove_prompt_adapter(prompt_adapter_id)
-
- def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- assert prompt_adapter_id > 0, \
- "prompt_adapter_id must be greater than 0."
- return self.driver_worker.pin_prompt_adapter(prompt_adapter_id)
-
- def list_prompt_adapters(self) -> Set[int]:
- return self.driver_worker.list_prompt_adapters()
-
- def check_health(self) -> None:
- # GPUExecutor will always be healthy as long as
- # it's running.
- return
-
- def start_profile(self) -> None:
- self.driver_worker.start_profile()
-
- def stop_profile(self) -> None:
- self.driver_worker.stop_profile()
-
-
-class GPUExecutorAsync(GPUExecutor, ExecutorAsyncBase):
-
- async def execute_model_async(
- self,
- execute_model_req: ExecuteModelRequest,
- ) -> List[Union[SamplerOutput, PoolerOutput]]:
- output = await make_async(self.driver_worker.execute_model
- )(execute_model_req=execute_model_req)
- return output
diff --git a/vllm/executor/hpu_executor.py b/vllm/executor/hpu_executor.py
deleted file mode 100644
index c9b7bfa71edfa..0000000000000
--- a/vllm/executor/hpu_executor.py
+++ /dev/null
@@ -1,202 +0,0 @@
-###############################################################################
-# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
-###############################################################################
-
-import contextlib
-import os
-from typing import Any, Dict, List, Optional, Set, Tuple
-
-from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
- make_async)
-from vllm.worker.worker_base import WorkerWrapperBase
-
-logger = init_logger(__name__)
-
-
-class HPUExecutor(ExecutorBase):
-
- uses_ray: bool = False
-
- def _init_executor(self) -> None:
- """Initialize the worker and load the model."""
- self._init_worker()
-
- def _get_worker_kwargs(
- self,
- local_rank: int = 0,
- rank: int = 0,
- distributed_init_method: Optional[str] = None) -> Dict[str, Any]:
- """Return worker init args for a given rank."""
- if distributed_init_method is None:
- distributed_init_method = get_distributed_init_method(
- get_ip(), get_open_port())
- return dict(
- vllm_config=self.vllm_config,
- local_rank=local_rank,
- rank=rank,
- distributed_init_method=distributed_init_method,
- is_driver_worker=rank == 0,
- )
-
- def _create_worker(self,
- local_rank: int = 0,
- rank: int = 0,
- distributed_init_method: Optional[str] = None):
- wrapper = WorkerWrapperBase(vllm_config=self.vllm_config)
- wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank,
- distributed_init_method))
- return wrapper.worker
-
- def _init_worker(self):
- assert self.parallel_config.world_size == 1, (
- "GPUExecutor only supports single GPU.")
-
- self.driver_worker = self._create_worker()
- self.driver_worker.init_device()
- self.driver_worker.load_model()
-
- def determine_num_available_blocks(self) -> Tuple[int, int]:
- """Determine the number of available KV blocks by invoking the
- underlying worker.
- """
- return self.driver_worker.determine_num_available_blocks()
-
- def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
- """Initialize the KV cache by invoking the underlying worker.
- """
- # NOTE: This is logged in the executor because there can be >1 worker
- # with other executors. We could log in the engine level, but work
- # remains to abstract away the device for non-GPU configurations.
- logger.info("# HPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
- num_cpu_blocks)
- from vllm_hpu_extension.profiler import HabanaMemoryProfiler
- with HabanaMemoryProfiler() as cache_init_m:
- self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
- msg = f"init_cache_engine took {cache_init_m.get_summary_string()}"
- logger.info(msg)
-
- def finish_measurements(self):
- self.driver_worker.finish_measurements()
-
- def execute_model(
- self,
- execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
- # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501
- # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501
- # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501
- # VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL - will log cpu fallbacks per engine step, always, even if there were none # noqa:E501
- log_graph_compilation_all = os.environ.get(
- 'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0'
- log_graph_compilation = os.environ.get(
- 'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION',
- '0') != '0' or log_graph_compilation_all
- log_cpu_fallbacks_all = os.environ.get(
- 'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0'
- log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS',
- '0') != '0' or log_cpu_fallbacks_all
- if log_graph_compilation or log_cpu_fallbacks:
- from habana_frameworks.torch.hpu.metrics import metric_localcontext
- seq_group_metadata_list = execute_model_req.seq_group_metadata_list
- is_prompt = any([
- seq_group_metadata.is_prompt
- for seq_group_metadata in seq_group_metadata_list
- ])
- max_context_len = max([
- max([
- len(v.prompt_token_ids) + len(v.output_token_ids)
- for v in seq_group_metadata.seq_data.values()
- ]) for seq_group_metadata in seq_group_metadata_list
- ]) # whoa, that's some spicy stuff right here
- max_num_blocks = (
- (max_context_len - 1) // self.cache_config.block_size) + 1
- input_stats = (f'is_prompt: {is_prompt}, '
- f'num_seqs: {len(seq_group_metadata_list)}, '
- f'max_context_len: {max_context_len}, '
- f'max_num_blocks {max_num_blocks}')
- gc_ctx = metric_localcontext(
- "graph_compilation"
- ) if log_graph_compilation else contextlib.nullcontext()
- cpu_fallback_ctx = metric_localcontext(
- "cpu_fallback"
- ) if log_cpu_fallbacks else contextlib.nullcontext()
- with gc_ctx as gc_local_metric, \
- cpu_fallback_ctx as cpu_fallback_local_metric:
- output = self.driver_worker.execute_model(execute_model_req)
- if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0
- ) or log_graph_compilation_all:
- msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: "
- f"{gc_local_metric.stats()}, {input_stats}")
- logger.warning(msg)
- if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] >
- 0) or log_cpu_fallbacks_all:
- msg = ("VLLM_HPU_STEP_CPU_FALLBACK: "
- f"{cpu_fallback_local_metric.stats()}, {input_stats}")
- logger.warning(msg)
-
- return output
-
- output = self.driver_worker.execute_model(execute_model_req)
- return output
-
- def add_lora(self, lora_request: LoRARequest) -> bool:
- assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
- return self.driver_worker.add_lora(lora_request)
-
- def remove_lora(self, lora_id: int) -> bool:
- assert lora_id > 0, "lora_id must be greater than 0."
- return self.driver_worker.remove_lora(lora_id)
-
- def pin_lora(self, lora_id: int) -> bool:
- assert lora_id > 0, "lora_id must be greater than 0."
- return self.driver_worker.pin_lora(lora_id)
-
- def list_loras(self) -> Set[int]:
- return self.driver_worker.list_loras()
-
- def add_prompt_adapter(
- self, prompt_adapter_request: PromptAdapterRequest) -> bool:
- raise NotImplementedError(
- "Prompt Adapter is not implemented for HPU backend.")
-
- def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- raise NotImplementedError(
- "Prompt Adapter is not implemented for HPU backend.")
-
- def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- raise NotImplementedError(
- "Prompt Adapter is not implemented for HPU backend.")
-
- def list_prompt_adapters(self) -> Set[int]:
- raise NotImplementedError(
- "Prompt Adapter is not implemented for HPU backend.")
-
- def check_health(self) -> None:
- # GPUExecutor will always be healthy as long as
- # it's running.
- return
-
- def start_profile(self) -> None:
- self.driver_worker.start_profile()
-
- def stop_profile(self) -> None:
- self.driver_worker.stop_profile()
-
- def shutdown(self) -> None:
- self.driver_worker.shutdown_inc()
-
-
-class HPUExecutorAsync(HPUExecutor, ExecutorAsyncBase):
-
- async def execute_model_async(
- self,
- execute_model_req: ExecuteModelRequest,
- ) -> List[SamplerOutput]:
- output = await make_async(self.driver_worker.execute_model
- )(execute_model_req=execute_model_req, )
- return output
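
For reference, the deleted execute_model above reads its step-logging switches from environment variables that are treated as booleans (any value other than "0" enables them). A usage sketch in Python; the chosen values are illustrative only:

    import os

    # Log graph compilations for steps where any occurred; the comments above
    # recommend pairing this with PT_HPU_METRICS_GC_DETAILS.
    os.environ["VLLM_HPU_LOG_STEP_GRAPH_COMPILATION"] = "1"
    os.environ["PT_HPU_METRICS_GC_DETAILS"] = "1"
    # Log CPU fallbacks for steps where any occurred.
    os.environ["VLLM_HPU_LOG_STEP_CPU_FALLBACKS"] = "1"
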
diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/mp_distributed_executor.py
similarity index 75%
rename from vllm/executor/multiproc_gpu_executor.py
rename to vllm/executor/mp_distributed_executor.py
index fc58163cade64..d9dde949b844a 100644
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/mp_distributed_executor.py
@@ -1,32 +1,26 @@
import asyncio
-import os
-from functools import partial
from typing import Any, List, Optional
-from vllm.executor.distributed_gpu_executor import ( # yapf: disable
- DistributedGPUExecutor, DistributedGPUExecutorAsync)
-from vllm.executor.gpu_executor import create_worker
+from vllm.executor.executor_base import DistributedExecutorBase
from vllm.executor.multiproc_worker_utils import (
ProcessWorkerWrapper, ResultHandler, WorkerMonitor,
set_multiprocessing_worker_envs)
from vllm.logger import init_logger
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless,
- get_distributed_init_method, get_open_port, make_async,
- update_environment_variables)
+from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
+ get_ip, get_open_port, make_async)
+from vllm.worker.worker_base import WorkerWrapperBase
logger = init_logger(__name__)
-class MultiprocessingGPUExecutor(DistributedGPUExecutor):
- """Python multiprocessing-based multi-GPU executor"""
+class MultiprocessingDistributedExecutor(DistributedExecutorBase):
+ """Python multiprocessing-based distributed executor"""
uses_ray: bool = False
def _init_executor(self) -> None:
- self._check_executor_parameters()
-
# Create the parallel GPU workers.
world_size = self.parallel_config.world_size
tensor_parallel_size = self.parallel_config.tensor_parallel_size
@@ -55,15 +49,9 @@ def _init_executor(self) -> None:
else:
result_handler = ResultHandler()
for rank in range(1, world_size):
- worker = ProcessWorkerWrapper(
- result_handler,
- partial(
- create_worker,
- **self._get_worker_kwargs(
- rank=rank,
- local_rank=rank,
- distributed_init_method=distributed_init_method,
- )))
+ worker = ProcessWorkerWrapper(result_handler,
+ WorkerWrapperBase,
+ self.vllm_config, rank)
self.workers.append(worker)
if rank % tensor_parallel_size == 0:
self.tp_driver_workers.append(worker)
@@ -77,32 +65,30 @@ def _init_executor(self) -> None:
# Set up signal handlers to shutdown the executor cleanly
# sometimes gc does not work well
- self.driver_worker = self._create_worker(
- distributed_init_method=distributed_init_method)
+ self.driver_worker = WorkerWrapperBase(self.vllm_config, 0)
+
+ all_kwargs = []
+ distributed_init_method = get_distributed_init_method(
+ get_ip(), get_open_port())
+ for i in range(world_size):
+ local_rank = i
+ rank = i
+ kwargs = dict(
+ vllm_config=self.vllm_config,
+ local_rank=local_rank,
+ rank=rank,
+ distributed_init_method=distributed_init_method,
+ is_driver_worker=(not self.parallel_config)
+ or (rank % self.parallel_config.tensor_parallel_size == 0),
+ )
+ all_kwargs.append(kwargs)
+ self._run_workers("init_worker", all_kwargs)
self._run_workers("init_device")
self._run_workers("load_model",
max_concurrent_workers=self.parallel_config.
max_parallel_loading_workers)
-
- def _check_executor_parameters(self):
- world_size = self.parallel_config.world_size
- tensor_parallel_size = self.parallel_config.tensor_parallel_size
-
- # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
- if "CUDA_VISIBLE_DEVICES" not in os.environ:
- update_environment_variables({
- "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
- })
-
- cuda_device_count = cuda_device_count_stateless()
- # Use confusing message for more common TP-only case.
- assert tensor_parallel_size <= cuda_device_count, (
- f"please set tensor_parallel_size ({tensor_parallel_size}) "
- f"to less than max local gpu count ({cuda_device_count})")
-
- assert world_size <= cuda_device_count, (
- f"please ensure that world_size ({world_size}) "
- f"is less than than max local gpu count ({cuda_device_count})")
+ self.driver_exec_model = make_async(self.driver_worker.execute_model)
+ self.pp_locks: Optional[List[asyncio.Lock]] = None
def shutdown(self):
if (worker_monitor := getattr(self, "worker_monitor",
@@ -172,15 +158,6 @@ def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
for result in parallel_worker_tasks:
result.get()
-
-class MultiprocessingGPUExecutorAsync(MultiprocessingGPUExecutor,
- DistributedGPUExecutorAsync):
-
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
- self.driver_exec_model = make_async(self.driver_worker.execute_model)
- self.pp_locks: Optional[List[asyncio.Lock]] = None
-
async def _driver_execute_model_async(
self,
execute_model_req: Optional[ExecuteModelRequest] = None
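
The is_driver_worker expression in the per-rank kwargs above marks the first rank of every tensor-parallel group as a driver worker. A small worked example of that rule, with illustrative sizes:

    # With tensor_parallel_size = 2 and world_size = 4 (e.g. pipeline parallel
    # size 2), ranks 0 and 2 are flagged as driver workers by the rule above.
    tensor_parallel_size = 2
    world_size = 4
    driver_ranks = [
        rank for rank in range(world_size)
        if rank % tensor_parallel_size == 0
    ]
    print(driver_ranks)  # [0, 2]
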
diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py
index bc32826529eef..c9fb3c664c575 100644
--- a/vllm/executor/multiproc_worker_utils.py
+++ b/vllm/executor/multiproc_worker_utils.py
@@ -12,6 +12,7 @@
import torch
+from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.triton_utils.importing import HAS_TRITON
from vllm.utils import _check_multiproc_method, get_mp_context
@@ -147,7 +148,8 @@ class ProcessWorkerWrapper:
for handling single-node multi-GPU tensor parallel."""
def __init__(self, result_handler: ResultHandler,
- worker_factory: Callable[[], Any]) -> None:
+ worker_factory: Callable[[VllmConfig, int], Any],
+ vllm_config: VllmConfig, rank: int) -> None:
self.mp = get_mp_context()
self._task_queue = self.mp.Queue()
self.result_queue = result_handler.result_queue
@@ -159,6 +161,8 @@ def __init__(self, result_handler: ResultHandler,
worker_factory=worker_factory,
task_queue=self._task_queue,
result_queue=self.result_queue,
+ vllm_config=vllm_config,
+ rank=rank,
),
daemon=True)
@@ -199,9 +203,11 @@ def kill_worker(self):
def _run_worker_process(
- worker_factory: Callable[[], Any],
+ worker_factory: Callable[[VllmConfig, int], Any],
task_queue: Queue,
result_queue: Queue,
+ vllm_config: VllmConfig,
+ rank: int,
) -> None:
"""Worker process event loop"""
@@ -212,7 +218,7 @@ def _run_worker_process(
_add_prefix(sys.stderr, process_name, pid)
# Initialize worker
- worker = worker_factory()
+ worker = worker_factory(vllm_config, rank)
del worker_factory
# Accept tasks from the engine in task_queue
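
The ProcessWorkerWrapper change above replaces a pre-bound, zero-argument factory with a picklable (vllm_config, rank) pair that the child process hands to the factory itself. A minimal sketch of that contract; DummyConfig and DummyWorker are hypothetical stand-ins for VllmConfig and the wrapped worker, not vLLM classes:

    from typing import Any, Callable

    class DummyConfig:
        """Hypothetical stand-in for VllmConfig."""

    class DummyWorker:
        """Hypothetical stand-in for the worker built by the factory."""

        def __init__(self, vllm_config: DummyConfig, rank: int):
            self.vllm_config = vllm_config
            self.rank = rank

    def run_worker_process(worker_factory: Callable[[DummyConfig, int], Any],
                           vllm_config: DummyConfig, rank: int) -> Any:
        # Mirrors the revised _run_worker_process: the worker is constructed
        # inside the subprocess from (config, rank) rather than from a closure.
        return worker_factory(vllm_config, rank)

    worker = run_worker_process(DummyWorker, DummyConfig(), rank=1)
    print(worker.rank)  # 1
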
diff --git a/vllm/executor/multiproc_xpu_executor.py b/vllm/executor/multiproc_xpu_executor.py
deleted file mode 100644
index a66afbf939ef0..0000000000000
--- a/vllm/executor/multiproc_xpu_executor.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import vllm.envs as envs
-from vllm.executor.multiproc_gpu_executor import (
- MultiprocessingGPUExecutor, MultiprocessingGPUExecutorAsync)
-from vllm.executor.xpu_executor import XPUExecutor
-from vllm.logger import init_logger
-from vllm.utils import make_async
-
-logger = init_logger(__name__)
-
-
-class MultiprocessingXPUExecutor(MultiprocessingGPUExecutor, XPUExecutor):
- """Python multiprocessing-based multi-XPU executor"""
-
- def _check_executor_parameters(self):
- mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
- if mp_method != "spawn":
- raise RuntimeError(
- "XPU multiprocess executor only support spawn as mp method")
-
-
-class MultiprocessingXPUExecutorAsync(MultiprocessingXPUExecutor,
- MultiprocessingGPUExecutorAsync):
-
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
- self.driver_exec_model = make_async(self.driver_worker.execute_model)
diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
deleted file mode 100644
index a9efc4f9a801c..0000000000000
--- a/vllm/executor/neuron_executor.py
+++ /dev/null
@@ -1,114 +0,0 @@
-from typing import List, Set, Tuple
-
-from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
- make_async)
-from vllm.worker.worker_base import WorkerWrapperBase
-
-logger = init_logger(__name__)
-
-
-class NeuronExecutor(ExecutorBase):
-
- uses_ray: bool = False
-
- def _init_executor(self) -> None:
- assert (self.lora_config is
- None), "LoRA is not supported for Neuron backend."
- assert (not self.speculative_config
- ), "Speculative decoding not yet supported for Neuron backend."
-
- # Instantiate the worker and load the model to the device.
- self._init_worker()
-
- def _init_worker(self):
- wrapper = WorkerWrapperBase(vllm_config=self.vllm_config)
- distributed_init_method = get_distributed_init_method(
- get_ip(), get_open_port())
- wrapper.init_worker(
- vllm_config=self.vllm_config,
- local_rank=0,
- rank=0,
- distributed_init_method=distributed_init_method,
- )
- self.driver_worker = wrapper.worker
- self.driver_worker.init_device()
- self.driver_worker.load_model()
-
- def determine_num_available_blocks(self) -> Tuple[int, int]:
- """Determine the number of available KV blocks by invoking the
- underlying worker.
- """
- return self.driver_worker.determine_num_available_blocks()
-
- def initialize_cache(self, num_gpu_blocks: int,
- num_cpu_blocks: int) -> None:
- """Initialize the KV cache by invoking the underlying worker.
- """
- self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
-
- def execute_model(
- self,
- execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
- assert (not execute_model_req.blocks_to_swap_in
- and not execute_model_req.blocks_to_swap_out
- and not execute_model_req.blocks_to_copy), (
- "Cache operations are not supported for Neuron backend.")
- assert execute_model_req.num_lookahead_slots == 0, (
- "lookahead not supported for Neuron backend.")
-
- output = self.driver_worker.execute_model(execute_model_req)
- return output
-
- def add_lora(self, lora_request: LoRARequest) -> bool:
- return self.driver_worker.add_lora(lora_request)
-
- def remove_lora(self, lora_id: int) -> bool:
- return self.driver_worker.remove_lora(lora_id)
-
- def pin_lora(self, lora_id: int) -> bool:
- return self.driver_worker.pin_lora(lora_id)
-
- def list_loras(self) -> Set[int]:
- return self.driver_worker.list_loras()
-
- def add_prompt_adapter(self, prompt_adapter_request) -> bool:
- raise NotImplementedError(
- "Soft prompt is currently not supported by the Neuron backend.")
-
- def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- raise NotImplementedError(
- "Soft prompt is currently not supported by the Neuron backend.")
-
- def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- raise NotImplementedError(
- "Soft prompt is currently not supported by the Neuron backend.")
-
- def list_prompt_adapters(self) -> Set[int]:
- raise NotImplementedError(
- "Soft prompt is currently not supported by the Neuron backend.")
-
- def check_health(self) -> None:
- # NeuronExecutor will always be healthy as long as
- # it's running.
- return
-
-
-class NeuronExecutorAsync(NeuronExecutor, ExecutorAsyncBase):
-
- async def execute_model_async(
- self,
- execute_model_req: ExecuteModelRequest,
- ) -> List[SamplerOutput]:
- output = await make_async(self.driver_worker.execute_model
- )(execute_model_req=execute_model_req, )
- return output
-
- async def check_health_async(self) -> None:
- # NeuronExecutor will always be healthy as long as
- # it's running.
- return
diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py
deleted file mode 100644
index 057a32364e512..0000000000000
--- a/vllm/executor/openvino_executor.py
+++ /dev/null
@@ -1,125 +0,0 @@
-from typing import List, Set, Tuple
-
-import openvino as ov
-
-import vllm.envs as envs
-from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.platforms import current_platform
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
- make_async)
-from vllm.worker.worker_base import WorkerWrapperBase
-
-logger = init_logger(__name__)
-
-
-class OpenVINOExecutor(ExecutorBase):
-
- uses_ray: bool = False
-
- def _init_executor(self) -> None:
- assert self.device_config.device_type == "openvino"
- assert self.lora_config is None, "OpenVINO backend doesn't support LoRA"
- assert current_platform.is_openvino_cpu() or \
- current_platform.is_openvino_gpu(), \
- "OpenVINO backend supports only CPU and GPU devices"
-
- # Instantiate the worker and load the model to CPU.
- self._init_worker()
-
- def _init_worker(self):
-
- wrapper = WorkerWrapperBase(vllm_config=self.vllm_config)
-
- distributed_init_method = get_distributed_init_method(
- get_ip(), get_open_port())
- wrapper.init_worker(
- ov_core=ov.Core(),
- vllm_config=self.vllm_config,
- local_rank=0,
- rank=0,
- distributed_init_method=distributed_init_method,
- kv_cache_dtype=self.cache_config.cache_dtype,
- is_driver_worker=True,
- )
- self.driver_worker = wrapper.worker
- self.driver_worker.init_device()
- self.driver_worker.load_model()
-
- def determine_num_available_blocks(self) -> Tuple[int, int]:
- """Determine the number of available KV blocks by invoking the
- underlying worker.
- """
- return self.driver_worker.determine_num_available_blocks()
-
- def initialize_cache(self, num_gpu_blocks: int,
- num_cpu_blocks: int) -> None:
- """Initialize the KV cache by invoking the underlying worker."""
- # NOTE: We log here to avoid multiple logs when number of workers is
- # greater than one. We could log in the engine, but not all executors
- # have GPUs.
- # NOTE: In case of a CPU device, `cpu block` for OpenVINO backend
- # is located on CPU memory but is referred as `gpu block`.
- # Because we want to reuse the existing block management procedure.
- device_blocks = num_gpu_blocks
- swap_blocks = num_cpu_blocks
- logger.info("OpenVINO %s: # device blocks: %d; # swap blocks: %d",
- envs.VLLM_OPENVINO_DEVICE, device_blocks, swap_blocks)
- self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
-
- def execute_model(
- self,
- execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
- output = self.driver_worker.execute_model(execute_model_req)
- return output
-
- def add_lora(self, lora_request: LoRARequest) -> bool:
- return self.driver_worker.add_lora(lora_request)
-
- def remove_lora(self, lora_id: int) -> bool:
- return self.driver_worker.remove_lora(lora_id)
-
- def pin_lora(self, lora_id: int) -> bool:
- return self.driver_worker.pin_lora(lora_id)
-
- def list_loras(self) -> Set[int]:
- return self.driver_worker.list_loras()
-
- def add_prompt_adapter(self, prompt_adapter_request) -> bool:
- raise NotImplementedError(
- "Soft prompt is currently not supported by the OPENVINO backend.")
-
- def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- raise NotImplementedError(
- "Soft prompt is currently not supported by the OPENVINO backend.")
-
- def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- raise NotImplementedError(
- "Soft prompt is currently not supported by the OPENVINO backend.")
-
- def list_prompt_adapters(self) -> Set[int]:
- raise NotImplementedError(
- "Soft prompt is currently not supported by the OPENVINO backend.")
-
- def check_health(self) -> None:
- # OpenVINOExecutor will always be healthy as long as
- # it's running.
- return
-
-
-class OpenVINOExecutorAsync(OpenVINOExecutor, ExecutorAsyncBase):
-
- async def execute_model_async(
- self,
- execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
- output = await make_async(self.driver_worker.execute_model
- )(execute_model_req=execute_model_req, )
- return output
-
- async def check_health_async(self) -> None:
- # OpenVINOExecutor will always be healthy as long as
- # it's running.
- return
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_distributed_executor.py
similarity index 78%
rename from vllm/executor/ray_gpu_executor.py
rename to vllm/executor/ray_distributed_executor.py
index e2c549cbd5331..edceece4b68dc 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -1,24 +1,29 @@
import asyncio
import os
from collections import defaultdict
-from itertools import islice, repeat
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
import msgspec
import vllm.envs as envs
-from vllm.executor.distributed_gpu_executor import ( # yapf: disable
- DistributedGPUExecutor, DistributedGPUExecutorAsync)
+from vllm.executor.executor_base import (
+ DistributedExecutorBase) # yapf: disable
from vllm.executor.msgspec_utils import encode_hook
-from vllm.executor.ray_utils import RayWorkerWrapper, ray
+from vllm.executor.ray_utils import (RayWorkerWrapper, initialize_ray_cluster,
+ ray)
from vllm.logger import init_logger
from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.platforms import current_platform
from vllm.sequence import ExecuteModelRequest
from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
get_ip, get_open_port, make_async)
if ray is not None:
+ from ray.actor import ActorHandle
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+else:
+ ActorHandle = None
if TYPE_CHECKING:
from ray.util.placement_group import PlacementGroup
@@ -26,12 +31,29 @@
logger = init_logger(__name__)
-class RayGPUExecutor(DistributedGPUExecutor):
+@dataclass
+class RayWorkerMetaData:
+ """
+ Metadata for a Ray worker.
+    The order of Ray worker creation can be random,
+ and we need to reset the rank after creating all workers.
+ """
+ worker: ActorHandle
+ created_rank: int
+ adjusted_rank: int = -1
+ ip: str = ""
+
+
+class RayDistributedExecutor(DistributedExecutorBase):
uses_ray: bool = True
def _init_executor(self) -> None:
self.forward_dag: Optional[ray.dag.CompiledDAG] = None
+ if envs.VLLM_USE_V1:
+ # v1 always uses the compiled DAG and SPMD worker.
+ os.environ["VLLM_USE_RAY_SPMD_WORKER"] = "1"
+ os.environ["VLLM_USE_RAY_COMPILED_DAG"] = "1"
# If the env var is set, it uses the Ray's compiled DAG API
# which optimizes the control plane overhead.
# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
@@ -53,6 +75,7 @@ def _init_executor(self) -> None:
"VLLM_USE_RAY_COMPILED_DAG=1")
assert self.uses_ray
+ initialize_ray_cluster(self.parallel_config)
placement_group = self.parallel_config.placement_group
# Disable Ray usage stats collection.
@@ -66,6 +89,13 @@ def _init_executor(self) -> None:
self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
self.output_decoder = msgspec.msgpack.Decoder(
Optional[List[SamplerOutput]])
+ self.use_v1 = envs.VLLM_USE_V1
+
+ self.pp_locks: Optional[List[asyncio.Lock]] = None
+ self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER
+ if not self.use_ray_compiled_dag:
+ self.driver_exec_method = make_async(
+ self.driver_worker.execute_method)
def shutdown(self) -> None:
if hasattr(self, "forward_dag") and self.forward_dag is not None:
@@ -123,9 +153,10 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
# Create the workers.
driver_ip = get_ip()
- workers = []
+ rank = 0
+ worker_metadata: List[RayWorkerMetaData] = []
for bundle_id, bundle in enumerate(placement_group.bundle_specs):
- if not bundle.get("GPU", 0):
+ if not bundle.get(current_platform.ray_device_key, 0):
continue
scheduling_strategy = PlacementGroupSchedulingStrategy(
placement_group=placement_group,
@@ -133,38 +164,51 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
placement_group_bundle_index=bundle_id,
)
- worker = ray.remote(
- num_cpus=0,
- num_gpus=num_gpus,
- scheduling_strategy=scheduling_strategy,
- **ray_remote_kwargs,
- )(RayWorkerWrapper).remote(vllm_config=self.vllm_config)
- workers.append(worker)
-
- worker_ip_refs = [
- worker.get_node_ip.remote() # type: ignore[attr-defined]
- for worker in workers
- ]
- worker_ips = ray.get(worker_ip_refs)
+ if current_platform.ray_device_key == "GPU":
+ # NV+AMD GPUs, and Intel XPUs
+ worker = ray.remote(
+ num_cpus=0,
+ num_gpus=num_gpus,
+ scheduling_strategy=scheduling_strategy,
+ **ray_remote_kwargs,
+ )(RayWorkerWrapper).remote(vllm_config=self.vllm_config,
+ rank=rank)
+ else:
+ worker = ray.remote(
+ num_cpus=0,
+ num_gpus=0,
+ resources={current_platform.ray_device_key: num_gpus},
+ scheduling_strategy=scheduling_strategy,
+ **ray_remote_kwargs,
+ )(RayWorkerWrapper).remote(vllm_config=self.vllm_config,
+ rank=rank)
+ worker_metadata.append(
+ RayWorkerMetaData(worker=worker, created_rank=rank))
+ rank += 1
+
+ worker_ips = ray.get([
+ each.worker.get_node_ip.remote() # type: ignore[attr-defined]
+ for each in worker_metadata
+ ])
+
+ for each, ip in zip(worker_metadata, worker_ips):
+ each.ip = ip
if not self.use_ray_spmd_worker:
- for i in range(len(workers)):
- worker = workers[i]
- worker_ip = worker_ips[i]
+ for i, each in enumerate(worker_metadata):
+ # find and remove the dummy worker from the list
+ worker = each.worker
+ worker_ip = each.ip
if self.driver_dummy_worker is None and worker_ip == driver_ip:
# If the worker is on the same node as the driver, we use it
# as the resource holder for the driver process.
self.driver_dummy_worker = worker
self.driver_worker = RayWorkerWrapper(
- vllm_config=self.vllm_config)
- workers.pop(i)
- worker_ips.pop(i)
- self.workers = workers
+ vllm_config=self.vllm_config, rank=0)
+ worker_metadata.pop(i)
break
- else:
- self.workers = workers
- logger.debug("workers: %s", self.workers)
+ logger.debug("workers: %s", worker_metadata)
logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker)
if not self.use_ray_spmd_worker and self.driver_dummy_worker is None:
raise ValueError(
@@ -176,9 +220,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
for ip in worker_ips:
ip_counts[ip] = ip_counts.get(ip, 0) + 1
- worker_to_ip = dict(zip(self.workers, worker_ips))
-
- def sort_by_driver_then_worker_ip(worker):
+ def sort_by_driver_then_worker_ip(item: RayWorkerMetaData):
"""
Sort the workers based on 3 properties:
1. If the worker is on the same node as the driver (vllm engine),
@@ -188,13 +230,23 @@ def sort_by_driver_then_worker_ip(worker):
             3. Finally, if the worker is on a node with a smaller IP address,
                it should be placed first.
"""
- ip = worker_to_ip[worker]
- return (ip != driver_ip, ip_counts[ip], ip)
+ ip = item.ip
+ return (0 if ip == driver_ip else 1, ip_counts[ip], ip)
# After sorting, the workers on the same node will be
# close to each other, and the workers on the driver
# node will be placed first.
- self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip)
+ sorted_worker_metadata = sorted(worker_metadata,
+ key=sort_by_driver_then_worker_ip)
+ start_rank = 0 if self.use_ray_spmd_worker else 1
+ for i, item in enumerate(sorted_worker_metadata):
+ item.adjusted_rank = i + start_rank
+ self.workers = [item.worker for item in sorted_worker_metadata]
+ rerank_mapping = {
+ item.created_rank: item.adjusted_rank
+ for item in sorted_worker_metadata
+ }
+ self._run_workers("adjust_rank", rerank_mapping)
# Get the set of GPU IDs used on each node.
worker_node_and_gpu_ids = []
@@ -235,21 +287,29 @@ def sort_by_driver_then_worker_ip(worker):
" each node.")
# Set environment variables for the driver and workers.
- all_args_to_update_environment_variables = [({
- "CUDA_VISIBLE_DEVICES":
+ all_args_to_update_environment_variables = [{
+ current_platform.device_control_env_var:
",".join(map(str, node_gpus[node_id])),
- "VLLM_TRACE_FUNCTION":
- str(envs.VLLM_TRACE_FUNCTION),
- **({
- "VLLM_ATTENTION_BACKEND": envs.VLLM_ATTENTION_BACKEND
- } if envs.VLLM_ATTENTION_BACKEND is not None else {})
- }, ) for (node_id, _) in worker_node_and_gpu_ids]
+ } for (node_id, _) in worker_node_and_gpu_ids]
+
+ for args in all_args_to_update_environment_variables:
+ # some carry-over env vars from the driver
+ # TODO: refactor platform-specific env vars
+ for name in [
+ "VLLM_ATTENTION_BACKEND",
+ "TPU_CHIPS_PER_HOST_BOUNDS",
+ "TPU_HOST_BOUNDS",
+ "VLLM_USE_V1",
+ "VLLM_TRACE_FUNCTION",
+ ]:
+ if name in os.environ:
+ args[name] = os.environ[name]
self._env_vars_for_all_workers = (
all_args_to_update_environment_variables)
self._run_workers("update_environment_variables",
- all_args=self._get_env_vars_to_be_updated())
+ self._get_env_vars_to_be_updated())
if len(node_gpus) == 1:
# in single node case, we don't need to get the IP address.
@@ -265,14 +325,19 @@ def sort_by_driver_then_worker_ip(worker):
driver_ip, get_open_port())
# Initialize the actual workers inside worker wrapper.
- init_worker_all_kwargs = [
- self._get_worker_kwargs(
- local_rank=node_workers[node_id].index(rank),
+ all_kwargs = []
+ for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids):
+ local_rank = node_workers[node_id].index(rank)
+ kwargs = dict(
+ vllm_config=self.vllm_config,
+ local_rank=local_rank,
rank=rank,
distributed_init_method=distributed_init_method,
- ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids)
- ]
- self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
+ is_driver_worker=(not self.parallel_config)
+ or (rank % self.parallel_config.tensor_parallel_size == 0),
+ )
+ all_kwargs.append(kwargs)
+ self._run_workers("init_worker", all_kwargs)
self._run_workers("init_device")
self._run_workers("load_model",
@@ -332,9 +397,15 @@ def execute_model(
if self.forward_dag is None:
self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)
- serialized_data = self.input_encoder.encode(execute_model_req)
+ if self.use_v1:
+ serialized_data = execute_model_req
+ else:
+ serialized_data = self.input_encoder.encode(execute_model_req)
outputs = ray.get(self.forward_dag.execute(serialized_data))
- output = self.output_decoder.decode(outputs[0])
+ if self.use_v1:
+ output = outputs[0]
+ else:
+ output = self.output_decoder.decode(outputs[0])
return output
def _run_workers(
@@ -342,8 +413,6 @@ def _run_workers(
method: str,
*args,
async_run_tensor_parallel_workers_only: bool = False,
- all_args: Optional[List[Tuple[Any, ...]]] = None,
- all_kwargs: Optional[List[Dict[str, Any]]] = None,
max_concurrent_workers: Optional[int] = None,
**kwargs,
) -> Any:
@@ -356,8 +425,6 @@ def _run_workers(
It will also be run asynchronously and return a list of futures
rather than blocking on the results.
- args/kwargs: All workers share the same args/kwargs
- - all_args/all_kwargs: args/kwargs for each worker are specified
- individually
"""
if self.use_ray_spmd_worker:
assert not async_run_tensor_parallel_workers_only, (
@@ -368,26 +435,13 @@ def _run_workers(
raise NotImplementedError(
"max_concurrent_workers is not supported yet.")
- count = len(self.workers) if not \
- async_run_tensor_parallel_workers_only \
- else len(self.non_driver_workers)
- # If using SPMD worker, all workers are the same, so we should execute
- # the args on all workers. Otherwise, we skip the first worker's args
- # because those args will go to the driver worker.
- first_worker_args_index: int = 0 if self.use_ray_spmd_worker else 1
- all_worker_args = repeat(args, count) if all_args is None \
- else islice(all_args, first_worker_args_index, None)
- all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
- else islice(all_kwargs, first_worker_args_index, None)
-
# Start the ray workers first.
ray_workers = self.workers
if async_run_tensor_parallel_workers_only:
ray_workers = self.non_driver_workers
ray_worker_outputs = [
- worker.execute_method.remote(method, *worker_args, **worker_kwargs)
- for (worker, worker_args, worker_kwargs
- ) in zip(ray_workers, all_worker_args, all_worker_kwargs)
+ worker.execute_method.remote(method, *args, **kwargs)
+ for worker in ray_workers
]
if async_run_tensor_parallel_workers_only:
@@ -399,13 +453,9 @@ def _run_workers(
# so we only explicitly execute on the driver worker if using a
# non-SPMD worker class.
if not self.use_ray_spmd_worker:
- driver_args = args if all_args is None else all_args[0]
- driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]
-
# Start the driver worker after all the ray workers.
driver_worker_output = [
- self.driver_worker.execute_method(method, *driver_args,
- **driver_kwargs)
+ self.driver_worker.execute_method(method, *args, **kwargs)
]
# Get the results of the ray workers.
@@ -467,11 +517,18 @@ def _compiled_ray_dag(self, enable_asyncio: bool):
for pp_rank, tp_group in enumerate(self.pp_tp_workers):
# Each PP worker takes in the output of the previous PP worker,
# and the TP group executes in SPMD fashion.
- outputs = [
- worker.execute_model_spmd.
- bind( # type: ignore[attr-defined]
- outputs[i]) for i, worker in enumerate(tp_group)
- ]
+ if self.use_v1:
+ outputs = [
+ worker.execute_model.
+ bind( # type: ignore[attr-defined]
+ outputs[i]) for i, worker in enumerate(tp_group)
+ ]
+ else:
+ outputs = [
+ worker.execute_model_spmd.
+ bind( # type: ignore[attr-defined]
+ outputs[i]) for i, worker in enumerate(tp_group)
+ ]
last_pp_rank = len(self.pp_tp_workers) - 1
if pp_rank < last_pp_rank:
@@ -497,17 +554,6 @@ def _compiled_ray_dag(self, enable_asyncio: bool):
def __del__(self):
self.shutdown()
-
-class RayGPUExecutorAsync(RayGPUExecutor, DistributedGPUExecutorAsync):
-
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
- self.pp_locks: Optional[List[asyncio.Lock]] = None
- self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER
- if not self.use_ray_compiled_dag:
- self.driver_exec_method = make_async(
- self.driver_worker.execute_method)
-
async def execute_model_async(
self,
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
@@ -568,5 +614,7 @@ async def _start_worker_execution_loop(self):
]
return await asyncio.gather(*coros)
- def __del__(self):
- self.shutdown()
+ def check_health(self) -> None:
+ # Assume that the Ray workers are healthy.
+ # TODO: check the health of the Ray workers
+ return
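
The hunk above simplifies `_run_workers` so the driver and every Ray worker receive the same `*args`/`**kwargs`; the per-worker `all_args`/`all_kwargs` slicing is gone. A minimal standalone sketch of that uniform fan-out, with `concurrent.futures` threads standing in for Ray actors (an assumption made purely for illustration):

from concurrent.futures import ThreadPoolExecutor
from typing import Any, List


class FakeWorker:
    """Stand-in for a Ray actor; every worker runs the same method."""

    def __init__(self, rank: int) -> None:
        self.rank = rank

    def execute_method(self, method: str, *args: Any, **kwargs: Any) -> str:
        return f"rank={self.rank} ran {method} with {args} {kwargs}"


def run_workers(workers: List[FakeWorker], method: str, *args: Any,
                **kwargs: Any) -> List[str]:
    # Same args/kwargs for every worker -- no per-worker arg slicing.
    with ThreadPoolExecutor(max_workers=len(workers)) as pool:
        futures = [
            pool.submit(w.execute_method, method, *args, **kwargs)
            for w in workers
        ]
        return [f.result() for f in futures]


if __name__ == "__main__":
    print(run_workers([FakeWorker(i) for i in range(4)], "load_model",
                      max_concurrent_workers=None))
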
diff --git a/vllm/executor/ray_hpu_executor.py b/vllm/executor/ray_hpu_executor.py
deleted file mode 100644
index f3025cb537ab8..0000000000000
--- a/vllm/executor/ray_hpu_executor.py
+++ /dev/null
@@ -1,515 +0,0 @@
-import asyncio
-import os
-from collections import defaultdict
-from itertools import islice, repeat
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
-
-import msgspec
-
-import vllm.envs as envs
-from vllm.executor.distributed_gpu_executor import ( # yapf: disable
- DistributedGPUExecutor, DistributedGPUExecutorAsync)
-from vllm.executor.msgspec_utils import encode_hook
-from vllm.executor.ray_utils import RayWorkerWrapper, ray
-from vllm.logger import init_logger
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
- get_ip, get_open_port, make_async)
-
-if ray is not None:
- from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
-
-if TYPE_CHECKING:
- from ray.util.placement_group import PlacementGroup
-
-logger = init_logger(__name__)
-
-
-class RayHPUExecutor(DistributedGPUExecutor):
-
- uses_ray: bool = True
-
- def _init_executor(self) -> None:
- self.forward_dag: Optional[ray.dag.CompiledDAG] = None
- # If the env var is set, it uses the Ray's compiled DAG API
- # which optimizes the control plane overhead.
- # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
- # Currently, this requires USE_RAY_SPMD_WORKER=True.
- self.use_ray_compiled_dag = envs.VLLM_USE_RAY_COMPILED_DAG
- # If the env var is set, then we do not distinguish between the
- # "driver worker" vs other workers. Also, the rank 0 worker will
- # be executed in a remote Ray worker. Currently this requires
- # USE_RAY_COMPILED_DAG=True.
- self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER
- if self.use_ray_compiled_dag:
- assert self.use_ray_spmd_worker, (
- "VLLM_USE_RAY_COMPILED_DAG=1 requires "
- "VLLM_USE_RAY_SPMD_WORKER=1")
- if self.use_ray_spmd_worker:
- # TODO: Support SPMD worker for non-DAG Ray executor.
- assert self.use_ray_compiled_dag, (
- "VLLM_USE_RAY_SPMD_WORKER=1 requires "
- "VLLM_USE_RAY_COMPILED_DAG=1")
-
- assert self.uses_ray
- placement_group = self.parallel_config.placement_group
-
- # Disable Ray usage stats collection.
- ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
- if ray_usage != "1":
- os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
-
- # Create the parallel GPU workers.
- self._init_workers_ray(placement_group)
-
- self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
- self.output_decoder = msgspec.msgpack.Decoder(
- Optional[List[SamplerOutput]])
-
- def shutdown(self) -> None:
- if hasattr(self, "forward_dag") and self.forward_dag is not None:
- self.forward_dag.teardown()
- import ray
- for worker in self.workers:
- ray.kill(worker)
- self.forward_dag = None
-
- def finish_measurements(self):
- self._run_workers("finish_measurements")
-
- def _init_workers_ray(self, placement_group: "PlacementGroup",
- **ray_remote_kwargs):
- # Otherwise, the ray workers are allocated with a full GPU.
- num_gpus = 1
-
- # The driver dummy worker does not actually use any resources.
- # It holds the resource for the driver worker.
- self.driver_dummy_worker: Optional[RayWorkerWrapper] = None
- # The remaining workers are the actual ray actors.
- self.workers: List[RayWorkerWrapper] = []
-
- # Used in ray compiled DAG: indexed first by PP rank,
- # and then TP rank. In other words, the inner list is
- # the TP group of workers for a PP rank.
- self.pp_tp_workers: List[List[RayWorkerWrapper]] = []
-
- logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker)
-
- # Create the workers.
- driver_ip = get_ip()
- for bundle_id, bundle in enumerate(placement_group.bundle_specs):
- if not bundle.get("HPU", 0):
- continue
- scheduling_strategy = PlacementGroupSchedulingStrategy(
- placement_group=placement_group,
- placement_group_capture_child_tasks=True,
- placement_group_bundle_index=bundle_id,
- )
-
- worker = ray.remote(
- num_cpus=0,
- num_gpus=0,
- resources={'HPU': num_gpus},
- scheduling_strategy=scheduling_strategy,
- **ray_remote_kwargs,
- )(RayWorkerWrapper).remote(vllm_config=self.vllm_config)
-
- if self.use_ray_spmd_worker:
- self.workers.append(worker)
- else:
- worker_ip = ray.get(worker.get_node_ip.remote())
- if worker_ip == driver_ip and self.driver_dummy_worker is None:
- # If the worker is on the same node as the driver, we use it
- # as the resource holder for the driver process.
- self.driver_dummy_worker = worker
- self.driver_worker = RayWorkerWrapper(
- vllm_config=self.vllm_config)
- else:
- # Else, added to the list of workers.
- self.workers.append(worker)
-
- logger.debug("workers: %s", self.workers)
- logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker)
- if not self.use_ray_spmd_worker and self.driver_dummy_worker is None:
- raise ValueError(
- "Ray does not allocate any GPUs on the driver node. Consider "
- "adjusting the Ray placement group or running the driver on a "
- "GPU node.")
-
- worker_ips = [
- ray.get(worker.get_node_ip.remote()) # type: ignore[attr-defined]
- for worker in self.workers
- ]
- ip_counts: Dict[str, int] = {}
- for ip in worker_ips:
- ip_counts[ip] = ip_counts.get(ip, 0) + 1
-
- def sort_by_driver_then_worker_ip(worker):
- """
- Sort the workers based on 3 properties:
- 1. If the worker is on the same node as the driver (vllm engine),
- it should be placed first.
- 2. Then, if the worker is on a node with fewer workers, it should
- be placed first.
- 3. Finally, if the work is on a node with smaller IP address, it
- should be placed first.
- """
- ip = ray.get(worker.get_node_ip.remote())
- return (ip != driver_ip, ip_counts[ip], ip)
-
- # After sorting, the workers on the same node will be
- # close to each other, and the workers on the driver
- # node will be placed first.
- self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip)
-
- worker_node_and_gpu_ids = []
- for worker in [self.driver_dummy_worker] + self.workers:
- if worker is None:
- # driver_dummy_worker can be None when using ray spmd worker.
- continue
- worker_node_and_gpu_ids.append(
- ray.get(worker.get_node_and_gpu_ids.remote()) \
- ) # type: ignore
-
- node_workers = defaultdict(list) # node id -> list of worker ranks
- node_gpus = defaultdict(list) # node id -> list of gpu ids
-
- for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
- node_workers[node_id].append(i)
- # `gpu_ids` can be a list of strings or integers.
- # convert them to integers for consistency.
- # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs),
- # string sorting is not sufficient.
- # see https://github.com/vllm-project/vllm/issues/5590
- gpu_ids = [int(x) for x in gpu_ids]
- node_gpus[node_id].extend(gpu_ids)
- for node_id, gpu_ids in node_gpus.items():
- node_gpus[node_id] = sorted(gpu_ids)
-
- all_ips = set(worker_ips + [driver_ip])
- n_ips = len(all_ips)
- n_nodes = len(node_workers)
-
- if n_nodes != n_ips:
- raise RuntimeError(
- f"Every node should have a unique IP address. Got {n_nodes}"
- f" nodes with node ids {list(node_workers.keys())} and "
- f"{n_ips} unique IP addresses {all_ips}. Please check your"
- " network configuration. If you set `VLLM_HOST_IP` "
- "environment variable, make sure it is unique for"
- " each node.")
-
- # Set environment variables for the driver and workers.
- all_args_to_update_environment_variables = [({
- "VLLM_TRACE_FUNCTION":
- str(envs.VLLM_TRACE_FUNCTION),
- }, ) for (node_id, _) in worker_node_and_gpu_ids]
- self._run_workers("update_environment_variables",
- all_args=all_args_to_update_environment_variables)
-
- if len(node_gpus) == 1:
- # in single node case, we don't need to get the IP address.
- # the loopback address is sufficient
- # NOTE: a node may have several IP addresses, one for each
- # network interface. `get_ip()` might return any of them,
- # while they might not work for communication inside the node
- # if the network setup is complicated. Using the loopback address
- # solves this issue, as it always works for communication inside
- # the node.
- driver_ip = "127.0.0.1"
- distributed_init_method = get_distributed_init_method(
- driver_ip, get_open_port())
-
- # Initialize the actual workers inside worker wrapper.
- init_worker_all_kwargs = [
- self._get_worker_kwargs(
- local_rank=node_workers[node_id].index(rank),
- rank=rank,
- distributed_init_method=distributed_init_method,
- ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids)
- ]
- self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
-
- self._run_workers("init_device")
- self._run_workers("load_model",
- max_concurrent_workers=self.parallel_config.
- max_parallel_loading_workers)
-
- if self.use_ray_spmd_worker:
- for pp_rank in range(self.parallel_config.pipeline_parallel_size):
- self.pp_tp_workers.append([])
- for tp_rank in range(
- self.parallel_config.tensor_parallel_size):
- # PP=2, TP=4
- # pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]]
- rank = (pp_rank * self.parallel_config.tensor_parallel_size
- ) + tp_rank
- assert len(self.pp_tp_workers[pp_rank]) == tp_rank
- assert pp_rank < len(self.pp_tp_workers)
- self.pp_tp_workers[pp_rank].append(self.workers[rank])
-
- # This is the list of workers that are rank 0 of each TP group EXCEPT
- # global rank 0. These are the workers that will broadcast to the
- # rest of the workers.
- self.tp_driver_workers: List[RayWorkerWrapper] = []
- # This is the list of workers that are not drivers and not the first
- # worker in a TP group. These are the workers that will be
- # broadcasted to.
- self.non_driver_workers: List[RayWorkerWrapper] = []
-
- # Enforce rank order for correct rank to return final output.
- for index, worker in enumerate(self.workers):
- # The driver worker is rank 0 and not in self.workers.
- rank = index + 1
- if rank % self.parallel_config.tensor_parallel_size == 0:
- self.tp_driver_workers.append(worker)
- else:
- self.non_driver_workers.append(worker)
-
- def _driver_execute_model(
- self, execute_model_req: Optional[ExecuteModelRequest]
- ) -> Optional[List[SamplerOutput]]:
- """Run execute_model in the driver worker.
-
- Passing None will cause the driver to stop the model execution
- loop running in each of the remote workers.
- """
- assert not self.use_ray_spmd_worker, (
- "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1")
- return self.driver_worker.execute_method("execute_model",
- execute_model_req)
-
- def execute_model(
- self,
- execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
- if not self.use_ray_spmd_worker:
- return super().execute_model(execute_model_req)
-
- if self.forward_dag is None:
- self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)
-
- serialized_data = self.input_encoder.encode(execute_model_req)
- outputs = ray.get(self.forward_dag.execute(serialized_data))
- output = self.output_decoder.decode(outputs[0])
- return output
-
- def _run_workers(
- self,
- method: str,
- *args,
- async_run_tensor_parallel_workers_only: bool = False,
- all_args: Optional[List[Tuple[Any, ...]]] = None,
- all_kwargs: Optional[List[Dict[str, Any]]] = None,
- max_concurrent_workers: Optional[int] = None,
- **kwargs,
- ) -> Any:
- """Runs the given method on all workers. Can be used in the following
- ways:
-
- Args:
- - async_run_tensor_parallel_workers_only: If True the method will be
- run only in the remote TP workers, not the driver worker.
- It will also be run asynchronously and return a list of futures
- rather than blocking on the results.
- - args/kwargs: All workers share the same args/kwargs
- - all_args/all_kwargs: args/kwargs for each worker are specified
- individually
- """
- if self.use_ray_spmd_worker:
- assert not async_run_tensor_parallel_workers_only, (
- "async_run_tensor_parallel_workers_only is not supported for "
- "spmd mode.")
-
- if max_concurrent_workers:
- raise NotImplementedError(
- "max_concurrent_workers is not supported yet.")
-
- count = len(self.workers) if not \
- async_run_tensor_parallel_workers_only \
- else len(self.non_driver_workers)
- # If using SPMD worker, all workers are the same, so we should execute
- # the args on all workers. Otherwise, we skip the first worker's args
- # because those args will go to the driver worker.
- first_worker_args_index: int = 0 if self.use_ray_spmd_worker else 1
- all_worker_args = repeat(args, count) if all_args is None \
- else islice(all_args, first_worker_args_index, None)
- all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
- else islice(all_kwargs, first_worker_args_index, None)
-
- # Start the ray workers first.
- ray_workers = self.workers
- if async_run_tensor_parallel_workers_only:
- ray_workers = self.non_driver_workers
- ray_worker_outputs = [
- worker.execute_method.remote(method, *worker_args, **worker_kwargs)
- for (worker, worker_args, worker_kwargs
- ) in zip(ray_workers, all_worker_args, all_worker_kwargs)
- ]
-
- if async_run_tensor_parallel_workers_only:
- # Just return futures
- return ray_worker_outputs
-
- driver_worker_output = []
- # In SPMD mode, the driver worker is the same as any other worker,
- # so we only explicitly execute on the driver worker if using a
- # non-SPMD worker class.
- if not self.use_ray_spmd_worker:
- driver_args = args if all_args is None else all_args[0]
- driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]
-
- # Start the driver worker after all the ray workers.
- driver_worker_output = [
- self.driver_worker.execute_method(method, *driver_args,
- **driver_kwargs)
- ]
-
- # Get the results of the ray workers.
- if self.workers:
- ray_worker_outputs = ray.get(ray_worker_outputs)
-
- return driver_worker_output + ray_worker_outputs
-
- def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
- """Wait for futures returned from _run_workers() with
- async_run_remote_workers_only to complete."""
- ray.get(parallel_worker_tasks)
-
- def _check_ray_adag_installation(self):
- import pkg_resources
- from packaging import version
-
- required_version = version.parse("2.35")
- current_version = version.parse(
- pkg_resources.get_distribution("ray").version)
- # TODO: update the constraint once we adapt to the backward
- # incompatible API change from ray 2.36
- if current_version != required_version:
- raise ValueError(f"Ray version {required_version} is "
- f"required, but found {current_version}")
-
- import importlib.util
- adag_spec = importlib.util.find_spec(
- "ray.experimental.compiled_dag_ref")
- if adag_spec is None:
- raise ValueError("Ray accelerated DAG is not installed. "
- "Run `pip install ray[adag]` to install it.")
-
- def _compiled_ray_dag(self, enable_asyncio: bool):
- assert self.parallel_config.use_ray
- self._check_ray_adag_installation()
- from ray.dag import InputNode, MultiOutputNode
- from ray.experimental.channel.torch_tensor_type import TorchTensorType
-
- with InputNode() as input_data:
- # Example DAG: PP=2, TP=4
- # (ExecuteModelReq, None) -> 0 -> (ExecuteModelReq, IntermediateOutput) -> 4 -> SamplerOutput # noqa: E501
- # -> 1 -> (ExecuteModelReq, IntermediateOutput) -> 5 -> SamplerOutput # noqa: E501
- # -> 2 -> (ExecuteModelReq, IntermediateOutput) -> 6 -> SamplerOutput # noqa: E501
- # -> 3 -> (ExecuteModelReq, IntermediateOutput) -> 7 -> SamplerOutput # noqa: E501
-
- # All workers in the first TP group will take in the
- # ExecuteModelRequest as input.
- outputs = [input_data for _ in self.pp_tp_workers[0]]
- for pp_rank, tp_group in enumerate(self.pp_tp_workers):
- # Each PP worker takes in the output of the previous PP worker,
- # and the TP group executes in SPMD fashion.
- outputs = [
- worker.execute_model_spmd.
- bind( # type: ignore[attr-defined]
- outputs[i]) for i, worker in enumerate(tp_group)
- ]
-
- last_pp_rank = len(self.pp_tp_workers) - 1
- if pp_rank < last_pp_rank:
- # Specify how intermediate tensors should be passed
- # between pp stages, no need to specify for the last
- # pp stage.
- transport = "auto"
- outputs = [
- output.with_type_hint(
- TorchTensorType(transport=transport))
- for output in outputs
- ]
-
- forward_dag = MultiOutputNode(outputs)
-
- return forward_dag.experimental_compile(enable_asyncio=enable_asyncio)
-
- def __del__(self):
- self.shutdown()
-
-
-class RayHPUExecutorAsync(RayHPUExecutor, DistributedGPUExecutorAsync):
-
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
- self.pp_locks: Optional[List[asyncio.Lock]] = None
- self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER
- if not self.use_ray_compiled_dag:
- self.driver_exec_method = make_async(
- self.driver_worker.execute_method)
-
- async def execute_model_async(
- self,
- execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
- if not self.use_ray_spmd_worker:
- return await super().execute_model_async(execute_model_req)
-
- if self.forward_dag is None:
- self.forward_dag = self._compiled_ray_dag(enable_asyncio=True)
-
- serialized_data = self.input_encoder.encode(execute_model_req)
- dag_future = await self.forward_dag.execute_async(serialized_data)
- outputs = await dag_future
- return self.output_decoder.decode(outputs[0])
-
- async def _driver_execute_model_async(
- self,
- execute_model_req: Optional[ExecuteModelRequest] = None
- ) -> List[SamplerOutput]:
- assert not self.use_ray_spmd_worker, (
- "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1")
- if not self.tp_driver_workers:
- return await self.driver_exec_method("execute_model",
- execute_model_req)
- if self.pp_locks is None:
- # This locks each pipeline parallel stage so multiple virtual
- # engines can't execute on the same stage at the same time
- # We create the locks here to avoid creating them in the constructor
- # which uses a different asyncio loop.
- self.pp_locks = [
- asyncio.Lock()
- for _ in range(self.parallel_config.pipeline_parallel_size)
- ]
-
- tasks = [
- asyncio.create_task(
- _run_task_with_lock(self.driver_exec_method, self.pp_locks[0],
- "execute_model", execute_model_req))
- ]
- for pp_rank, driver_worker in enumerate(self.tp_driver_workers,
- start=1):
- tasks.append(
- asyncio.create_task(
- _run_task_with_lock(driver_worker.execute_method.remote,
- self.pp_locks[pp_rank],
- "execute_model", execute_model_req)))
-
- results = await asyncio.gather(*tasks)
-
- # Only the last PP stage has the final results.
- return results[-1]
-
- async def _start_worker_execution_loop(self):
- assert not self.use_ray_spmd_worker, (
- "worker loop is disabled for VLLM_USE_RAY_SPMD_WORKER=1")
- coros = [
- worker.execute_method.remote("start_worker_execution_loop")
- for worker in self.non_driver_workers
- ]
- return await asyncio.gather(*coros)
-
- def __del__(self):
- self.shutdown()
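
The deleted HPU executor ordered its workers with a three-level key: driver node first, then nodes hosting fewer workers, then lexicographically smaller IP. A self-contained sketch of that ordering, using plain IP strings instead of Ray actors (an illustration-only simplification):

from collections import Counter
from typing import List, Tuple


def order_workers(worker_ips: List[str], driver_ip: str) -> List[str]:
    ip_counts = Counter(worker_ips)

    def sort_key(ip: str) -> Tuple[bool, int, str]:
        # 1) driver node first, 2) less crowded nodes first, 3) smallest IP.
        return (ip != driver_ip, ip_counts[ip], ip)

    return sorted(worker_ips, key=sort_key)


if __name__ == "__main__":
    ips = ["10.0.0.2", "10.0.0.1", "10.0.0.2", "10.0.0.3"]
    print(order_workers(ips, driver_ip="10.0.0.2"))
    # ['10.0.0.2', '10.0.0.2', '10.0.0.1', '10.0.0.3']
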
diff --git a/vllm/executor/ray_tpu_executor.py b/vllm/executor/ray_tpu_executor.py
deleted file mode 100644
index 5118c13934f0d..0000000000000
--- a/vllm/executor/ray_tpu_executor.py
+++ /dev/null
@@ -1,343 +0,0 @@
-import asyncio
-import os
-from collections import defaultdict
-from itertools import islice, repeat
-from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Tuple,
- Union)
-
-import vllm.envs as envs
-from vllm.executor.executor_base import ExecutorAsyncBase
-from vllm.executor.ray_utils import RayWorkerWrapper, ray
-from vllm.executor.tpu_executor import TPUExecutor
-from vllm.logger import init_logger
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
- make_async)
-
-if ray is not None:
- from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
-
-if TYPE_CHECKING:
- from ray.util.placement_group import PlacementGroup
-
-logger = init_logger(__name__)
-
-
-class RayTPUExecutor(TPUExecutor):
-
- uses_ray: bool = True
-
- def __init__(self, *args, **kwargs):
- # This is non-None when the execute model loop is running
- # in the parallel workers. It's a coroutine in the AsyncLLMEngine case.
- self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None
- # Updated by implementations that require additional args to be passed
- # to the _run_workers execute_model call
- self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {}
-
- super().__init__(*args, **kwargs)
-
- def _init_executor(self) -> None:
- assert self.parallel_config.distributed_executor_backend == "ray"
- placement_group = self.parallel_config.placement_group
-
- # Disable Ray usage stats collection.
- ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
- if ray_usage != "1":
- os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
-
- # Create the parallel TPU workers.
- self._init_workers_ray(placement_group)
-
- def _init_workers_ray(self, placement_group: "PlacementGroup",
- **ray_remote_kwargs):
- # The driver dummy worker does not actually use any resources.
- # It holds the resource for the driver worker.
- self.driver_dummy_worker: Optional[RayWorkerWrapper] = None
- # The remaining workers are the actual ray actors.
- self.workers: List[RayWorkerWrapper] = []
-
- # Create the workers.
- driver_ip = get_ip()
- for bundle_id, bundle in enumerate(placement_group.bundle_specs):
- if not bundle.get("TPU", 0):
- continue
- scheduling_strategy = PlacementGroupSchedulingStrategy(
- placement_group=placement_group,
- placement_group_capture_child_tasks=True,
- placement_group_bundle_index=bundle_id,
- )
-
- # GKE does not fetch environment information from metadata server
- # and instead sets these from within the Ray process. Therefore we
- # need to override the Ray environment variables manually.
- override_env = {}
- if "TPU_CHIPS_PER_HOST_BOUNDS" in os.environ:
- override_env.update({
- "TPU_CHIPS_PER_HOST_BOUNDS":
- os.environ["TPU_CHIPS_PER_HOST_BOUNDS"]
- })
- if "TPU_HOST_BOUNDS" in os.environ:
- override_env.update(
- {"TPU_HOST_BOUNDS": os.environ["TPU_HOST_BOUNDS"]})
-
- worker = ray.remote(
- num_cpus=0,
- resources={"TPU": 1},
- scheduling_strategy=scheduling_strategy,
- **ray_remote_kwargs,
- )(RayWorkerWrapper).remote(vllm_config=self.vllm_config)
- if override_env:
- worker.override_env_vars.remote(override_env)
-
- worker_ip = ray.get(worker.get_node_ip.remote())
- if worker_ip == driver_ip and self.driver_dummy_worker is None:
- # If the worker is on the same node as the driver, we use it
- # as the resource holder for the driver process.
- self.driver_dummy_worker = worker
- self.driver_worker = RayWorkerWrapper(
- vllm_config=self.vllm_config)
- else:
- # Else, added to the list of workers.
- self.workers.append(worker)
-
- logger.debug("workers: %s", self.workers)
- logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker)
- if self.driver_dummy_worker is None:
- raise ValueError(
- "Ray does not allocate any TPUs on the driver node. Consider "
- "adjusting the Ray placement group or running the driver on a "
- "TPU node.")
-
- worker_ips = [
- ray.get(worker.get_node_ip.remote()) # type: ignore[attr-defined]
- for worker in self.workers
- ]
- ip_counts: Dict[str, int] = {}
- for ip in worker_ips:
- ip_counts[ip] = ip_counts.get(ip, 0) + 1
-
- def sort_by_driver_then_worker_ip(worker):
- """
- Sort the workers based on 3 properties:
- 1. If the worker is on the same node as the driver (vllm engine),
- it should be placed first.
- 2. Then, if the worker is on a node with fewer workers, it should
- be placed first.
- 3. Finally, if the work is on a node with smaller IP address, it
- should be placed first.
- """
- ip = ray.get(worker.get_node_ip.remote())
- return (ip != driver_ip, ip_counts[ip], ip)
-
- # After sorting, the workers on the same node will be
- # close to each other, and the workers on the driver
- # node will be placed first.
- self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip)
-
- # Get the set of TPU IDs used on each node.
- worker_node_and_gpu_ids = []
- for worker in [self.driver_dummy_worker] + self.workers:
- if worker is None:
- # driver_dummy_worker can be None when using ray spmd worker.
- continue
- worker_node_and_gpu_ids.append(
- ray.get(worker.get_node_and_gpu_ids.remote()) \
- ) # type: ignore
-
- node_workers = defaultdict(list)
- for i, (node_id, _) in enumerate(worker_node_and_gpu_ids):
- node_workers[node_id].append(i)
-
- # Set environment variables for the driver and workers.
- all_args_to_update_environment_variables = [({
- "VLLM_TRACE_FUNCTION":
- str(envs.VLLM_TRACE_FUNCTION),
- }, ) for _ in worker_node_and_gpu_ids]
- self._run_workers("update_environment_variables",
- all_args=all_args_to_update_environment_variables)
-
- if len(node_workers) == 1:
- # in single node case, we don't need to get the IP address.
- # the loopback address is sufficient
- # NOTE: a node may have several IP addresses, one for each
- # network interface. `get_ip()` might return any of them,
- # while they might not work for communication inside the node
- # if the network setup is complicated. Using the loopback address
- # solves this issue, as it always works for communication inside
- # the node.
- driver_ip = "127.0.0.1"
- distributed_init_method = get_distributed_init_method(
- driver_ip, get_open_port())
-
- # Initialize the actual workers inside worker wrapper.
- init_worker_all_kwargs = [
- self._get_worker_kwargs(
- local_rank=node_workers[node_id].index(rank),
- rank=rank,
- distributed_init_method=distributed_init_method,
- ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids)
- ]
- self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
-
- self._run_workers("init_device")
- self._run_workers("load_model",
- max_concurrent_workers=self.parallel_config.
- max_parallel_loading_workers)
-
- def _driver_execute_model(
- self,
- execute_model_req: Optional[ExecuteModelRequest] = None
- ) -> List[SamplerOutput]:
- """Run execute_model in the driver worker.
-
- Passing None will cause the driver to stop the model execution
- loop running in each of the remote workers.
- """
- return self.driver_worker.execute_method("execute_model",
- execute_model_req)
-
- def _run_workers(
- self,
- method: str,
- *args,
- async_run_remote_workers_only: bool = False,
- all_args: Optional[List[Tuple[Any, ...]]] = None,
- all_kwargs: Optional[List[Dict[str, Any]]] = None,
- max_concurrent_workers: Optional[int] = None,
- use_ray_compiled_dag: bool = False,
- **kwargs,
- ) -> Any:
- """Runs the given method on all workers. Can be used in the following
- ways:
-
- - async_run_remote_workers_only: If True the method will be run only
- in the remote workers, not the driver worker. It will also be
- run asynchronously and return a list of futures rather than blocking
- on the results.
- - args/kwargs: All workers share the same args/kwargs
- - all_args/all_kwargs: args/kwargs for each worker are specified
- individually
- """
-
- if max_concurrent_workers:
- raise NotImplementedError(
- "max_concurrent_workers is not supported yet.")
-
- count = len(self.workers)
- all_worker_args = repeat(args, count) if all_args is None \
- else islice(all_args, 1, None)
- all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
- else islice(all_kwargs, 1, None)
-
- # Start the ray workers first.
- ray_worker_outputs = [
- worker.execute_method.remote(method, *worker_args, **worker_kwargs)
- for (worker, worker_args, worker_kwargs
- ) in zip(self.workers, all_worker_args, all_worker_kwargs)
- ]
-
- if async_run_remote_workers_only:
- # Just return futures
- return ray_worker_outputs
-
- driver_args = args if all_args is None else all_args[0]
- driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]
-
- # Start the driver worker after all the ray workers.
- driver_worker_output = self.driver_worker.execute_method(
- method, *driver_args, **driver_kwargs)
- # Get the results of the ray workers.
- if self.workers:
- ray_worker_outputs = ray.get(ray_worker_outputs)
-
- return [driver_worker_output] + ray_worker_outputs
-
- def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
- """Wait for futures returned from _run_workers() with
- async_run_remote_workers_only to complete."""
- ray.get(parallel_worker_tasks)
-
- def determine_num_available_blocks(self) -> Tuple[int, int]:
- num_blocks = self._run_workers("determine_num_available_blocks", )
- num_tpu_blocks = min(b[0] for b in num_blocks)
- num_cpu_blocks = min(b[1] for b in num_blocks)
- return num_tpu_blocks, num_cpu_blocks
-
- def initialize_cache(self, num_gpu_blocks: int,
- num_cpu_blocks: int) -> None:
- logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
- num_cpu_blocks)
- self.cache_config.num_gpu_blocks = num_gpu_blocks
- self.cache_config.num_cpu_blocks = num_cpu_blocks
- self._run_workers("initialize_cache",
- num_gpu_blocks=num_gpu_blocks,
- num_cpu_blocks=num_cpu_blocks)
-
- def execute_model(
- self,
- execute_model_req: ExecuteModelRequest,
- ) -> List[SamplerOutput]:
- if self.parallel_worker_tasks is None:
- self.parallel_worker_tasks = self._run_workers(
- "start_worker_execution_loop",
- async_run_remote_workers_only=True,
- **self.extra_execute_model_run_workers_kwargs)
-
- # Only the driver worker returns the sampling results.
- return self._driver_execute_model(execute_model_req)
-
- def stop_remote_worker_execution_loop(self) -> None:
- if self.parallel_worker_tasks is None:
- return
-
- self._driver_execute_model()
- parallel_worker_tasks = self.parallel_worker_tasks
- self.parallel_worker_tasks = None
- # Ensure that workers exit model loop cleanly
- # (this will raise otherwise)
- self._wait_for_tasks_completion(parallel_worker_tasks)
-
-
-class RayTPUExecutorAsync(RayTPUExecutor, ExecutorAsyncBase):
-
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
- self.driver_exec_method = make_async(self.driver_worker.execute_method)
-
- async def execute_model_async(
- self,
- execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
- if self.parallel_worker_tasks is None:
- # Start model execution loop running in the parallel workers
- self.parallel_worker_tasks = asyncio.create_task(
- self._start_worker_execution_loop())
-
- # Only the driver worker returns the sampling results.
- return await self._driver_execute_model_async(execute_model_req)
-
- async def stop_remote_worker_execution_loop_async(self) -> None:
- if self.parallel_worker_tasks is None:
- return
-
- await self._driver_execute_model_async()
- parallel_worker_tasks = self.parallel_worker_tasks
- self.parallel_worker_tasks = None
- # Ensure that workers exit model loop cleanly
- # (this will raise otherwise)
- await parallel_worker_tasks
-
- async def _driver_execute_model_async(
- self,
- execute_model_req: Optional[ExecuteModelRequest] = None
- ) -> List[SamplerOutput]:
- return await self.driver_exec_method("execute_model",
- execute_model_req)
-
- async def _start_worker_execution_loop(self):
- coros = [
- worker.execute_method.remote("start_worker_execution_loop")
- for worker in self.workers
- ]
- return await asyncio.gather(*coros)
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 9f40f6a65dcd7..e55155ea06225 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -1,7 +1,7 @@
import os
import time
from collections import defaultdict
-from typing import Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
import msgspec
@@ -13,6 +13,10 @@
from vllm.utils import get_ip
from vllm.worker.worker_base import WorkerWrapperBase
+if TYPE_CHECKING:
+ from vllm.v1.core.scheduler import SchedulerOutput
+ from vllm.v1.outputs import ModelRunnerOutput
+
logger = init_logger(__name__)
PG_WAIT_TIMEOUT = 1800
@@ -95,6 +99,26 @@ def execute_model_spmd(
return output
+ def setup_device_if_necessary(self):
+ # TODO(swang): This is needed right now because Ray CG executes
+ # on a background thread, so we need to reset torch's current
+ # device.
+ # We can remove this API after it is fixed in compiled graph.
+ import torch
+ assert self.worker is not None, "Worker is not initialized"
+ if not self.compiled_dag_cuda_device_set:
+ torch.cuda.set_device(self.worker.device)
+ self.compiled_dag_cuda_device_set = True
+
+ def execute_model(
+ self,
+ scheduler_output: "SchedulerOutput",
+ ) -> "ModelRunnerOutput":
+ self.setup_device_if_necessary()
+ assert self.worker is not None, "Worker is not initialized"
+ output = self.worker.model_runner.execute_model(scheduler_output)
+ return output
+
def override_env_vars(self, vars: Dict[str, str]):
os.environ.update(vars)
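
The new `execute_model` path on `RayWorkerWrapper` sets the torch device lazily because compiled-graph execution runs on a background thread. A minimal sketch of that set-once guard, with a fake setter standing in for `torch.cuda.set_device` (the stand-in is assumed purely for illustration):

class LazyDeviceWorker:
    """Sets the device at most once, just before the first execution."""

    def __init__(self, device: int) -> None:
        self.device = device
        self._device_set = False

    def _set_device(self, device: int) -> None:
        # Stand-in for torch.cuda.set_device(device).
        print(f"setting current device to {device}")

    def setup_device_if_necessary(self) -> None:
        if not self._device_set:
            self._set_device(self.device)
            self._device_set = True

    def execute_model(self, scheduler_output: dict) -> dict:
        self.setup_device_if_necessary()
        return {"echo": scheduler_output}


if __name__ == "__main__":
    w = LazyDeviceWorker(device=0)
    w.execute_model({"step": 1})   # sets the device once
    w.execute_model({"step": 2})   # no device call the second time
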
diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py
deleted file mode 100644
index d2086f5fef26c..0000000000000
--- a/vllm/executor/ray_xpu_executor.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import asyncio
-from typing import List, Optional
-
-import ray
-
-import vllm.envs as envs
-from vllm.executor.ray_gpu_executor import RayGPUExecutor, RayGPUExecutorAsync
-from vllm.executor.xpu_executor import XPUExecutor
-from vllm.logger import init_logger
-from vllm.utils import make_async
-
-logger = init_logger(__name__)
-
-
-class RayXPUExecutor(RayGPUExecutor, XPUExecutor):
-
- def _get_env_vars_to_be_updated(self):
- # Get the set of GPU IDs used on each node.
- worker_node_and_gpu_ids = []
- for worker in [self.driver_dummy_worker] + self.workers:
- if worker is None:
- # driver_dummy_worker can be None when using ray spmd worker.
- continue
- worker_node_and_gpu_ids.append(
- ray.get(worker.get_node_and_gpu_ids.remote())) # type: ignore
-
- # Set environment variables for the driver and workers.
- all_args_to_update_environment_variables = [({
- "VLLM_TRACE_FUNCTION":
- str(envs.VLLM_TRACE_FUNCTION),
- }, ) for (_, _) in worker_node_and_gpu_ids]
- return all_args_to_update_environment_variables
-
-
-class RayXPUExecutorAsync(RayXPUExecutor, RayGPUExecutorAsync):
-
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
- self.driver_exec_method = make_async(self.driver_worker.execute_method)
- self.pp_locks: Optional[List[asyncio.Lock]] = None
diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py
deleted file mode 100644
index e37e8973790db..0000000000000
--- a/vllm/executor/tpu_executor.py
+++ /dev/null
@@ -1,142 +0,0 @@
-from typing import Any, Dict, List, Optional, Set, Tuple
-
-import torch
-
-from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
- make_async)
-
-logger = init_logger(__name__)
-
-
-class TPUExecutor(ExecutorBase):
-
- uses_ray: bool = False
-
- def _init_executor(self) -> None:
- assert not self.scheduler_config.chunked_prefill_enabled, (
- "Chunked prefill is not yet supported for TPU backend")
- assert not self.speculative_config, (
- "Speculative decoding is not yet supported for TPU backend")
- if self.model_config.dtype in (torch.float16, torch.float32):
- logger.warning(
- "The TPU backend currently does not support %s. "
- "Using bfloat16 instead.", self.model_config.dtype)
- self.model_config.dtype = torch.bfloat16
-
- # Instantiate the worker and load the model to the device.
- self.driver_worker = self._create_worker()
- self.driver_worker.init_device()
- self.driver_worker.load_model()
-
- def _get_worker_kwargs(
- self,
- local_rank: int = 0,
- rank: int = 0,
- distributed_init_method: Optional[str] = None,
- ) -> Dict[str, Any]:
- """Return worker init args for a given rank."""
- if distributed_init_method is None:
- distributed_init_method = get_distributed_init_method(
- get_ip(), get_open_port())
- return dict(
- vllm_config=self.vllm_config,
- local_rank=local_rank,
- rank=rank,
- distributed_init_method=distributed_init_method,
- is_driver_worker=rank == 0,
- )
-
- def _create_worker(
- self,
- local_rank: int = 0,
- rank: int = 0,
- distributed_init_method: Optional[str] = None,
- ):
- if self.scheduler_config.is_multi_step:
- from vllm.worker.multi_step_tpu_worker import MultiStepTPUWorker
- worker = MultiStepTPUWorker(**self._get_worker_kwargs(
- local_rank, rank, distributed_init_method))
- return worker
- else:
- from vllm.worker.tpu_worker import TPUWorker
-
- worker = TPUWorker(**self._get_worker_kwargs(
- local_rank, rank, distributed_init_method))
- return worker
-
- def initialize_cache(
- self,
- num_gpu_blocks: int,
- num_cpu_blocks: int,
- ) -> None:
- """Initialize the KV cache by invoking the underlying worker."""
- # NOTE: This is logged in the executor because there can be >1 worker
- # with other executors. We could log in the engine level, but work
- # remains to abstract away the device for non-GPU configurations.
- logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
- num_cpu_blocks)
- self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
-
- def determine_num_available_blocks(self) -> Tuple[int, int]:
- """Determine the number of available KV blocks by invoking the
- underlying worker."""
- return self.driver_worker.determine_num_available_blocks()
-
- def execute_model(
- self,
- execute_model_req: ExecuteModelRequest,
- ) -> List[SamplerOutput]:
- output = self.driver_worker.execute_model(execute_model_req)
- return output
-
- def add_lora(self, lora_request: LoRARequest) -> bool:
- raise NotImplementedError(
- "LoRA is currently not supported by the TPU backend.")
-
- def remove_lora(self, lora_id: int) -> bool:
- raise NotImplementedError(
- "LoRA is currently not supported by the TPU backend.")
-
- def pin_lora(self, lora_id: int) -> bool:
- raise NotImplementedError(
- "LoRA is currently not supported by the TPU backend.")
-
- def list_loras(self) -> Set[int]:
- raise NotImplementedError(
- "LoRA is currently not supported by the TPU backend.")
-
- def add_prompt_adapter(self, prompt_adapter_request) -> bool:
- raise NotImplementedError(
- "Soft prompt is currently not supported by the TPU backend.")
-
- def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- raise NotImplementedError(
- "Soft prompt is currently not supported by the TPU backend.")
-
- def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- raise NotImplementedError(
- "Soft prompt is currently not supported by the TPU backend.")
-
- def list_prompt_adapters(self) -> Set[int]:
- raise NotImplementedError(
- "Soft prompt is currently not supported by the TPU backend.")
-
- def check_health(self) -> None:
- # TPUExecutor will always be healthy as long as it's running.
- return
-
-
-class TPUExecutorAsync(TPUExecutor, ExecutorAsyncBase):
-
- async def execute_model_async(
- self,
- sexecute_model_req: ExecuteModelRequest,
- ) -> SamplerOutput:
- output = await make_async(self.driver_worker.execute_model
- )(sexecute_model_req)
- return output
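
The deleted TPU executors wrapped the driver's blocking `execute_model` with vLLM's `make_async` helper. A generic sketch of how such a helper can be built on `run_in_executor`; this is illustrative only and not vLLM's exact implementation:

import asyncio
import functools
from typing import Any, Awaitable, Callable


def make_async_sketch(func: Callable[..., Any]) -> Callable[..., Awaitable[Any]]:
    """Run a blocking callable in the default thread pool and await it."""

    async def _wrapper(*args: Any, **kwargs: Any) -> Any:
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, functools.partial(func, *args, **kwargs))

    return _wrapper


def blocking_execute(step: int) -> str:
    return f"executed step {step}"


if __name__ == "__main__":
    async_execute = make_async_sketch(blocking_execute)
    print(asyncio.run(async_execute(step=1)))
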
diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py
new file mode 100644
index 0000000000000..da1d77343cf3b
--- /dev/null
+++ b/vllm/executor/uniproc_executor.py
@@ -0,0 +1,57 @@
+from typing import Any, Dict, List, Optional, Tuple
+
+from vllm.executor.executor_base import ExecutorBase
+from vllm.logger import init_logger
+from vllm.utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.worker.worker_base import WorkerWrapperBase
+
+logger = init_logger(__name__)
+
+
+class UniProcExecutor(ExecutorBase):
+
+ uses_ray: bool = False
+
+ def _init_executor(self) -> None:
+ """Initialize the worker and load the model.
+ """
+ self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config,
+ rank=0)
+ distributed_init_method = get_distributed_init_method(
+ get_ip(), get_open_port())
+ local_rank = 0
+ rank = 0
+ kwargs = dict(
+ vllm_config=self.vllm_config,
+ local_rank=local_rank,
+ rank=rank,
+ distributed_init_method=distributed_init_method,
+ is_driver_worker=(not self.parallel_config)
+ or (rank % self.parallel_config.tensor_parallel_size == 0),
+ )
+ self.collective_rpc("init_worker", args=([kwargs], ))
+ self.collective_rpc("init_device")
+ self.collective_rpc("load_model")
+
+ def collective_rpc(self,
+ method: str,
+ timeout: Optional[float] = None,
+ args: Tuple = (),
+ kwargs: Optional[Dict] = None) -> List[Any]:
+ if kwargs is None:
+ kwargs = {}
+ try:
+ func = getattr(self.driver_worker, method)
+ except AttributeError:
+ raise NotImplementedError(f"Method {method} is not implemented.") \
+ from None
+ answer = func(*args, **kwargs)
+ return [answer]
+
+ def check_health(self) -> None:
+ # UniProcExecutor will always be healthy as long as
+ # it's running.
+ return
+
+
+UniProcExecutorAsync = UniProcExecutor
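
`UniProcExecutor.collective_rpc` above dispatches by attribute lookup on the single in-process worker and always returns a one-element list. A toy driver worker showing the same dispatch shape (the worker class here is invented for the example):

from typing import Any, Dict, List, Optional, Tuple


class ToyWorker:
    def init_device(self) -> str:
        return "device ready"

    def load_model(self, name: str = "dummy") -> str:
        return f"loaded {name}"


class ToyUniProcExecutor:
    def __init__(self) -> None:
        self.driver_worker = ToyWorker()

    def collective_rpc(self,
                       method: str,
                       timeout: Optional[float] = None,
                       args: Tuple = (),
                       kwargs: Optional[Dict] = None) -> List[Any]:
        kwargs = kwargs or {}
        func = getattr(self.driver_worker, method)  # AttributeError if missing
        return [func(*args, **kwargs)]


if __name__ == "__main__":
    ex = ToyUniProcExecutor()
    print(ex.collective_rpc("init_device"))
    print(ex.collective_rpc("load_model", kwargs={"name": "opt-125m"}))
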
diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py
deleted file mode 100644
index 722b86a95ff8a..0000000000000
--- a/vllm/executor/xpu_executor.py
+++ /dev/null
@@ -1,39 +0,0 @@
-from typing import List, Optional, Union
-
-from vllm.executor.executor_base import ExecutorAsyncBase
-from vllm.executor.gpu_executor import GPUExecutor
-from vllm.logger import init_logger
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.sequence import ExecuteModelRequest, PoolerOutput
-from vllm.utils import make_async
-
-logger = init_logger(__name__)
-
-
-class XPUExecutor(GPUExecutor):
-
- uses_ray: bool = False
-
- def _init_executor(self) -> None:
- assert self.device_config.device_type == "xpu"
- assert self.speculative_config is None, (
- "Speculative decoding not yet supported for XPU backend")
-
- GPUExecutor._init_executor(self)
-
- def execute_model(
- self, execute_model_req: ExecuteModelRequest
- ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
- output = self.driver_worker.execute_model(execute_model_req)
- return output
-
-
-class XPUExecutorAsync(XPUExecutor, ExecutorAsyncBase):
-
- async def execute_model_async(
- self,
- execute_model_req: ExecuteModelRequest,
- ) -> List[SamplerOutput]:
- output = await make_async(self.driver_worker.execute_model
- )(execute_model_req=execute_model_req)
- return output
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 4d3b84fea887f..74948202cbe48 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -1,3 +1,4 @@
+import os
from typing import TYPE_CHECKING, Optional
import psutil
@@ -105,6 +106,32 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
else:
parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker"
+ assert vllm_config.device_config.device_type == "cpu"
+
+ #
+ # Environment variables for CPU executor
+ #
+
+ # Disable torch async compiling which won't work with daemonic processes
+ os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
+
+        # Intel OpenMP settings
+        ld_preload_str = os.getenv("LD_PRELOAD", "")
+        if "libiomp5.so" in ld_preload_str:
+ # The time(milliseconds) that a thread should wait after
+ # completing the execution of a parallel region, before sleeping.
+ os.environ['KMP_BLOCKTIME'] = "1"
+            # Prevent the CPU from dropping into a low-performance state
+ os.environ['KMP_TPAUSE'] = "0"
+ # Provides fine granularity parallelism
+ os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist"
+ os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist"
+ os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"
+
+        # Hint IPEX to use shared-memory-based AllReduce
+ os.environ["LOCAL_WORLD_SIZE"] = str(
+ vllm_config.parallel_config.tensor_parallel_size)
+
@classmethod
def is_pin_memory_available(cls) -> bool:
logger.warning("Pin memory is not supported on CPU.")
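
The CPU platform hook above pins several Intel OpenMP knobs when `libiomp5.so` is preloaded. A standalone sketch of the same idea; using `os.environ.setdefault` is an editorial variation (the patch assigns unconditionally) that would preserve values the user already exported:

import os

KMP_DEFAULTS = {
    # Wait time (ms) after a parallel region before a thread sleeps.
    "KMP_BLOCKTIME": "1",
    # Keep the CPU from dropping into a low-performance state.
    "KMP_TPAUSE": "0",
    # Distributed barrier patterns for finer-grained parallelism.
    "KMP_FORKJOIN_BARRIER_PATTERN": "dist,dist",
    "KMP_PLAIN_BARRIER_PATTERN": "dist,dist",
    "KMP_REDUCTION_BARRIER_PATTERN": "dist,dist",
}


def apply_kmp_defaults() -> None:
    if "libiomp5.so" not in os.getenv("LD_PRELOAD", ""):
        return
    for key, value in KMP_DEFAULTS.items():
        # setdefault keeps anything the user already exported.
        os.environ.setdefault(key, value)


if __name__ == "__main__":
    apply_kmp_defaults()
    print({k: os.environ.get(k) for k in KMP_DEFAULTS})
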
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 2587e3a11dde3..8350177b68ade 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -139,6 +139,28 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
else:
parallel_config.worker_cls = "vllm.worker.worker.Worker"
+ world_size = parallel_config.world_size
+ tensor_parallel_size = parallel_config.tensor_parallel_size
+
+ from vllm.utils import (cuda_device_count_stateless,
+ update_environment_variables)
+
+ # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
+ if "CUDA_VISIBLE_DEVICES" not in os.environ:
+ update_environment_variables({
+ "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
+ })
+
+ cuda_device_count = cuda_device_count_stateless()
+        # Check tensor_parallel_size first so the more common TP-only
+        # misconfiguration gets a clearer error message.
+ assert tensor_parallel_size <= cuda_device_count, (
+ f"please set tensor_parallel_size ({tensor_parallel_size}) "
+            f"to at most the local gpu count ({cuda_device_count})")
+
+ assert world_size <= cuda_device_count, (
+ f"please ensure that world_size ({world_size}) "
+            f"is no greater than the local gpu count ({cuda_device_count})")
+
cache_config = vllm_config.cache_config
if cache_config and cache_config.block_size is None:
cache_config.block_size = 16
diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py
index 0696f73cc17b4..ead3dab05a6b1 100644
--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -35,6 +35,14 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
parallel_config.worker_cls = \
"vllm.worker.neuron_worker.NeuronWorker"
+ if parallel_config.world_size > 1:
+ parallel_config.distributed_executor_backend = "uni"
+
+ assert (vllm_config.lora_config is
+ None), "LoRA is not supported for Neuron backend."
+ assert (not vllm_config.speculative_config
+ ), "Speculative decoding not yet supported for Neuron backend."
+
cache_config = vllm_config.cache_config
if cache_config:
# neuron needs block_size = max_model_len
diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py
index 9390eda535c8f..7d414165a8188 100644
--- a/vllm/platforms/openvino.py
+++ b/vllm/platforms/openvino.py
@@ -66,9 +66,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
from vllm.utils import GiB_bytes
parallel_config = vllm_config.parallel_config
- assert (
- parallel_config.world_size == 1
- ), "OpenVINOExecutor only supports single CPU socket currently."
+ assert (parallel_config.world_size == 1
+                ), "OpenVINO only supports a single CPU socket currently."
if parallel_config.worker_cls == "auto":
parallel_config.worker_cls = \
@@ -141,3 +140,10 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
raise RuntimeError(
"Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
f" {kv_cache_space}, expect a positive integer value.")
+
+ assert vllm_config.device_config.device_type == "openvino"
+ assert vllm_config.lora_config is None, \
+ "OpenVINO backend doesn't support LoRA"
+ assert cls.is_openvino_cpu() or \
+ cls.is_openvino_gpu(), \
+ "OpenVINO backend supports only CPU and GPU devices"
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index ff9487daac7a7..05a3aa4305cfa 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -72,6 +72,16 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
assert vllm_config.speculative_config is None, \
"TPU does not support speculative decoding"
+ assert not vllm_config.scheduler_config.chunked_prefill_enabled, (
+ "Chunked prefill is not yet supported for TPU backend")
+ assert not vllm_config.speculative_config, (
+ "Speculative decoding is not yet supported for TPU backend")
+ if vllm_config.model_config.dtype in (torch.float16, torch.float32):
+ logger.warning(
+ "The TPU backend currently does not support %s. "
+ "Using bfloat16 instead.", vllm_config.model_config.dtype)
+ vllm_config.model_config.dtype = torch.bfloat16
+
parallel_config = vllm_config.parallel_config
scheduler_config = vllm_config.scheduler_config
if parallel_config.worker_cls == "auto":
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 031abdc05d517..c34b5b58672e7 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -78,17 +78,31 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
raise NotImplementedError(
"XPU does not support speculative decoding")
+ if vllm_config.device_config is not None:
+ assert vllm_config.device_config.device_type == "xpu"
+
# check and update parallel config
parallel_config = vllm_config.parallel_config
- if (parallel_config.distributed_executor_backend is not None
- and parallel_config.distributed_executor_backend != "ray"):
+ if parallel_config.worker_cls == "auto":
+ parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker"
+
+ if parallel_config.distributed_executor_backend is None:
+ parallel_config.distributed_executor_backend = "ray"
+ elif parallel_config.distributed_executor_backend == "mp":
+ # FIXME(kunshang):
+            # spawn requires the entry point to be guarded by
+            # `if __name__ == '__main__':`, and fork is not supported
+            # for starting new processes on XPU.
+            logger.error(
+                "Both start methods (spawn and fork) have issues "
+                "on XPU when using the mp backend; falling back to ray.")
+ parallel_config.distributed_executor_backend = "ray"
+
+ elif parallel_config.distributed_executor_backend != "ray":
logger.warning(
"%s is not supported on XPU, fallback to ray distributed"
" executor backend.",
parallel_config.distributed_executor_backend)
parallel_config.distributed_executor_backend = "ray"
- if parallel_config.worker_cls == "auto":
- parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker"
@classmethod
def is_pin_memory_available(cls):
diff --git a/vllm/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py
index 1ab691a7ef047..21a58fc426275 100644
--- a/vllm/spec_decode/medusa_worker.py
+++ b/vllm/spec_decode/medusa_worker.py
@@ -9,17 +9,15 @@
from vllm.spec_decode.interfaces import SpeculativeProposals
from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase
from vllm.spec_decode.top1_proposer import Top1Proposer
-from vllm.worker.worker_base import WorkerWrapperBase
+from vllm.worker.worker_base import DelegateWorkerBase
-class MedusaWorker(NonLLMProposerWorkerBase, WorkerWrapperBase):
+class MedusaWorker(NonLLMProposerWorkerBase, DelegateWorkerBase):
"""Worker for Medusa.
"""
def __init__(self, *args, **kwargs):
- super().__init__(kwargs.get("vllm_config"))
- self.init_worker(*args, **kwargs)
-
+ DelegateWorkerBase.__init__(self, *args, **kwargs)
# Lazy initialization list.
self._proposer: Top1Proposer
diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py
index 676ac5eb3609d..32197f8cc8f2f 100644
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -16,10 +16,10 @@
SpeculativeProposer)
from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase
from vllm.spec_decode.top1_proposer import Top1Proposer
-from vllm.worker.worker_base import WorkerWrapperBase
+from vllm.worker.worker_base import DelegateWorkerBase
-class MultiStepWorker(ProposerWorkerBase, WorkerWrapperBase):
+class MultiStepWorker(ProposerWorkerBase, DelegateWorkerBase):
"""The MultiStepWorker is equivalent to a Worker except that it allows
multiple forward passes in a single call, assuming the scheduler has
allocated enough space to store the additional KV. This reduces overhead
@@ -32,15 +32,12 @@ class MultiStepWorker(ProposerWorkerBase, WorkerWrapperBase):
"""
def __init__(self, *args, **kwargs):
- super().__init__(kwargs.get("vllm_config"))
- self.init_worker(*args, **kwargs)
-
+ DelegateWorkerBase.__init__(self, *args, **kwargs)
# Lazy initialization list.
self._proposer: SpeculativeProposer
def init_device(self) -> None:
self.worker.init_device()
-
self._proposer = Top1Proposer(
weakref.proxy(self), # type: ignore[arg-type]
self.device,
@@ -56,18 +53,6 @@ def set_should_modify_greedy_probs_inplace(self) -> None:
self.model_runner.model.sampler.should_modify_greedy_probs_inplace = (
True)
- def determine_num_available_blocks(self) -> Tuple[int, int]:
- return self.worker.determine_num_available_blocks()
-
- def get_cache_block_size_bytes(self) -> int:
- return self.worker.get_cache_block_size_bytes()
-
- def initialize_cache(self, *args, **kwargs) -> None:
- self.worker.initialize_cache(*args, **kwargs)
-
- def execute_model(self, *args, **kwargs) -> List[SamplerOutput]:
- return self.worker.execute_model(*args, **kwargs)
-
@torch.inference_mode()
def sampler_output(
self,
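
MedusaWorker and MultiStepWorker now inherit from `DelegateWorkerBase`, which is why the hand-written forwarding methods above can be dropped. A minimal sketch of the delegation idea; this illustrates the pattern only and is not vLLM's actual `DelegateWorkerBase`, which may forward explicitly:

from typing import Any


class InnerWorker:
    def determine_num_available_blocks(self) -> tuple:
        return (1024, 256)

    def execute_model(self, request: str) -> str:
        return f"ran {request}"


class DelegatingWorker:
    """Forwards any unknown attribute to the wrapped worker."""

    def __init__(self, worker: InnerWorker) -> None:
        self.worker = worker

    def __getattr__(self, name: str) -> Any:
        # Only called when normal lookup fails, so own attributes still win.
        return getattr(self.worker, name)


if __name__ == "__main__":
    w = DelegatingWorker(InnerWorker())
    print(w.determine_num_available_blocks())  # forwarded
    print(w.execute_model("req-0"))            # forwarded
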
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index e369da1a70c23..540d118d65ecb 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -40,8 +40,8 @@
get_all_num_logprobs,
get_sampled_token_logprobs, nvtx_range,
split_batch_by_proposal_len)
-from vllm.worker.worker_base import (LoraNotSupportedWorkerBase, WorkerBase,
- WorkerWrapperBase)
+from vllm.utils import resolve_obj_by_qualname
+from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase
logger = init_logger(__name__)
@@ -64,8 +64,9 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
target_worker_config = copy.deepcopy(vllm_config)
target_worker_config.parallel_config.worker_cls =\
target_worker_config.parallel_config.sd_worker_cls
- target_worker = WorkerWrapperBase(vllm_config=target_worker_config)
- target_worker.init_worker(*args, **kwargs)
+ cls = resolve_obj_by_qualname(
+ target_worker_config.parallel_config.worker_cls)
+ target_worker = cls(*args, **kwargs)
# Set the disable_logprobs variable in the TargetModelRunner instance
# as per its value specified in the SpeculativeConfig.
target_worker.model_runner.disable_logprobs =\
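
`create_spec_worker` now instantiates the target worker class directly from its qualified name via `resolve_obj_by_qualname`. A generic sketch of what such a resolver typically does (illustrative only, not vLLM's exact helper):

import importlib
from typing import Any


def resolve_by_qualname(qualname: str) -> Any:
    """Resolve 'package.module.ClassName' to the named object."""
    module_name, _, obj_name = qualname.rpartition(".")
    module = importlib.import_module(module_name)
    return getattr(module, obj_name)


if __name__ == "__main__":
    ordered_dict_cls = resolve_by_qualname("collections.OrderedDict")
    print(ordered_dict_cls(a=1, b=2))  # an OrderedDict with keys 'a' and 'b'
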
diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py
index 5d74d4b01f500..7c17f60510ae1 100644
--- a/vllm/v1/executor/abstract.py
+++ b/vllm/v1/executor/abstract.py
@@ -14,8 +14,9 @@ def get_class(vllm_config: VllmConfig) -> Type["Executor"]:
distributed_executor_backend = (
vllm_config.parallel_config.distributed_executor_backend)
if distributed_executor_backend == "ray":
- from vllm.v1.executor.ray_executor import RayExecutor
- executor_class = RayExecutor
+ from vllm.executor.ray_distributed_executor import ( # noqa
+ RayDistributedExecutor)
+ executor_class = RayDistributedExecutor
elif distributed_executor_backend == "mp":
from vllm.v1.executor.multiproc_executor import MultiprocExecutor
executor_class = MultiprocExecutor
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index 41e6abbd67956..cee0fcc0bad68 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -246,9 +246,18 @@ def __init__(
ready_path: str,
):
self.rank = rank
- wrapper = WorkerWrapperBase(vllm_config=vllm_config)
- wrapper.init_worker(vllm_config, local_rank, rank,
- distributed_init_method)
+ wrapper = WorkerWrapperBase(vllm_config=vllm_config, rank=rank)
+ # TODO: move `init_worker` to executor level as a collective rpc call
+ all_kwargs: List[Dict] = [
+ {} for _ in range(vllm_config.parallel_config.world_size)
+ ]
+ all_kwargs[rank] = {
+ "vllm_config": vllm_config,
+ "local_rank": local_rank,
+ "rank": rank,
+ "distributed_init_method": distributed_init_method,
+ }
+ wrapper.init_worker(all_kwargs)
self.worker = wrapper.worker
pid = os.getpid()
@@ -270,7 +279,7 @@ def __init__(
ready_socket.send_string(WorkerProc.READY_STR)
ready_socket.send(payload)
- self.worker.initialize()
+ self.worker.init_device()
self.worker.load_model()
@staticmethod
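
`WorkerProc.__init__` now builds a world-sized `all_kwargs` list and fills in only its own rank's slot before calling `init_worker`, mirroring a collective RPC where each rank consumes its own entry. A tiny sketch of that shape; the wrapper stub and the init address below are invented for the example:

from typing import Any, Dict, List


def build_all_kwargs(world_size: int, rank: int,
                     local_rank: int) -> List[Dict[str, Any]]:
    # One (possibly empty) kwargs dict per rank; only ours is populated.
    all_kwargs: List[Dict[str, Any]] = [{} for _ in range(world_size)]
    all_kwargs[rank] = {
        "local_rank": local_rank,
        "rank": rank,
        # Placeholder address, for illustration only.
        "distributed_init_method": "tcp://127.0.0.1:29500",
    }
    return all_kwargs


class StubWrapper:
    def __init__(self, rank: int) -> None:
        self.rank = rank

    def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> Dict[str, Any]:
        # Each rank picks out its own slot.
        return all_kwargs[self.rank]


if __name__ == "__main__":
    kwargs = build_all_kwargs(world_size=4, rank=2, local_rank=2)
    print(StubWrapper(rank=2).init_worker(kwargs))
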
diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py
index be058318de58b..a9adc0114f76d 100644
--- a/vllm/v1/executor/uniproc_executor.py
+++ b/vllm/v1/executor/uniproc_executor.py
@@ -27,7 +27,7 @@ def __init__(self, vllm_config: VllmConfig) -> None:
self.observability_config = vllm_config.observability_config
self.worker: Worker = self._create_worker()
- self.worker.initialize()
+ self.worker.init_device()
self.worker.load_model()
def _create_worker(
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index e83bce4283555..e6feaee972a35 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -33,6 +33,7 @@ def __init__(
local_rank: int,
rank: int,
distributed_init_method: str,
+ is_driver_worker: bool = False,
):
# TODO: use WorkerBase.__init__(self, vllm_config=vllm_config)
@@ -75,7 +76,7 @@ def __init__(
else:
self.profiler = None
- def initialize(self):
+ def init_device(self):
if self.device_config.device.type == "cuda":
# torch.distributed.all_reduce does not free the input tensor until
# the synchronization point. This causes the memory usage to grow
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index 8b2d8aaed2803..9401241073c7d 100644
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -2,6 +2,7 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
###############################################################################
+import contextlib
import gc
import os
from typing import List, Optional, Set, Tuple, Type
@@ -18,6 +19,7 @@
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.model_executor import set_random_seed
+from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sequence import ExecuteModelRequest
from vllm.utils import bind_kv_cache
@@ -124,6 +126,70 @@ def init_device(self) -> None:
def load_model(self):
self.model_runner.load_model()
+ def execute_model(
+ self,
+ execute_model_req: Optional[ExecuteModelRequest] = None,
+ ) -> Optional[List[SamplerOutput]]:
+ assert execute_model_req is not None
+ # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501
+ # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501
+ # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501
+ # VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL - will log cpu fallbacks per engine step, always, even if there were none # noqa:E501
+ log_graph_compilation_all = os.environ.get(
+ 'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0'
+ log_graph_compilation = os.environ.get(
+ 'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION',
+ '0') != '0' or log_graph_compilation_all
+ log_cpu_fallbacks_all = os.environ.get(
+ 'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0'
+ log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS',
+ '0') != '0' or log_cpu_fallbacks_all
+ if log_graph_compilation or log_cpu_fallbacks:
+ from habana_frameworks.torch.hpu.metrics import metric_localcontext
+ seq_group_metadata_list = execute_model_req.seq_group_metadata_list
+ is_prompt = any([
+ seq_group_metadata.is_prompt
+ for seq_group_metadata in seq_group_metadata_list
+ ])
+ max_context_len = max([
+ max([
+ len(v.prompt_token_ids) + len(v.output_token_ids)
+ for v in seq_group_metadata.seq_data.values()
+ ]) for seq_group_metadata in seq_group_metadata_list
+            ])  # longest prompt+output token count across all sequences in the batch
+ max_num_blocks = (
+ (max_context_len - 1) // self.cache_config.block_size) + 1
+ input_stats = (f'is_prompt: {is_prompt}, '
+ f'num_seqs: {len(seq_group_metadata_list)}, '
+ f'max_context_len: {max_context_len}, '
+ f'max_num_blocks {max_num_blocks}')
+ gc_ctx = metric_localcontext(
+ "graph_compilation"
+ ) if log_graph_compilation else contextlib.nullcontext()
+ cpu_fallback_ctx = metric_localcontext(
+ "cpu_fallback"
+ ) if log_cpu_fallbacks else contextlib.nullcontext()
+ with gc_ctx as gc_local_metric, \
+ cpu_fallback_ctx as cpu_fallback_local_metric:
+ output = LocalOrDistributedWorkerBase.execute_model(
+ self, execute_model_req)
+ if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0
+ ) or log_graph_compilation_all:
+ msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: "
+ f"{gc_local_metric.stats()}, {input_stats}")
+ logger.warning(msg)
+ if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] >
+ 0) or log_cpu_fallbacks_all:
+ msg = ("VLLM_HPU_STEP_CPU_FALLBACK: "
+ f"{cpu_fallback_local_metric.stats()}, {input_stats}")
+ logger.warning(msg)
+
+ return output
+
+ output = LocalOrDistributedWorkerBase.execute_model(
+ self, execute_model_req)
+ return output
+
@torch.inference_mode()
def determine_num_available_blocks(self) -> Tuple[int, int]:
"""Profiles the peak memory usage of the model to determine how many
diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
index 3f6269684ac93..e02c72faace70 100644
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -8,6 +8,7 @@
from vllm.distributed import (ensure_model_parallel_initialized,
init_distributed_environment)
from vllm.model_executor import set_random_seed
+from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import ExecuteModelRequest
from vllm.worker.neuron_model_runner import NeuronModelRunner
from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
@@ -25,6 +26,7 @@ def __init__(
local_rank: int,
rank: int,
distributed_init_method: str,
+ is_driver_worker: bool = True,
) -> None:
WorkerBase.__init__(self, vllm_config=vllm_config)
self.local_rank = local_rank
@@ -37,7 +39,22 @@ def __init__(
self.model_runner: NeuronModelRunner = NeuronModelRunner(
vllm_config=vllm_config)
- self.is_driver_worker = True
+ self.is_driver_worker = is_driver_worker
+
+ def execute_model(
+ self,
+ execute_model_req: Optional[ExecuteModelRequest] = None,
+ ) -> Optional[List[SamplerOutput]]:
+ assert execute_model_req is not None
+ assert (not execute_model_req.blocks_to_swap_in
+ and not execute_model_req.blocks_to_swap_out
+ and not execute_model_req.blocks_to_copy), (
+ "Cache operations are not supported for Neuron backend.")
+ assert execute_model_req.num_lookahead_slots == 0, (
+ "lookahead not supported for Neuron backend.")
+ output = LocalOrDistributedWorkerBase.execute_model(
+ self, execute_model_req)
+ return output
def init_device(self) -> None:
self.init_distributed_environment()
@@ -103,13 +120,14 @@ def get_cache_block_size_bytes(self) -> int:
def init_distributed_environment(self):
"""Neuron uses transformers-neuronx for tensor parallelism.
-
- vLLM still needs the environment inited when TP/PP > 1
+ It has only one process to control multiple devices.
+ vLLM still needs the environment initialized when TP/PP > 1,
+ so we initialize a distributed environment with one process.
"""
init_distributed_environment(
world_size=1,
- rank=self.rank,
- local_rank=self.local_rank,
+ rank=0,
+ local_rank=0,
distributed_init_method=self.distributed_init_method,
backend="gloo",
)
diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py
index 3482073566215..50a155d22c666 100644
--- a/vllm/worker/openvino_worker.py
+++ b/vllm/worker/openvino_worker.py
@@ -211,16 +211,14 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase):
def __init__(
self,
- ov_core: ov.Core,
vllm_config: VllmConfig,
local_rank: int,
rank: int,
distributed_init_method: str,
- kv_cache_dtype: Optional[ov.Type] = ov.Type.undefined,
is_driver_worker: bool = False,
) -> None:
- self.ov_core = ov_core
WorkerBase.__init__(self, vllm_config)
+ self.ov_core = ov.Core()
self.parallel_config.rank = rank
self.local_rank = local_rank
self.rank = rank
@@ -237,7 +235,7 @@ def __init__(
self.model_runner = OpenVINOModelRunner(
self.ov_core,
vllm_config=self.vllm_config,
- kv_cache_dtype=kv_cache_dtype,
+ kv_cache_dtype=self.vllm_config.cache_config.cache_dtype,
is_driver_worker=is_driver_worker,
)
# Uninitialized cache engine. Will be initialized by
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index a835718e1db19..7c14b8344b49e 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -88,7 +88,6 @@ def start_worker_execution_loop(self) -> None:
if output is None:
return None
- @abstractmethod
def execute_model(
self,
execute_model_req: Optional[ExecuteModelRequest] = None
@@ -119,6 +118,58 @@ def list_loras(self) -> Set[int]:
raise NotImplementedError
+class DelegateWorkerBase(WorkerBase):
+ """
+ A class that delegates all methods to another WorkerBase instance. This is
+ useful for creating a WorkerBase that wraps another WorkerBase instance,
+ e.g. speculative decoding.
+ """
+ worker: WorkerBase
+
+ def __init__(
+ self,
+ *args,
+ **kwargs,
+ ) -> None:
+ vllm_config: VllmConfig = kwargs.get("vllm_config")
+ cls = resolve_obj_by_qualname(vllm_config.parallel_config.worker_cls)
+ self.worker = cls(*args, **kwargs)
+
+ def init_device(self) -> None:
+ self.worker.init_device()
+
+ def determine_num_available_blocks(self) -> Tuple[int, int]:
+ return self.worker.determine_num_available_blocks()
+
+ def initialize_cache(self, num_gpu_blocks: int,
+ num_cpu_blocks: int) -> None:
+ self.worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
+
+ def execute_model(
+ self,
+ execute_model_req: Optional[ExecuteModelRequest] = None
+ ) -> Optional[List[SamplerOutput]]:
+ return self.worker.execute_model(execute_model_req)
+
+ def get_cache_block_size_bytes(self) -> int:
+ return self.worker.get_cache_block_size_bytes()
+
+ def add_lora(self, lora_request: LoRARequest) -> bool:
+ return self.worker.add_lora(lora_request)
+
+ def remove_lora(self, lora_id: int) -> bool:
+ return self.worker.remove_lora(lora_id)
+
+ def pin_lora(self, lora_id: int) -> bool:
+ return self.worker.pin_lora(lora_id)
+
+ def list_loras(self) -> Set[int]:
+ return self.worker.list_loras()
+
+ def __getattr__(self, attr):
+ return getattr(self.worker, attr)
+
+
class LoraNotSupportedWorkerBase(WorkerBase):
"""Partial implementation of WorkerBase that raises exceptions when LoRA
methods are invoked.
@@ -419,17 +470,31 @@ class WorkerWrapperBase:
def __init__(
self,
vllm_config: VllmConfig,
+ rank: int = 0,
) -> None:
+ self.rank = rank
self.vllm_config = vllm_config
- trust_remote_code = vllm_config.model_config.trust_remote_code
self.worker: Optional[WorkerBase] = None
- if trust_remote_code:
- # note: lazy import to avoid importing torch before initializing
- from vllm.utils import init_cached_hf_modules
- init_cached_hf_modules()
+ if vllm_config.model_config is not None:
+ # it can be None in tests
+ trust_remote_code = vllm_config.model_config.trust_remote_code
+ if trust_remote_code:
+                # note: lazy import to avoid importing torch before it is needed
+ from vllm.utils import init_cached_hf_modules
+ init_cached_hf_modules()
+
+ def adjust_rank(self, rank_mapping: Dict[int, int]) -> None:
+ """
+ Adjust the rank based on the given mapping.
+ It is only used during the initialization of the executor,
+ to adjust the rank of workers after we create all workers.
+ """
+ if self.rank in rank_mapping:
+ self.rank = rank_mapping[self.rank]
- @staticmethod
- def update_environment_variables(envs: Dict[str, str]) -> None:
+ def update_environment_variables(self, envs_list: List[Dict[str,
+ str]]) -> None:
+ envs = envs_list[self.rank]
key = 'CUDA_VISIBLE_DEVICES'
if key in envs and key in os.environ:
# overwriting CUDA_VISIBLE_DEVICES is desired behavior
@@ -437,11 +502,12 @@ def update_environment_variables(envs: Dict[str, str]) -> None:
del os.environ[key]
update_environment_variables(envs)
- def init_worker(self, *args, **kwargs):
+ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None:
"""
Here we inject some common logic before initializing the worker.
Arguments are passed to the worker class constructor.
"""
+ kwargs = all_kwargs[self.rank]
enable_trace_function_call_for_thread(self.vllm_config)
# see https://github.com/NVIDIA/nccl/issues/1234
@@ -452,7 +518,7 @@ def init_worker(self, *args, **kwargs):
worker_class = resolve_obj_by_qualname(
self.vllm_config.parallel_config.worker_cls)
- self.worker = worker_class(*args, **kwargs)
+ self.worker = worker_class(**kwargs)
assert self.worker is not None
def execute_method(self, method: str, *args, **kwargs):
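For readers following the `WorkerWrapperBase` change in this patch: the executor now builds one list of constructor kwargs per rank, and each wrapper picks out only its own entry in `init_worker()`. A minimal sketch of that pattern is below; the helper name is invented for illustration and mirrors the multiproc executor hunk earlier in this patch.

```python
from typing import Any, Dict, List


def build_all_kwargs(world_size: int, rank: int, vllm_config, local_rank: int,
                     distributed_init_method: str) -> List[Dict[str, Any]]:
    """Hypothetical helper: one dict per rank, only this rank's slot populated."""
    all_kwargs: List[Dict[str, Any]] = [{} for _ in range(world_size)]
    # WorkerWrapperBase.init_worker() reads all_kwargs[self.rank].
    all_kwargs[rank] = {
        "vllm_config": vllm_config,
        "local_rank": local_rank,
        "rank": rank,
        "distributed_init_method": distributed_init_method,
    }
    return all_kwargs
```

The same list-indexed-by-rank shape is what the new `update_environment_variables(envs_list)` signature expects.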
From 3f9b7ab9f59f83ab0551a6a2f1894e30bc0cb41c Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Wed, 15 Jan 2025 01:36:01 -0500
Subject: [PATCH 20/67] [Doc] Update examples to remove
SparseAutoModelForCausalLM (#12062)
Signed-off-by: Kyle Sayers
---
docs/source/features/quantization/fp8.md | 11 +++++------
docs/source/features/quantization/int8.md | 7 +++----
2 files changed, 8 insertions(+), 10 deletions(-)
diff --git a/docs/source/features/quantization/fp8.md b/docs/source/features/quantization/fp8.md
index da49cd2747228..1398e8a324201 100644
--- a/docs/source/features/quantization/fp8.md
+++ b/docs/source/features/quantization/fp8.md
@@ -54,16 +54,15 @@ The quantization process involves three main steps:
### 1. Loading the Model
-Use `SparseAutoModelForCausalLM`, which wraps `AutoModelForCausalLM`, for saving and loading quantized models:
+Load your model and tokenizer using the standard `transformers` AutoModel classes:
```python
-from llmcompressor.transformers import SparseAutoModelForCausalLM
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-
-model = SparseAutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map="auto", torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(
+ MODEL_ID, device_map="auto", torch_dtype="auto",
+)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```
diff --git a/docs/source/features/quantization/int8.md b/docs/source/features/quantization/int8.md
index 82a15d76d352f..592a60d3988b2 100644
--- a/docs/source/features/quantization/int8.md
+++ b/docs/source/features/quantization/int8.md
@@ -30,14 +30,13 @@ The quantization process involves four main steps:
### 1. Loading the Model
-Use `SparseAutoModelForCausalLM`, which wraps `AutoModelForCausalLM`, for saving and loading quantized models:
+Load your model and tokenizer using the standard `transformers` AutoModel classes:
```python
-from llmcompressor.transformers import SparseAutoModelForCausalLM
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-model = SparseAutoModelForCausalLM.from_pretrained(
+model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
From 994fc655b71f59f61b82cc44e868091dae493a84 Mon Sep 17 00:00:00 2001
From: Chen Zhang
Date: Wed, 15 Jan 2025 15:55:30 +0800
Subject: [PATCH 21/67] [V1][Prefix Cache] Move the logic of
num_computed_tokens into KVCacheManager (#12003)
---
tests/v1/core/test_prefix_caching.py | 71 ++++++++++++++++++----------
vllm/v1/core/kv_cache_manager.py | 17 +++++--
vllm/v1/core/scheduler.py | 8 +---
3 files changed, 61 insertions(+), 35 deletions(-)
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index b97f55b8c6535..fafd9d0ce4455 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -49,9 +49,10 @@ def test_prefill():
unique_token_ids = [3] * 7
all_token_ids = common_token_ids + unique_token_ids
req0 = make_request("0", all_token_ids)
- computed_blocks = manager.get_computed_blocks(req0)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
assert len(req0.kv_block_hashes) == 3
assert not computed_blocks
+ assert num_computed_tokens == 0
blocks = manager.allocate_slots(req0, 55, computed_blocks)
assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
@@ -73,9 +74,10 @@ def test_prefill():
# Incomplete 1 block (5 tokens)
unique_token_ids = [3] * 5
req1 = make_request("1", common_token_ids + unique_token_ids)
- computed_blocks = manager.get_computed_blocks(req1)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
assert len(req1.kv_block_hashes) == 3
assert [b.block_id for b in computed_blocks] == [0, 1, 2]
+ assert num_computed_tokens == 3 * 16
num_new_tokens = 53 - 3 * 16
blocks = manager.allocate_slots(req1, num_new_tokens, computed_blocks)
assert [b.block_id for b in blocks] == [5, 6]
@@ -91,7 +93,7 @@ def test_prefill():
# All blocks should be available.
assert manager.free_block_queue.num_free_blocks == 10
# The order should be
- # [unallocated (7, 8)]
+ # [unallocated (7, 8, 9)]
# [unique_req0 (4, 3)]
# [unique_req1 (6, 5)]
# [common (2, 1, 0)]
@@ -103,9 +105,10 @@ def test_prefill():
# Incomplete 1 block (6 tokens)
unique_token_ids = [3] * 6
req2 = make_request("2", common_token_ids + unique_token_ids)
- computed_blocks = manager.get_computed_blocks(req2)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
assert len(req2.kv_block_hashes) == 3
assert [b.block_id for b in computed_blocks] == [0, 1, 2]
+ assert num_computed_tokens == 3 * 16
num_new_tokens = 53 - 3 * 16
blocks = manager.allocate_slots(req2, num_new_tokens, computed_blocks)
assert [b.block_id for b in blocks] == [7, 8]
@@ -123,8 +126,9 @@ def test_prefill():
# Cache miss and eviction.
req3 = make_request("3", [99] * (16 * 9))
- computed_blocks = manager.get_computed_blocks(req3)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
assert not computed_blocks
+ assert num_computed_tokens == 0
blocks = manager.allocate_slots(req3, 16 * 9, computed_blocks)
# This block ID order also checks the eviction order.
assert [b.block_id for b in blocks] == [9, 4, 3, 6, 5, 8, 7, 2, 1, 0]
@@ -150,8 +154,9 @@ def test_decode():
# Incomplete 1 block (7 tokens)
unique_token_ids = [3] * 7
req0 = make_request("0", common_token_ids + unique_token_ids)
- computed_blocks = manager.get_computed_blocks(req0)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
assert not computed_blocks
+ assert num_computed_tokens == 0
blocks = manager.allocate_slots(req0, 55, computed_blocks)
assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
@@ -197,16 +202,18 @@ def test_evict():
last_token_id = 5 * 16 + 7
req0 = make_request("0", list(range(last_token_id)))
- computed_blocks = manager.get_computed_blocks(req0)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
assert not computed_blocks
+ assert num_computed_tokens == 0
blocks = manager.allocate_slots(req0, 5 * 16 + 7, computed_blocks)
assert len(blocks) == 7 # 5 full + 1 partial + 1 preallocated
# 3 blocks.
req1 = make_request("1", list(range(last_token_id,
last_token_id + 3 * 16)))
- computed_blocks = manager.get_computed_blocks(req1)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
assert not computed_blocks
+ assert num_computed_tokens == 0
blocks = manager.allocate_slots(req1, 3 * 16, computed_blocks)
assert len(blocks) == 3 # 3 full blocks
last_token_id += 3 * 16
@@ -222,8 +229,9 @@ def test_evict():
# Touch the first 2 blocks.
req2 = make_request("2", list(range(2 * 16 + 3)))
- computed_blocks = manager.get_computed_blocks(req2)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
assert [b.block_id for b in computed_blocks] == [0, 1]
+ assert num_computed_tokens == 2 * 16
blocks = manager.allocate_slots(req2, 3, computed_blocks)
assert [b.block_id for b in blocks] == [6, 5]
assert manager.free_block_queue.num_free_blocks == 6
@@ -247,8 +255,9 @@ def test_hash_block_correct_reuse():
# Allocate 1 block and cache it.
num_tokens = block_size * 1
req = make_request("0", list(range(num_tokens)))
- computed_blocks = manager.get_computed_blocks(req)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
assert not computed_blocks
+ assert num_computed_tokens == 0
blocks = manager.allocate_slots(req, num_tokens, computed_blocks)
assert len(blocks) == 1
@@ -258,8 +267,9 @@ def test_hash_block_correct_reuse():
# Allocate a new block that's not full, make sure hash info on the
# block is cleared.
req = make_request("1", list(range(num_tokens - 1)))
- computed_blocks = manager.get_computed_blocks(req)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
assert not computed_blocks
+ assert num_computed_tokens == 0
blocks = manager.allocate_slots(req, num_tokens - 1, computed_blocks)
assert len(blocks) == 1
@@ -284,16 +294,18 @@ def test_computed_blocks_not_evicted():
# Allocate a block and cache it.
num_tokens = block_size * 1
req0 = make_request("0", list(range(num_tokens)))
- computed_blocks = manager.get_computed_blocks(req0)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
assert not computed_blocks
+ assert num_computed_tokens == 0
blocks = manager.allocate_slots(req0, num_tokens, computed_blocks)
assert len(blocks) == 1
assert blocks[0].block_id == 0
# Allocate another block.
req1 = make_request("1", list(range(num_tokens, num_tokens * 2)))
- computed_blocks = manager.get_computed_blocks(req1)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
assert not computed_blocks
+ assert num_computed_tokens == 0
blocks = manager.allocate_slots(req1, num_tokens, computed_blocks)
assert len(blocks) == 1
assert blocks[0].block_id == 1
@@ -305,9 +317,10 @@ def test_computed_blocks_not_evicted():
# Now if we have a cache hit on the first block, we should evict the second
# cached block rather than the first one.
req2 = make_request("2", list(range(num_tokens * 2)))
- computed_blocks = manager.get_computed_blocks(req2)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
assert len(computed_blocks) == 1
assert computed_blocks[0].block_id == 0
+ assert num_computed_tokens == block_size
blocks = manager.allocate_slots(req2, num_tokens * 2 - num_tokens,
computed_blocks)
@@ -331,8 +344,9 @@ def test_basic_prefix_caching_disabled():
req1 = make_request("1", list(range(10))) # 2 blocks and some more
- computed_blocks = manager.get_computed_blocks(req1)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
assert not computed_blocks
+ assert num_computed_tokens == 0
blocks = manager.allocate_slots(req1, 10, computed_blocks)
assert len(blocks) == 3
@@ -341,15 +355,17 @@ def test_basic_prefix_caching_disabled():
# No caching.
req2 = make_request("2", list(range(16))) # shared prefix
- computed_blocks = manager.get_computed_blocks(req2)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
assert not computed_blocks
+ assert num_computed_tokens == 0
blocks = manager.allocate_slots(req2, 16, computed_blocks)
assert len(blocks) == 4
# New requests should not have any blocks.
req3 = make_request("3", list(range(4)))
- computed_blocks = manager.get_computed_blocks(req3)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
assert not computed_blocks
+ assert num_computed_tokens == 0
blocks = manager.allocate_slots(req3, 4, computed_blocks)
assert not blocks
@@ -371,8 +387,9 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
num_preallocated_blocks = cdiv(num_preallocate_tokens, block_size)
req = make_request("0", list(range(block_size * 30)))
- computed_blocks = manager.get_computed_blocks(req)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
assert not computed_blocks
+ assert num_computed_tokens == 0
# Just ask for 1 block.
blocks = manager.allocate_slots(req, block_size, computed_blocks)
req.num_computed_tokens = block_size
@@ -469,10 +486,11 @@ def test_mm_prefix_caching():
all_token_ids,
mm_positions=mm_positions,
mm_hashes=mm_hashes)
- computed_blocks = manager.get_computed_blocks(req0)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
# Completed block should have hashes with extra keys.
assert not computed_blocks
+ assert num_computed_tokens == 0
assert len(req0.kv_block_hashes) == 3
assert req0.kv_block_hashes[0].extra_keys == ("aaa", )
assert req0.kv_block_hashes[1].extra_keys == ("aaa", "bbb")
@@ -503,8 +521,9 @@ def test_mm_prefix_caching():
all_token_ids,
mm_positions=mm_positions,
mm_hashes=mm_hashes)
- computed_blocks = manager.get_computed_blocks(req1)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
assert len(computed_blocks) == 3
+ assert num_computed_tokens == 3 * 16
def test_prefill_not_enough_free_blocks_with_computed_blocks():
@@ -527,15 +546,17 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
# | Common-0 | Common-1 | Common-2 | ... |
common_token_ids = [i for i in range(3) for _ in range(16)]
req0 = make_request("0", common_token_ids)
- computed_blocks = manager.get_computed_blocks(req0)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
assert not computed_blocks
+ assert num_computed_tokens == 0
manager.allocate_slots(req0, 48, computed_blocks)
block_part0 = manager.req_to_blocks[req0.request_id]
# | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... |
req1 = make_request("1", common_token_ids * 2)
- computed_blocks = manager.get_computed_blocks(req1)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
assert computed_blocks == block_part0
+ assert num_computed_tokens == 3 * 16
manager.allocate_slots(req1, 48, computed_blocks)
block_part1 = manager.req_to_blocks[req1.request_id]
# | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
@@ -547,8 +568,9 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
# | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
# | Req1-5(F)| Req2-0 | Req2-1 | ... |
req2 = make_request("2", [7] * block_size * 2)
- computed_blocks = manager.get_computed_blocks(req2)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
assert not computed_blocks
+ assert num_computed_tokens == 0
manager.allocate_slots(req2, block_size * 2, computed_blocks)
# Req3 is Req2 + 3 new blocks, so the first 6 blocks are computed,
@@ -556,8 +578,9 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
# In this case, the ref_cnt of the computed blocks should not be changed.
assert manager.free_block_queue.num_free_blocks == 5
req3 = make_request("3", common_token_ids * 3)
- computed_blocks = manager.get_computed_blocks(req3)
+ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
assert computed_blocks == block_part1
+ assert num_computed_tokens == 6 * 16
# Req3 cannot be allocated.
assert manager.allocate_slots(req3, 48, computed_blocks) is None
# Block 0-2 are used by Req 1.
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 1cbff1e2d767e..bac77443c8560 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -1,5 +1,5 @@
from collections import defaultdict
-from typing import Dict, Iterable, List, Optional
+from typing import Dict, Iterable, List, Optional, Tuple
from vllm.logger import init_logger
from vllm.utils import cdiv
@@ -69,7 +69,8 @@ def __init__(
# is finished.
self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {}
- def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]:
+ def get_computed_blocks(
+ self, request: Request) -> Tuple[List[KVCacheBlock], int]:
"""Get the computed (cached) blocks for the request.
Note that the computed blocks must be full.
@@ -77,11 +78,13 @@ def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]:
request: The request to get the computed blocks.
Returns:
- A list of blocks that are computed for the request.
+ A tuple containing:
+ - A list of blocks that are computed for the request.
+ - The number of computed tokens.
"""
if not self.enable_caching:
# Prefix caching is disabled.
- return []
+ return [], 0
computed_blocks = []
@@ -101,7 +104,11 @@ def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]:
else:
break
- return computed_blocks
+ # NOTE(woosuk): Since incomplete blocks are not eligible for
+ # sharing, `num_computed_tokens` is always a multiple of
+ # `block_size`.
+ num_computed_tokens = len(computed_blocks) * self.block_size
+ return computed_blocks, num_computed_tokens
def append_slots(
self,
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index 2503d136aea7e..45e67c94f8f15 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -184,12 +184,8 @@ def schedule(self) -> "SchedulerOutput":
request = self.waiting[0]
# Get already-cached tokens.
- computed_blocks = self.kv_cache_manager.get_computed_blocks(
- request)
- # NOTE(woosuk): Since incomplete blocks are not eligible for
- # sharing, `num_computed_tokens` is always a multiple of
- # `block_size`.
- num_computed_tokens = len(computed_blocks) * self.block_size
+ computed_blocks, num_computed_tokens = \
+ self.kv_cache_manager.get_computed_blocks(request)
# Number of tokens to be scheduled.
# We use `request.num_tokens` instead of
# `request.num_prompt_tokens` to consider the resumed requests,
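After this patch, callers receive the computed-token count together with the cached blocks instead of re-deriving it. A minimal sketch of the new call pattern, assuming a `KVCacheManager` instance and a `Request` object are constructed elsewhere:

```python
def schedule_prefill_sketch(manager, request):
    # The manager now returns both the cached blocks and the matching token count.
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(request)
    # Only full blocks are shareable, so the count is a multiple of the block size.
    assert num_computed_tokens % manager.block_size == 0
    num_new_tokens = request.num_tokens - num_computed_tokens
    return manager.allocate_slots(request, num_new_tokens, computed_blocks)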
From cbe94391eb04aa9ae1be15711fec4eb453c1e053 Mon Sep 17 00:00:00 2001
From: Rahul Tuli
Date: Wed, 15 Jan 2025 04:41:24 -0500
Subject: [PATCH 22/67] Fix: cases with empty sparsity config (#12057)
Signed-off-by: Rahul Tuli
---
.../quantization/compressed_tensors/compressed_tensors.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 0c1fc18228f5c..f0e4eda76734b 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -113,7 +113,7 @@ def _sparsity_scheme_map_from_config(
:return: A dictionary mapping target layer names to their corresponding
sparsity compression configurations
"""
- if (sparsity_config := config.get(SPARSITY_CONFIG_NAME)) is None:
+ if not (sparsity_config := config.get(SPARSITY_CONFIG_NAME)):
return dict()
sparsity_config = SparsityCompressionConfig.model_validate(
From ad388d25a8e668545ef91c3634b67a241155e2ea Mon Sep 17 00:00:00 2001
From: Keyun Tong
Date: Wed, 15 Jan 2025 01:44:56 -0800
Subject: [PATCH 23/67] Type-fix: make execute_model output type optional
(#12020)
---
vllm/v1/executor/uniproc_executor.py | 1 +
vllm/v1/worker/gpu_worker.py | 2 +-
2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py
index a9adc0114f76d..c63d7a4c47c15 100644
--- a/vllm/v1/executor/uniproc_executor.py
+++ b/vllm/v1/executor/uniproc_executor.py
@@ -70,6 +70,7 @@ def execute_model(
scheduler_output,
) -> ModelRunnerOutput:
output = self.worker.execute_model(scheduler_output)
+ assert output is not None
return output
def profile(self, is_start: bool = True):
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index e6feaee972a35..81b247e07ef4a 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -200,7 +200,7 @@ def compile_or_warm_up_model(self) -> None:
def execute_model(
self,
scheduler_output: "SchedulerOutput",
- ) -> ModelRunnerOutput:
+ ) -> Optional[ModelRunnerOutput]:
output = self.model_runner.execute_model(scheduler_output)
return output if self.rank == 0 else None
From 3adf0ffda8de31ff32f294324e53b6cfbf16f187 Mon Sep 17 00:00:00 2001
From: wangxiyuan
Date: Wed, 15 Jan 2025 18:14:15 +0800
Subject: [PATCH 24/67] [Platform] Do not raise error if _Backend is not found
(#12023)
Signed-off-by: wangxiyuan
Signed-off-by: Mengqing Cao
Co-authored-by: Mengqing Cao
---
tests/kernels/test_attention_selector.py | 11 +++++++---
.../dummy_attention_backend.py | 8 ++++++++
.../vllm_add_dummy_platform/dummy_platform.py | 4 ++++
tests/plugins_tests/test_platform_plugins.py | 14 +++++++++++++
vllm/attention/layer.py | 8 ++++----
vllm/attention/selector.py | 20 ++++++++++---------
6 files changed, 49 insertions(+), 16 deletions(-)
create mode 100644 tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py
index a08c874407e3f..492acb91e8ed9 100644
--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
@@ -94,7 +94,12 @@ def test_flash_attn(monkeypatch):
def test_invalid_env(monkeypatch):
- """Throw an exception if the backend name is invalid."""
+ """Ignore the invalid env variable if it is set."""
override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
- with pytest.raises(ValueError):
- get_attn_backend(16, torch.float16, None, 16, False)
+ with patch("vllm.attention.selector.current_platform", CudaPlatform()):
+ backend = get_attn_backend(32, torch.float16, None, 16, False)
+ assert backend.get_name() == "FLASH_ATTN"
+
+        # head_size 16 is unsupported by FlashAttention; fall back to XFORMERS
+ backend = get_attn_backend(16, torch.float16, None, 16, False)
+ assert backend.get_name() == "XFORMERS"
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
new file mode 100644
index 0000000000000..5634be3c8d882
--- /dev/null
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
@@ -0,0 +1,8 @@
+from vllm.attention.backends.flash_attn import FlashAttentionBackend
+
+
+class DummyAttentionBackend(FlashAttentionBackend):
+
+ @staticmethod
+ def get_name() -> str:
+ return "Dummy_Backend"
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
index fde93142f1103..84721d5971ccf 100644
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
@@ -3,3 +3,7 @@
class DummyPlatform(CudaPlatform):
device_name = "DummyDevice"
+
+ def get_attn_backend_cls(self, backend_name, head_size, dtype,
+ kv_cache_dtype, block_size, use_v1):
+ return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend" # noqa E501
diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py
index 69698b34c71a3..661aa5f649ab9 100644
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -1,3 +1,10 @@
+import torch
+
+from tests.kernels.utils import override_backend_env_variable
+from vllm.attention.selector import get_attn_backend
+from vllm.utils import STR_INVALID_VAL
+
+
def test_platform_plugins():
# simulate workload by running an example
import runpy
@@ -14,3 +21,10 @@ def test_platform_plugins():
f"Expected DummyDevice, got {current_platform.device_name}, "
"possibly because current_platform is imported before the plugin"
f" is loaded. The first import:\n{_init_trace}")
+
+
+def test_oot_attention_backend(monkeypatch):
+ # ignore the backend env variable if it is set
+ override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
+ backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
+ assert backend.get_name() == "Dummy_Backend"
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index a283e87d84070..9b03fd73fe690 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -190,11 +190,11 @@ def __init__(
kv_cache_dtype=None,
block_size=16,
is_attention_free=False)
- attn_backend = backend_name_to_enum(attn_backend.get_name())
- if attn_backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}:
- attn_backend = _Backend.XFORMERS
+ backend = backend_name_to_enum(attn_backend.get_name())
+ if backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}:
+ backend = _Backend.XFORMERS
- self.attn_backend = attn_backend if attn_backend in {
+ self.attn_backend = backend if backend in {
_Backend.TORCH_SDPA, _Backend.XFORMERS
} else _Backend.TORCH_SDPA
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 0ff007c87b1c9..81ea6eefb5410 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -14,16 +14,18 @@
logger = init_logger(__name__)
-def backend_name_to_enum(backend_name: str) -> _Backend:
- assert backend_name is not None
-
- backend_members = _Backend.__members__
- if backend_name not in backend_members:
- raise ValueError(f"Invalid attention backend '{backend_name}'. "
- f"Available backends: {', '.join(backend_members)} "
- "(case-sensitive).")
+def backend_name_to_enum(backend_name: str) -> Optional[_Backend]:
+ """
+ Convert a string backend name to a _Backend enum value.
- return _Backend[backend_name]
+ Returns:
+ * _Backend: enum value if backend_name is a valid in-tree type
+    * None: otherwise the backend name is not a valid in-tree type (e.g. an
+      out-of-tree platform is loaded).
+ """
+ assert backend_name is not None
+ return _Backend[backend_name] if backend_name in _Backend.__members__ else \
+ None
def get_env_variable_attn_backend() -> Optional[_Backend]:
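A quick illustration of the relaxed behaviour (a sketch, assuming a vLLM build that includes this patch): unknown backend names now map to `None` instead of raising, which lets out-of-tree platforms register their own backend classes.

```python
from vllm.attention.selector import backend_name_to_enum

assert backend_name_to_enum("FLASH_ATTN") is not None   # valid in-tree backend
assert backend_name_to_enum("NOT_A_BACKEND") is None    # previously raised ValueError
```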
From 97eb97b5a4fd64c3cbc97bb9d71a9bfd98348799 Mon Sep 17 00:00:00 2001
From: RunningLeon
Date: Wed, 15 Jan 2025 19:35:17 +0800
Subject: [PATCH 25/67] [Model]: Support internlm3 (#12037)
---
docs/source/models/supported_models.md | 5 ++++
tests/models/registry.py | 2 ++
vllm/model_executor/models/llama.py | 35 +++++++++++++++-----------
vllm/model_executor/models/registry.py | 1 +
4 files changed, 28 insertions(+), 15 deletions(-)
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 642ef3c9655b8..85d844f3d3f55 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -216,6 +216,11 @@ See [this page](#generative-models) for more information on how to use generativ
- `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.
- ✅︎
- ✅︎
+* - `InternLM3ForCausalLM`
+ - InternLM3
+ - `internlm/internlm3-8b-instruct`, etc.
+ - ✅︎
+ - ✅︎
* - `JAISLMHeadModel`
- Jais
- `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc.
diff --git a/tests/models/registry.py b/tests/models/registry.py
index d079725b2f78d..b0f0f9767a90f 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -85,6 +85,8 @@ class _HfExamplesInfo:
trust_remote_code=True),
"InternLM2VEForCausalLM": _HfExamplesInfo("OpenGVLab/Mono-InternVL-2B",
trust_remote_code=True),
+ "InternLM3ForCausalLM": _HfExamplesInfo("internlm/internlm3-8b-instruct",
+ trust_remote_code=True),
"JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
"JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini"),
"LlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B"),
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 16fa7acf54fdc..e8732c57fad49 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -97,20 +97,19 @@ def forward(self, x):
class LlamaAttention(nn.Module):
- def __init__(
- self,
- config: LlamaConfig,
- hidden_size: int,
- num_heads: int,
- num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: Optional[Dict[str, Any]] = None,
- max_position_embeddings: int = 8192,
- quant_config: Optional[QuantizationConfig] = None,
- bias: bool = False,
- cache_config: Optional[CacheConfig] = None,
- prefix: str = "",
- ) -> None:
+ def __init__(self,
+ config: LlamaConfig,
+ hidden_size: int,
+ num_heads: int,
+ num_kv_heads: int,
+ rope_theta: float = 10000,
+ rope_scaling: Optional[Dict[str, Any]] = None,
+ max_position_embeddings: int = 8192,
+ quant_config: Optional[QuantizationConfig] = None,
+ bias: bool = False,
+ cache_config: Optional[CacheConfig] = None,
+ prefix: str = "",
+ bias_o_proj: bool = False) -> None:
super().__init__()
layer_idx = extract_layer_index(prefix)
self.hidden_size = hidden_size
@@ -150,7 +149,7 @@ def __init__(
self.o_proj = RowParallelLinear(
input_size=self.total_num_heads * self.head_dim,
output_size=hidden_size,
- bias=bias,
+ bias=bias_o_proj,
quant_config=quant_config,
prefix=f"{prefix}.o_proj",
)
@@ -231,6 +230,11 @@ def __init__(
# Support internlm/internlm-7b with bias
attention_bias = getattr(config, "attention_bias", False) or getattr(
config, "bias", False)
+ bias_o_proj = attention_bias
+ # support internlm/internlm3-8b with qkv_bias
+ if hasattr(config, 'qkv_bias'):
+ attention_bias = config.qkv_bias
+
self.self_attn = LlamaAttention(
config=config,
hidden_size=self.hidden_size,
@@ -242,6 +246,7 @@ def __init__(
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
+ bias_o_proj=bias_o_proj,
cache_config=cache_config,
prefix=f"{prefix}.self_attn",
)
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index a7286a9203f67..a71f7f7029c7d 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -60,6 +60,7 @@
"InternLMForCausalLM": ("llama", "LlamaForCausalLM"),
"InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
"InternLM2VEForCausalLM": ("internlm2_ve", "InternLM2VEForCausalLM"),
+ "InternLM3ForCausalLM": ("llama", "LlamaForCausalLM"),
"JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
"JambaForCausalLM": ("jamba", "JambaForCausalLM"),
"LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
From 5ecf3e0aafc3ae0e2923e0635adc6b26788429a3 Mon Sep 17 00:00:00 2001
From: Yuan
Date: Wed, 15 Jan 2025 21:16:40 +0800
Subject: [PATCH 26/67] Misc: allow to use proxy in `HTTPConnection` (#12042)
Signed-off-by: Yuan Zhou
---
vllm/connections.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/vllm/connections.py b/vllm/connections.py
index e785a0b3ebd74..4c9f4f40cf640 100644
--- a/vllm/connections.py
+++ b/vllm/connections.py
@@ -29,7 +29,7 @@ def get_sync_client(self) -> requests.Session:
# required, so that the client is only accessible inside async event loop
async def get_async_client(self) -> aiohttp.ClientSession:
if self._async_client is None or not self.reuse_client:
- self._async_client = aiohttp.ClientSession()
+ self._async_client = aiohttp.ClientSession(trust_env=True)
return self._async_client
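With `trust_env=True`, aiohttp reads proxy settings from the standard environment variables, so a deployment behind a proxy only needs something like the following before starting vLLM (the proxy URL below is an example, not a vLLM default):

```python
import os

os.environ["HTTPS_PROXY"] = "http://proxy.example.com:8080"  # example proxy endpoint
os.environ["NO_PROXY"] = "localhost,127.0.0.1"               # bypass the proxy for local traffic
```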
From de0526f668d6918c1884fd3b201308e9049e6be9 Mon Sep 17 00:00:00 2001
From: kewang-xlnx <73578509+kewang-xlnx@users.noreply.github.com>
Date: Thu, 16 Jan 2025 00:05:15 +0800
Subject: [PATCH 27/67] [Misc][Quark] Upstream Quark format to VLLM (#10765)
Signed-off-by: kewang-xlnx
Signed-off-by: kewang2
Co-authored-by: kewang2
Co-authored-by: Michael Goin
---
tests/quantization/test_quark.py | 30 ++
vllm/config.py | 2 +-
vllm/model_executor/layers/linear.py | 2 +-
.../layers/quantization/__init__.py | 4 +
.../layers/quantization/base_config.py | 3 +
.../compressed_tensors/compressed_tensors.py | 16 +
.../compressed_tensors/triton_scaled_mm.py | 4 +
.../quantization/compressed_tensors/utils.py | 17 -
.../layers/quantization/quark/__init__.py | 0
.../layers/quantization/quark/quark.py | 387 ++++++++++++++++++
.../layers/quantization/quark/quark_moe.py | 225 ++++++++++
.../quantization/quark/schemes/__init__.py | 5 +
.../quark/schemes/quark_scheme.py | 52 +++
.../quark/schemes/quark_w8a8_fp8.py | 140 +++++++
.../quark/schemes/quark_w8a8_int8.py | 105 +++++
.../layers/quantization/quark/utils.py | 99 +++++
vllm/model_executor/models/aria.py | 11 +-
vllm/model_executor/models/commandr.py | 14 +
vllm/model_executor/models/dbrx.py | 73 +++-
vllm/model_executor/models/exaone.py | 12 +-
vllm/model_executor/models/gemma2.py | 6 +-
vllm/model_executor/models/gpt_j.py | 14 +
vllm/model_executor/models/granite.py | 12 +-
vllm/model_executor/models/llama.py | 12 +-
vllm/model_executor/models/mixtral.py | 14 +
vllm/model_executor/models/mllama.py | 13 +
vllm/model_executor/models/nemotron.py | 13 +
vllm/model_executor/models/phimoe.py | 14 +
vllm/model_executor/models/qwen2.py | 13 +
vllm/model_executor/models/solar.py | 12 +-
vllm/model_executor/parameter.py | 8 +-
vllm/platforms/rocm.py | 2 +-
32 files changed, 1264 insertions(+), 70 deletions(-)
create mode 100644 tests/quantization/test_quark.py
create mode 100644 vllm/model_executor/layers/quantization/quark/__init__.py
create mode 100644 vllm/model_executor/layers/quantization/quark/quark.py
create mode 100644 vllm/model_executor/layers/quantization/quark/quark_moe.py
create mode 100644 vllm/model_executor/layers/quantization/quark/schemes/__init__.py
create mode 100644 vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
create mode 100644 vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
create mode 100644 vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
create mode 100644 vllm/model_executor/layers/quantization/quark/utils.py
diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py
new file mode 100644
index 0000000000000..27493a682b746
--- /dev/null
+++ b/tests/quantization/test_quark.py
@@ -0,0 +1,30 @@
+"""Test model set-up and weight loading for quark-quantized models.
+
+Run `pytest tests/quantization/test_quark.py`.
+"""
+
+import torch
+
+from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501
+ QuarkLinearMethod, QuarkW8A8Fp8)
+
+
+def test_quark_fp8(vllm_runner):
+ model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
+ with vllm_runner(model_path) as llm:
+ model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
+ layer = model.model.layers[0]
+
+ qkv_proj = layer.self_attn.qkv_proj
+
+ assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
+ assert isinstance(qkv_proj.scheme, QuarkW8A8Fp8)
+
+ if isinstance(qkv_proj.scheme, QuarkW8A8Fp8):
+ assert len(qkv_proj.input_scale.shape) == 0
+ assert qkv_proj.weight.dtype is torch.float8_e4m3fn
+ #assert qkv_proj.weight.dtype is torch.float8_e4m3fnuz
+ assert len(qkv_proj.weight_scale.shape) == 0
+
+ output = llm.generate_greedy("Hello my name is", max_tokens=20)
+ assert output
diff --git a/vllm/config.py b/vllm/config.py
index 4a42aefb75026..65cb0d85f172a 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -553,7 +553,7 @@ def _verify_quantization(self) -> None:
optimized_quantization_methods = [
"fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
"awq_marlin", "fbgemm_fp8", "compressed_tensors",
- "compressed-tensors", "experts_int8"
+ "compressed-tensors", "experts_int8", "quark"
]
if self.quantization is not None:
self.quantization = self.quantization.lower()
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 8876ca72792cf..00ae64bbe6388 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -32,7 +32,7 @@
"MarlinLinearMethod", "QQQLinearMethod", "GPTQMarlin24LinearMethod",
"TPUInt8LinearMethod", "GPTQLinearMethod", "FBGEMMFp8LinearMethod",
"ModelOptFp8LinearMethod", "IPEXAWQLinearMethod", "IPEXGPTQLinearMethod",
- "HQQMarlinMethod"
+ "HQQMarlinMethod", "QuarkLinearMethod"
]
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index dd10c434f0752..caeb8b95e02f2 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -26,6 +26,7 @@
"experts_int8",
"neuron_quant",
"ipex",
+ "quark"
]
@@ -34,6 +35,8 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
raise ValueError(f"Invalid quantization method: {quantization}")
# lazy import to avoid triggering `torch.compile` too early
+ from vllm.model_executor.layers.quantization.quark.quark import QuarkConfig
+
from .aqlm import AQLMConfig
from .awq import AWQConfig
from .awq_marlin import AWQMarlinConfig
@@ -79,6 +82,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
"experts_int8": ExpertsInt8Config,
"neuron_quant": NeuronQuantConfig,
"ipex": IPEXConfig,
+ "quark": QuarkConfig
}
return method_to_config[quantization]
diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
index 6dfac8aad5358..2fb2642dd5156 100644
--- a/vllm/model_executor/layers/quantization/base_config.py
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -133,3 +133,6 @@ def get_quant_method(self, layer: torch.nn.Module,
method.
"""
raise NotImplementedError
+
+ def get_cache_scale(self, name: str) -> Optional[str]:
+ return None
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index f0e4eda76734b..b2fc2360f47f1 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -412,6 +412,22 @@ def get_scheme(
self._check_scheme_supported(scheme.get_min_capability())
return scheme
+ def get_cache_scale(self, name: str) -> Optional[str]:
+ """
+ Check whether the param name matches the format for k/v cache scales
+ in compressed-tensors. If this is the case, return its equivalent
+ param name expected by vLLM
+
+ :param name: param name
+ :return: matching param name for KV cache scale in vLLM
+ """
+ if name.endswith(".output_scale") and ".k_proj" in name:
+ return name.replace(".k_proj.output_scale", ".attn.k_scale")
+ if name.endswith(".output_scale") and ".v_proj" in name:
+ return name.replace(".v_proj.output_scale", ".attn.v_scale")
+ # If no matches, return None
+ return None
+
@staticmethod
def supports_cutlass_24(
weight_quant: Optional[QuantizationArgs],
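The base-class `get_cache_scale` hook added above lets weight loading ask any quantization config whether a checkpoint parameter name is really a KV-cache scale. A hypothetical sketch of how a loader might use it (the helper name and call pattern are assumptions, not code from this patch):

```python
def remap_kv_cache_scales_sketch(quant_config, state_dict_names):
    """Map checkpoint names like '...k_proj.output_scale' to '...attn.k_scale'."""
    remapped = {}
    for name in state_dict_names:
        target = quant_config.get_cache_scale(name)
        remapped[name] = target if target is not None else name
    return remapped
```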
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
index 2659afcdc74a9..f4c1dbc0361c6 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
@@ -136,6 +136,10 @@ def triton_scaled_mm(input: torch.Tensor,
assert N > 0 and K > 0 and M > 0
assert weight.shape[0] == K
assert input.dtype == weight.dtype
+
+ scale_a = scale_a.reshape(-1, 1) if scale_a.dim() <= 1 else scale_a
+ scale_b = scale_b.reshape(-1, 1) if scale_b.dim() <= 1 else scale_b
+
assert scale_a.dtype == scale_b.dtype and scale_a.is_floating_point()
assert scale_a.shape == torch.Size([1, 1]) or scale_a.shape == torch.Size(
[M, 1])
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index dfae4db71e546..8fcbda377428e 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -133,23 +133,6 @@ def _find_first_match(value: str,
return None
-def get_compressed_tensors_cache_scale(name: str) -> Optional[str]:
- """
- Check whether the param name matches the format for k/v cache scales
- in compressed-tensors. If this is the case, return its equivalent
- param name expected by vLLM
-
- :param name: param name
- :return: matching param name for KV cache scale in vLLM
- """
- if name.endswith(".output_scale") and ".k_proj" in name:
- return name.replace(".k_proj.output_scale", ".attn.k_scale")
- if name.endswith(".output_scale") and ".v_proj" in name:
- return name.replace(".v_proj.output_scale", ".attn.v_scale")
- # If no matches, return None
- return None
-
-
def _is_equal_or_regex_match(value: str,
target: str,
check_contains: bool = False) -> bool:
diff --git a/vllm/model_executor/layers/quantization/quark/__init__.py b/vllm/model_executor/layers/quantization/quark/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py
new file mode 100644
index 0000000000000..fc214255eca71
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/quark/quark.py
@@ -0,0 +1,387 @@
+import fnmatch
+import re
+from typing import Any, Dict, List, Optional, cast
+
+import torch
+
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
+ UnquantizedLinearMethod)
+from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501
+ QuantizationConfig, QuantizeMethodBase)
+from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
+from vllm.model_executor.layers.quantization.quark.quark_moe import ( # noqa: E501
+ QuarkMoEMethod)
+from vllm.model_executor.layers.quantization.quark.schemes import (
+ QuarkScheme, QuarkW8A8Fp8, QuarkW8A8Int8)
+from vllm.model_executor.layers.quantization.quark.utils import (
+ deep_compare, should_ignore_layer)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+ FUSED_LAYER_NAME_MAPPING)
+from vllm.platforms import current_platform
+
+__all__ = ["QuarkLinearMethod"]
+
+
+class QuarkConfig(QuantizationConfig):
+
+ def __init__(self,
+ quant_config: Dict[str, Any],
+ kv_cache_group: Optional[List[str]] = None,
+ kv_cache_config: Optional[Dict[str, Any]] = None,
+ pack_method: str = "reorder"):
+ if kv_cache_group is None:
+ kv_cache_group = []
+ self.quant_config = quant_config
+ self.kv_cache_group = kv_cache_group
+ self.kv_cache_config = kv_cache_config
+ self.pack_method = pack_method
+
+ def get_linear_method(self) -> "QuarkLinearMethod":
+ return QuarkLinearMethod(self)
+
+ def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+ return [torch.float16, torch.bfloat16]
+
+ @classmethod
+ def get_min_capability(cls) -> int:
+ return 70
+
+ def get_name(self) -> str:
+ return "quark"
+
+ def get_quant_method(self, layer: torch.nn.Module,
+ prefix: str) -> Optional["QuantizeMethodBase"]:
+ from vllm.attention.layer import Attention # Avoid circular import
+
+ # Check if the layer is skipped for quantization.
+ exclude_layers = cast(List[str], self.quant_config.get("exclude"))
+ if should_ignore_layer(prefix, ignore=exclude_layers):
+ return UnquantizedLinearMethod()
+ if isinstance(layer, LinearBase):
+ scheme = self.get_scheme(layer=layer, layer_name=prefix)
+ layer.scheme = scheme
+ return QuarkLinearMethod(self)
+ if isinstance(layer, Attention):
+ return QuarkKVCacheMethod(self)
+ if isinstance(layer, FusedMoE):
+ return QuarkMoEMethod.get_moe_method(self,
+ module=layer,
+ layer_name=prefix)
+ return None
+
+ @classmethod
+ def from_config(cls, config: Dict[str, Any]) -> "QuarkConfig":
+ export_config = config.get("export")
+ if export_config is None:
+ raise ValueError("The export key should be included in "
+ "the configurations of Quark quantized model")
+ kv_cache_group = cast(List[str], export_config.get("kv_cache_group"))
+ pack_method = cast(str, export_config.get("pack_method"))
+
+        # In Quark's exported model, the kv_cache quantization
+        # configuration is stored in layer_quant_config. We first check
+        # whether kv_cache_group exists, and then whether
+        # layer_quant_config contains a quantization configuration that
+        # matches the kv_cache entries.
+ if len(kv_cache_group) == 0:
+ kv_cache_config = None
+ else:
+ kv_cache_set = set(kv_cache_group)
+ layer_quant_config = cast(Dict[str, Any],
+ config.get("layer_quant_config"))
+ layer_quant_names = list(layer_quant_config.keys())
+ layer_quant_set = set(layer_quant_names)
+
+ if not kv_cache_set.issubset(layer_quant_set):
+ raise ValueError("The Quark quantized model has the "
+ "kv_cache_group parameter setting, "
+ "but no kv_cache quantization settings "
+ "were found in the quantization "
+ "configuration.")
+
+ q_configs = [
+ cast(Dict[str, Any], layer_quant_config.get(name))
+ for name in kv_cache_group
+ ]
+ if not all(
+ deep_compare(q_config, q_configs[0])
+ for q_config in q_configs):
+ raise ValueError(
+                    "All kv_cache layers in kv_cache_group must use the "
+                    "same quantization method, but the config specifies "
+                    "different methods for them.")
+ kv_cache_config = q_configs[0].get("output_tensors")
+ if kv_cache_config is None:
+ raise ValueError(
+ "The kv_cache quantization configuration is empty.")
+
+ # Since we have already set kv_cache quantization configurations,
+ # we will remove the quantization configuration for the
+ # output_tensors corresponding to the kv_cache layer.
+ for q_config in q_configs:
+ q_config["output_tensors"] = None
+
+ return cls(quant_config=config,
+ kv_cache_group=kv_cache_group,
+ kv_cache_config=kv_cache_config,
+ pack_method=pack_method)
+
+ @classmethod
+ def get_config_filenames(cls) -> List[str]:
+ return []
+
+ def _check_scheme_supported(self,
+ min_capability: int,
+ error: bool = True) -> bool:
+ capability_tuple = current_platform.get_device_capability()
+
+ if capability_tuple is not None:
+ capability = capability_tuple.to_int()
+ supported = capability >= min_capability
+ if error and not supported:
+ raise RuntimeError(
+                    "Quantization scheme is not supported for "
+                    f"the current GPU. Min capability: {min_capability}. "
+                    f"Current capability: {capability}.")
+ return supported
+ else:
+ return False
+
+ def _is_fp8_w8a8(self, weight_quant: Optional[Dict[str, Any]],
+ input_quant: Optional[Dict[str, Any]]) -> bool:
+ # Confirm weights and input quantized.
+ if weight_quant is None or input_quant is None:
+ return False
+
+ # Confirm weight scheme is supported
+ is_fp8_dtype = (weight_quant.get("dtype") == "fp8_e4m3"
+ and input_quant.get("dtype") == "fp8_e4m3")
+ is_static_weight = not weight_quant.get("is_dynamic")
+ is_per_tensor_or_channel_weight = (weight_quant.get("qscheme")
+ in ["per_tensor", "per_channel"])
+
+ if not (is_fp8_dtype and is_static_weight
+ and is_per_tensor_or_channel_weight):
+ return False
+
+ # Dynamic quantization is always supported if weights supported.
+ if input_quant.get("is_dynamic"):
+ return True
+
+ # Confirm activation scheme is supported.
+ is_per_tensor_activation = (input_quant.get("qscheme") == "per_tensor")
+ return is_per_tensor_activation
+
+ def _is_static_tensor_w8a8(self, weight_quant: Optional[Dict[str, Any]],
+ input_quant: Optional[Dict[str, Any]]) -> bool:
+ # Confirm weights and input quantized.
+ if weight_quant is None or input_quant is None:
+ return False
+
+ is_int8_dtype = (weight_quant.get("dtype") == "int8"
+ and input_quant.get("dtype") == "int8")
+
+ is_tensor = (weight_quant.get("qscheme")
+ in ["per_tensor", "per_channel"]
+ and input_quant.get("qscheme") == "per_tensor")
+
+ is_static = (not weight_quant.get("is_dynamic")
+ and not input_quant.get("is_dynamic"))
+
+ is_weight_symmetric = (weight_quant.get("symmetric") is True)
+
+ # Both symmetric and asymmetric input quantization supported.
+ # Only symmetric weight quantization supported.
+ return is_int8_dtype and is_tensor and is_weight_symmetric and is_static
+
+ def _find_matched_config(self, layer_name: str,
+ module: torch.nn.Module) -> Dict[str, Any]:
+
+ proj_name = layer_name.split(".")[-1]
+ if proj_name in FUSED_LAYER_NAME_MAPPING:
+ shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name]
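+            # e.g. "qkv_proj" -> ["q_proj", "k_proj", "v_proj"] and
+            # "gate_up_proj" -> ["gate_proj", "up_proj"]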
+
+ # Convert fused_name --> [shard_names]
+ shard_names = [
+ layer_name.replace(proj_name, shard_proj_name)
+ for shard_proj_name in shard_proj_names
+ ]
+ shard_configs = [
+ self._find_matched_config(shard_name, module)
+ for shard_name in shard_names
+ ]
+ if not all(
+ deep_compare(q_config, shard_configs[0])
+ for q_config in shard_configs):
+ raise ValueError(
+ f"Found a different quantization configuration for "
+ f"{shard_proj_names} in {layer_name}. vLLM "
+ "requires all to use the same scheme.")
+ return shard_configs[0]
+ else:
+ layer_quant_config = cast(
+ Dict[str, Any], self.quant_config.get("layer_quant_config"))
+ for name_pattern in layer_quant_config:
+ if fnmatch.fnmatch(layer_name, name_pattern):
+ return layer_quant_config[name_pattern]
+
+ layer_type = cast(str, type(module))
+ layer_type_quant_config = cast(
+ Dict[str, Any],
+ self.quant_config.get("layer_type_quant_config"))
+ if layer_type in layer_type_quant_config:
+ return layer_type_quant_config[layer_type]
+
+ global_quant_config = cast(
+ Dict[str, Any], self.quant_config.get("global_quant_config"))
+ return global_quant_config
+
+ def _get_scheme_from_config(self, config: Dict[str, Any]) -> "QuarkScheme":
+ if config.get("output_tensors") or config.get("bias"):
+ raise NotImplementedError(
+ "Currently, Quark models with output_tensors "
+ "and bias quantized are not supported")
+ weight_config = cast(Dict[str, Any], config.get("weight"))
+ input_config = cast(Dict[str, Any], config.get("input_tensors"))
+
+ if self._is_fp8_w8a8(weight_config, input_config):
+ is_fp8_w8a8_supported = self._check_scheme_supported(
+ QuarkW8A8Fp8.get_min_capability(), error=False)
+ if is_fp8_w8a8_supported:
+ weight_qscheme = cast(str, weight_config.get("qscheme"))
+ input_static = (input_config is not None and
+ not cast(bool, input_config.get("is_dynamic")))
+ return QuarkW8A8Fp8(qscheme=weight_qscheme,
+ is_static_input_scheme=input_static)
+ elif self._is_static_tensor_w8a8(weight_config, input_config):
+ weight_qscheme = cast(str, weight_config.get("qscheme"))
+ return QuarkW8A8Int8(qscheme=weight_qscheme,
+ is_static_input_scheme=True,
+ input_symmetric=input_config.get("symmetric"))
+
+ raise NotImplementedError("No quark compatible scheme was found. "
+ f"Weight config: {weight_config}, "
+ f"Input config: {input_config}")
+
+ def get_scheme(self, layer: torch.nn.Module,
+ layer_name: str) -> "QuarkScheme":
+
+ layer_quant_config = self._find_matched_config(layer_name, layer)
+
+ # Find the quant_scheme
+ scheme = self._get_scheme_from_config(layer_quant_config)
+ # Raise error if device does not support the scheme
+ # (e.g. fp8 needs ada lovelace)
+ self._check_scheme_supported(scheme.get_min_capability())
+
+ return scheme
+
+ def get_cache_scale(self, name: str) -> Optional[str]:
+ """
+ Check whether the param name matches the format for k/v cache scales
+        in Quark. If this is the case, return its equivalent param name
+        expected by vLLM.
+
+ :param name: param name
+ :return: matching param name for KV cache scale in vLLM
+ """
+ if self.kv_cache_group is None or len(self.kv_cache_group) == 0:
+ return None
+
+ kv_proj_names = [
+ re.split(r"[*.]", kv_cache)[-1] for kv_cache in self.kv_cache_group
+ ]
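+        # e.g. a kv_cache_group of ["*k_proj", "*v_proj"] yields
+        # kv_proj_names of ["k_proj", "v_proj"].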
+ if name.endswith(".output_scale"):
+ if len(kv_proj_names) == 1 and kv_proj_names[0] in name:
+ kv_output_scale_name = "." + kv_proj_names[0] + ".output_scale"
+ return name.replace(kv_output_scale_name, ".attn.k_scale")
+
+ elif len(kv_proj_names) == 2:
+ for kv_proj_name in kv_proj_names:
+ if kv_proj_name in name and kv_proj_name == "k_proj":
+ return name.replace(".k_proj.output_scale",
+ ".attn.k_scale")
+ elif kv_proj_name in name and kv_proj_name == "v_proj":
+ return name.replace(".v_proj.output_scale",
+ ".attn.v_scale")
+
+ # If no matches, return None
+ return None
+
+
+class QuarkLinearMethod(LinearMethodBase):
+
+ def __init__(self, quantization_config: QuarkConfig):
+ self.quantization_config = quantization_config
+
+ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+ layer.scheme.process_weights_after_loading(layer)
+
+ def create_weights(self, layer: torch.nn.Module,
+ input_size_per_partition: int,
+ output_partition_sizes: List[int], input_size: int,
+ output_size: int, params_dtype: torch.dtype,
+ **extra_weight_attrs):
+ """
+        Use the QuarkScheme associated with each layer to create
+        the necessary parameters for the layer. See LinearMethodBase for
+        param details.
+ """
+ weight_loader = extra_weight_attrs.get("weight_loader")
+ layer.scheme.create_weights(
+ layer=layer,
+ input_size=input_size,
+ input_size_per_partition=input_size_per_partition,
+ output_partition_sizes=output_partition_sizes,
+ output_size=output_size,
+ params_dtype=params_dtype,
+ weight_loader=weight_loader)
+
+ def apply(self,
+ layer: torch.nn.Module,
+ x: torch.Tensor,
+ bias: Optional[torch.Tensor] = None):
+ """
+        Use the output of create_weights and the QuarkScheme
+        associated with the layer to apply the forward pass with the
+        layer input. See LinearMethodBase for param details.
+ """
+ scheme = layer.scheme
+ if scheme is None:
+ raise ValueError("A scheme must be defined for each layer")
+ return scheme.apply_weights(layer, x, bias=bias)
+
+
+class QuarkKVCacheMethod(BaseKVCacheMethod):
+ """
+ Supports loading kv-cache scaling factors from quark checkpoints.
+ """
+
+ def __init__(self, quant_config: QuarkConfig):
+ self.validate_kv_cache_config(quant_config.kv_cache_config)
+ super().__init__(quant_config)
+
+ @staticmethod
+ def validate_kv_cache_config(kv_cache_config: Optional[Dict[str, Any]]):
+ """
+        Validator for the kv cache configuration. Useful for controlling
+        which kv cache quantization schemes are supported in vLLM.
+
+        :param kv_cache_config: the Quark kv cache scheme
+ """
+ if kv_cache_config is None:
+ return
+
+ dtype = kv_cache_config.get("dtype")
+ if dtype != "fp8_e4m3":
+ raise NotImplementedError(
+ "Currently supported kv cache quantization is "
+ f"dtype=fp8_e4m3, however received {dtype}")
+
+ qscheme = kv_cache_config.get("qscheme")
+ if qscheme != "per_tensor":
+ raise NotImplementedError(
+ "Only support per-tensor scaling factor "
+ "for quark KV cache. "
+ f"Expected qscheme: per_tensor, found qscheme: {qscheme}")
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
new file mode 100644
index 0000000000000..3e19247300808
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -0,0 +1,225 @@
+from typing import Any, Callable, Dict, Optional
+
+import torch
+
+import vllm.model_executor.layers.fused_moe # noqa
+from vllm import _custom_ops as ops
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
+ FusedMoeWeightScaleSupported)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+ all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
+from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
+
+logger = init_logger(__name__)
+
+__all__ = ["QuarkMoEMethod", "QuarkW8A8Fp8MoEMethod"]
+
+
+class QuarkMoEMethod(FusedMoEMethodBase):
+
+ @staticmethod
+ def get_moe_method(
+ quant_config: "QuarkConfig", # type: ignore # noqa E501 # noqa F821
+ module: torch.nn.Module,
+ layer_name: str) -> "QuarkMoEMethod":
+ layer_quant_config = quant_config._find_matched_config(
+ layer_name, module)
+
+ if (layer_quant_config.get("output_tensors")
+ or layer_quant_config.get("bias")):
+ raise NotImplementedError("Currently, Quark models with "
+ "output_tensors and bias "
+ "quantized are not supported")
+ weight_config = layer_quant_config.get("weight")
+ input_config = layer_quant_config.get("input_tensors")
+
+ if quant_config._is_fp8_w8a8(weight_config, input_config):
+ return QuarkW8A8Fp8MoEMethod(weight_config, input_config)
+ else:
+ raise RuntimeError("Unsupported FusedMoe scheme")
+
+
+class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
+
+ def __init__(self, weight_config: Dict[str, Any], input_config: Dict[str,
+ Any]):
+ self.weight_quant = weight_config
+ self.input_quant = input_config
+
+ weight_qscheme = self.weight_quant.get("qscheme")
+ input_qscheme = self.input_quant.get("qscheme")
+ if not (weight_qscheme == "per_tensor"
+ and input_qscheme == "per_tensor"):
+ raise ValueError(
+ "For FP8 Fused MoE layers, only per-tensor scales"
+ "for weights and activations are supported. Found "
+ f"{weight_qscheme}, {input_qscheme}") # noqa E501
+
+ self.static_input_scales = not self.input_quant.get("is_dynamic")
+
+ def create_weights(self, layer: torch.nn.Module, num_experts: int,
+ hidden_size: int, intermediate_size: int,
+ params_dtype: torch.dtype, **extra_weight_attrs):
+
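+        # Quark FP8 MoE weights are created directly in float8_e4m3fn,
+        # regardless of the activation dtype passed in as params_dtype.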
+ params_dtype = torch.float8_e4m3fn
+
+ # WEIGHTS
+ w13_weight = torch.nn.Parameter(torch.empty(num_experts,
+ 2 * intermediate_size,
+ hidden_size,
+ dtype=params_dtype),
+ requires_grad=False)
+ layer.register_parameter("w13_weight", w13_weight)
+ set_weight_attrs(w13_weight, extra_weight_attrs)
+
+ w2_weight = torch.nn.Parameter(torch.empty(num_experts,
+ hidden_size,
+ intermediate_size,
+ dtype=params_dtype),
+ requires_grad=False)
+ layer.register_parameter("w2_weight", w2_weight)
+ set_weight_attrs(w2_weight, extra_weight_attrs)
+
+ # WEIGHT_SCALES
+ # Allocate 2 scales for w1 and w3 respectively.
+ # They will be combined to a single scale after weight loading.
+ w13_weight_scale = torch.nn.Parameter(torch.ones(num_experts,
+ 2,
+ dtype=torch.float32),
+ requires_grad=False)
+ layer.register_parameter("w13_weight_scale", w13_weight_scale)
+
+ w2_weight_scale = torch.nn.Parameter(torch.ones(num_experts,
+ dtype=torch.float32),
+ requires_grad=False)
+ layer.register_parameter("w2_weight_scale", w2_weight_scale)
+ # Add the quantization method used (per tensor/grouped/channel)
+ # to ensure the weight scales are loaded in properly
+ extra_weight_attrs.update(
+ {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
+ set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+ set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+ # INPUT_SCALES
+ if self.static_input_scales:
+ w13_input_scale = torch.nn.Parameter(torch.ones(
+ num_experts, dtype=torch.float32),
+ requires_grad=False)
+ layer.register_parameter("w13_input_scale", w13_input_scale)
+ set_weight_attrs(w13_input_scale, extra_weight_attrs)
+
+ w2_input_scale = torch.nn.Parameter(torch.ones(
+ num_experts, dtype=torch.float32),
+ requires_grad=False)
+ layer.register_parameter("w2_input_scale", w2_input_scale)
+ set_weight_attrs(w2_input_scale, extra_weight_attrs)
+ else:
+ layer.w13_input_scale = None
+ layer.w2_input_scale = None
+
+ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+ # Fp8 moe kernels require a single activation scale.
+ # We take the max of all the scales in case they differ.
+ if self.static_input_scales:
+ if (layer.w13_input_scale is None or layer.w2_input_scale is None):
+ raise ValueError(
+ "QuantConfig has static quantization, but found "
+ "activation scales are None.")
+ if (not all_close_1d(layer.w13_input_scale)
+ or not all_close_1d(layer.w2_input_scale)):
+ logger.warning_once(
+ "Found input_scales that are not equal for "
+ "fp8 MoE layer. Using the maximum across experts "
+ "for each layer. ")
+ layer.w13_input_scale = torch.nn.Parameter(
+ layer.w13_input_scale.max(), requires_grad=False)
+ layer.w2_input_scale = torch.nn.Parameter(
+ layer.w2_input_scale.max(), requires_grad=False)
+
+ # If rocm, normalize the weights and scales to e4m3fnuz
+ if current_platform.is_rocm():
+ # Normalize the weights and scales
+ w13_weight, w13_weight_scale, w13_input_scale = \
+ normalize_e4m3fn_to_e4m3fnuz(
+ layer.w13_weight, layer.w13_weight_scale,
+ layer.w13_input_scale)
+ w2_weight, w2_weight_scale, w2_input_scale = \
+ normalize_e4m3fn_to_e4m3fnuz(
+ layer.w2_weight, layer.w2_weight_scale,
+ layer.w2_input_scale)
+ # Reset the parameter
+ layer.w13_weight = torch.nn.Parameter(w13_weight,
+ requires_grad=False)
+ layer.w13_weight_scale = torch.nn.Parameter(w13_weight_scale,
+ requires_grad=False)
+ if w13_input_scale is not None:
+ layer.w13_input_scale = torch.nn.Parameter(w13_input_scale,
+ requires_grad=False)
+ layer.w2_weight = torch.nn.Parameter(w2_weight,
+ requires_grad=False)
+ layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale,
+ requires_grad=False)
+ if w2_input_scale is not None:
+ layer.w2_input_scale = torch.nn.Parameter(w2_input_scale,
+ requires_grad=False)
+
+ # Fp8 moe kernel needs single weight scale for w13 per expert.
+ # We take the max then dequant and requant each expert.
+ assert layer.w13_weight_scale is not None
+ shard_size = layer.intermediate_size_per_partition
+ max_w13_scales = layer.w13_weight_scale.max(dim=1).values
+ for expert_id in range(layer.num_experts):
+ start = 0
+ for shard_id in range(2):
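+                # Dequantize each shard (w1, then w3) with its original
+                # scale and requantize it with the per-expert max scale.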
+ dq_weight = per_tensor_dequantize(
+ layer.w13_weight[expert_id][start:start + shard_size, :],
+ layer.w13_weight_scale[expert_id][shard_id])
+ layer.w13_weight[expert_id][
+ start:start + shard_size, :], _ = ops.scaled_fp8_quant(
+ dq_weight, max_w13_scales[expert_id])
+ start += shard_size
+
+ layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales,
+ requires_grad=False)
+
+ def apply(
+ self,
+ layer: torch.nn.Module,
+ x: torch.Tensor,
+ router_logits: torch.Tensor,
+ top_k: int,
+ renormalize: bool,
+ use_grouped_topk: bool = False,
+ topk_group: Optional[int] = None,
+ num_expert_group: Optional[int] = None,
+ custom_routing_function: Optional[Callable] = None,
+ scoring_func: str = "softmax",
+ e_score_correction_bias: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ from vllm.model_executor.layers.fused_moe import fused_experts
+
+ topk_weights, topk_ids = FusedMoE.select_experts(
+ hidden_states=x,
+ router_logits=router_logits,
+ use_grouped_topk=use_grouped_topk,
+ top_k=top_k,
+ renormalize=renormalize,
+ topk_group=topk_group,
+ num_expert_group=num_expert_group,
+ custom_routing_function=custom_routing_function,
+ scoring_func=scoring_func,
+ e_score_correction_bias=e_score_correction_bias)
+
+ return fused_experts(x,
+ layer.w13_weight,
+ layer.w2_weight,
+ topk_weights=topk_weights,
+ topk_ids=topk_ids,
+ inplace=True,
+ use_fp8_w8a8=True,
+ w1_scale=layer.w13_weight_scale,
+ w2_scale=layer.w2_weight_scale,
+ a1_scale=layer.w13_input_scale,
+ a2_scale=layer.w2_input_scale)
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/__init__.py b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py
new file mode 100644
index 0000000000000..fb0ba9bd5220c
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py
@@ -0,0 +1,5 @@
+from .quark_scheme import QuarkScheme
+from .quark_w8a8_fp8 import QuarkW8A8Fp8
+from .quark_w8a8_int8 import QuarkW8A8Int8
+
+__all__ = ["QuarkScheme", "QuarkW8A8Fp8", "QuarkW8A8Int8"]
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
new file mode 100644
index 0000000000000..239597fa4be0e
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
@@ -0,0 +1,52 @@
+from abc import ABC, abstractmethod
+from typing import Optional
+
+import torch
+
+__all__ = ["QuarkScheme"]
+
+
+class QuarkScheme(ABC):
+ """
+ Abstract class used to describe the weight creation and forward pass
+ of different quantization schemes supported by Quark.
+ """
+
+ @classmethod
+ @abstractmethod
+ def get_min_capability(cls) -> int:
+ """
+ Get minimum device capability.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def create_weights(self, *args, **kwargs):
+ """
+        Weight creation for the particular scheme.
+        """
+ raise NotImplementedError
+
+ @abstractmethod
+ def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+ bias: Optional[torch.Tensor]):
+ """
+ Run the forward pass for the particular scheme. This is where
+ scheme-specific dequant/quant steps/kernels should be applied.
+
+ :param layer: torch.nn.Module with the registered weights and
+ other parameters relevant to the particular scheme.
+ :param x: input to the layer
+ :param bias: bias parameter
+
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def process_weights_after_loading(self, layer: torch.nn.Module):
+ """
+ Called after weight loading is complete for any cleanup that
+ needs to occur.
+ """
+ raise NotImplementedError
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
new file mode 100644
index 0000000000000..206931ea2ffc0
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
@@ -0,0 +1,140 @@
+from typing import Callable, List, Optional
+
+import torch
+from torch.nn import Parameter
+
+from vllm.model_executor.layers.quantization.quark.schemes import QuarkScheme
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+ apply_fp8_linear, cutlass_fp8_supported, normalize_e4m3fn_to_e4m3fnuz,
+ requantize_with_max_scale)
+from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
+ ModelWeightParameter,
+ PerTensorScaleParameter)
+from vllm.platforms import current_platform
+
+__all__ = ["QuarkW8A8Fp8"]
+
+
+class QuarkW8A8Fp8(QuarkScheme):
+
+ def __init__(self, qscheme: str, is_static_input_scheme: Optional[bool]):
+ self.qscheme = qscheme
+ self.is_static_input_scheme = is_static_input_scheme
+ self.cutlass_fp8_supported = cutlass_fp8_supported()
+
+ @classmethod
+ def get_min_capability(cls) -> int:
+ # lovelace and up
+ return 89
+
+ def process_weights_after_loading(self, layer) -> None:
+        # For per-tensor quantization, a fused module (e.g. QKV) carries N
+        # per-tensor scales (one per shard). Requantize with the max scale
+        # so the kernel can always run in per-tensor mode.
+ if self.qscheme == "per_tensor":
+ max_w_scale, weight = requantize_with_max_scale(
+ weight=layer.weight,
+ weight_scale=layer.weight_scale,
+ logical_widths=layer.logical_widths,
+ )
+
+ if current_platform.is_rocm():
+ weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
+ weight=weight,
+ weight_scale=max_w_scale,
+ input_scale=layer.input_scale)
+ if input_scale is not None:
+ layer.input_scale = Parameter(input_scale,
+ requires_grad=False)
+
+ layer.weight = Parameter(weight.t(), requires_grad=False)
+ layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
+
+ # If channelwise, scales are already lined up, so just transpose.
+ elif self.qscheme == "per_channel":
+ weight = layer.weight
+
+ if current_platform.is_rocm():
+ weight, weight_scale, input_scale = \
+ normalize_e4m3fn_to_e4m3fnuz(
+ weight=weight,
+ weight_scale=layer.weight_scale,
+ input_scale=layer.input_scale)
+ if input_scale is not None:
+ layer.input_scale = Parameter(input_scale,
+ requires_grad=False)
+ else:
+ weight_scale = layer.weight_scale.data
+
+ layer.weight = Parameter(weight.t(), requires_grad=False)
+ # required by torch.compile to be torch.nn.Parameter
+ layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+
+ else:
+ raise ValueError(f"Unknown quantization scheme {self.qscheme}")
+
+ # INPUT SCALE
+ if self.is_static_input_scheme:
+ layer.input_scale = Parameter(layer.input_scale.max(),
+ requires_grad=False)
+ else:
+ layer.input_scale = None
+
+ def create_weights(self, layer: torch.nn.Module,
+ output_partition_sizes: List[int],
+ input_size_per_partition: int,
+ params_dtype: torch.dtype, weight_loader: Callable,
+ **kwargs):
+ output_size_per_partition = sum(output_partition_sizes)
+ layer.logical_widths = output_partition_sizes
+
+ # WEIGHT
+ weight = ModelWeightParameter(data=torch.empty(
+ output_size_per_partition,
+ input_size_per_partition,
+ dtype=torch.float8_e4m3fn),
+ input_dim=1,
+ output_dim=0,
+ weight_loader=weight_loader)
+ layer.register_parameter("weight", weight)
+
+ # WEIGHT SCALE
+ # TODO: update create_xxx_parameter functions to return
+ # the newly added parameters
+ if self.qscheme == "per_channel":
+ weight_scale = ChannelQuantScaleParameter(
+ data=torch.empty((sum(output_partition_sizes), 1),
+ dtype=torch.float32),
+ output_dim=0,
+ weight_loader=weight_loader)
+ else:
+ assert self.qscheme == "per_tensor"
+ weight_scale = PerTensorScaleParameter(data=torch.empty(
+ len(output_partition_sizes), dtype=torch.float32),
+ weight_loader=weight_loader)
+
+ # min requirement for fp8 kernels
+ weight_scale[:] = torch.finfo(torch.float32).min
+ layer.register_parameter("weight_scale", weight_scale)
+
+ # INPUT SCALE
+ if self.is_static_input_scheme:
+ input_scale = PerTensorScaleParameter(data=torch.empty(
+ len(output_partition_sizes), dtype=torch.float32),
+ weight_loader=weight_loader)
+ input_scale[:] = torch.finfo(torch.float32).min
+ layer.register_parameter("input_scale", input_scale)
+
+ def apply_weights(self,
+ layer: torch.nn.Module,
+ x: torch.Tensor,
+ bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+
+ return apply_fp8_linear(
+ input=x,
+ weight=layer.weight,
+ weight_scale=layer.weight_scale,
+ input_scale=layer.input_scale,
+ bias=bias,
+ cutlass_fp8_supported=self.cutlass_fp8_supported,
+ use_per_token_if_dynamic=True)
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
new file mode 100644
index 0000000000000..8cb47e9c37e56
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
@@ -0,0 +1,105 @@
+from typing import Callable, List, Optional, Set
+
+import torch
+
+from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
+ ScaledMMLinearLayerConfig, choose_scaled_mm_linear_kernel)
+from vllm.model_executor.layers.quantization.quark.schemes import QuarkScheme
+from vllm.model_executor.parameter import (BasevLLMParameter,
+ ChannelQuantScaleParameter,
+ ModelWeightParameter,
+ PerTensorScaleParameter)
+
+logger = init_logger(__name__)
+
+
+class QuarkW8A8Int8(QuarkScheme):
+ _kernel_backends_being_used: Set[str] = set()
+
+ def __init__(self, qscheme: str, is_static_input_scheme: Optional[bool],
+ input_symmetric: Optional[bool]):
+ self.qscheme = qscheme
+ self.is_static_input_scheme = is_static_input_scheme
+ self.input_symmetric = input_symmetric
+
+ @classmethod
+ def get_min_capability(cls) -> int:
+ # turing and up
+ return 75
+
+ def create_weights(self, layer: torch.nn.Module,
+ output_partition_sizes: List[int],
+ input_size_per_partition: int,
+ params_dtype: torch.dtype, weight_loader: Callable,
+ **kwargs):
+ self.logical_widths = output_partition_sizes
+
+ scaled_mm_linear_kernel_config = ScaledMMLinearLayerConfig(
+ is_channelwise=(self.qscheme == "per_channel"),
+ is_static_input_scheme=(self.is_static_input_scheme is True),
+ input_symmetric=(self.input_symmetric is True))
+
+ kernel_type = choose_scaled_mm_linear_kernel(
+ scaled_mm_linear_kernel_config)
+
+ if kernel_type.__name__ not in self._kernel_backends_being_used:
+ logger.info("Using %s for QuarkW8A8Int8", kernel_type.__name__)
+ self._kernel_backends_being_used.add(kernel_type.__name__)
+
+ # WEIGHT
+ weight = ModelWeightParameter(data=torch.empty(
+ sum(output_partition_sizes),
+ input_size_per_partition,
+ dtype=torch.int8),
+ input_dim=1,
+ output_dim=0,
+ weight_loader=weight_loader)
+
+ layer.register_parameter("weight", weight)
+
+ # WEIGHT SCALE
+ if self.qscheme == "per_channel":
+ weight_scale = ChannelQuantScaleParameter(
+ data=torch.empty((sum(output_partition_sizes), 1),
+ dtype=torch.float32),
+ output_dim=0,
+ weight_loader=weight_loader)
+ else:
+ assert self.qscheme == "per_tensor"
+ weight_scale = PerTensorScaleParameter(data=torch.empty(
+ len(output_partition_sizes), dtype=torch.float32),
+ weight_loader=weight_loader)
+ layer.register_parameter("weight_scale", weight_scale)
+
+ # INPUT SCALE
+ if self.is_static_input_scheme:
+ input_scale = BasevLLMParameter(data=torch.empty(
+ 1, dtype=torch.float32),
+ weight_loader=weight_loader)
+ layer.register_parameter("input_scale", input_scale)
+
+ if not self.input_symmetric:
+ # Note: quark stores the zp using the same dtype
+ # as the weights
+ # AZP loaded as int8 but used as int32
+ input_zero_point = BasevLLMParameter(
+ data=torch.empty(1, dtype=torch.int8),
+ weight_loader=weight_loader)
+ layer.register_parameter("input_zero_point", input_zero_point)
+
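+        # Map the kernel's logical tensor names to the parameter names
+        # registered on this layer.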
+ self.kernel = kernel_type(c=scaled_mm_linear_kernel_config,
+ w_q_param_name="weight",
+ w_s_param_name="weight_scale",
+ i_s_param_name="input_scale",
+ i_zp_param_name="input_zero_point",
+ azp_adj_param_name="azp_adj")
+
+ # Checkpoints are serialized in quark format, which is
+ # different from the format the kernel may want. Handle repacking here.
+ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+ self.kernel.process_weights_after_loading(layer)
+
+ def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+ bias: Optional[torch.Tensor]) -> torch.Tensor:
+ return self.kernel.apply_weights(layer, x, bias)
diff --git a/vllm/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py
new file mode 100644
index 0000000000000..742a629bdb1c5
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/quark/utils.py
@@ -0,0 +1,99 @@
+import re
+from typing import Any, Iterable, Optional
+
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+ FUSED_LAYER_NAME_MAPPING)
+
+
+def deep_compare(dict1: Any, dict2: Any) -> bool:
+ if type(dict1) is not type(dict2):
+ return False
+ if isinstance(dict1, dict):
+ if dict1.keys() != dict2.keys():
+ return False
+ return all(deep_compare(dict1[k], dict2[k]) for k in dict1)
+ elif isinstance(dict1, list):
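+        # Lists are compared as sets: element order and duplicates are
+        # ignored, and the elements must be hashable.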
+ return set(dict1) == set(dict2)
+ else:
+ return dict1 == dict2
+
+
+def should_ignore_layer(layer_name: Optional[str],
+ ignore: Iterable[str]) -> bool:
+ if layer_name is None:
+ return False
+
+ # layer_name = model.layers.0.self_attn.qkv_proj
+ # proj_name = qkv_proj
+ proj_name = layer_name.split(".")[-1]
+
+ # Fused layers like gate_up_proj or qkv_proj will not be fused
+ # in the safetensors checkpoint. So, we convert the name
+ # from the fused version to unfused + check to make sure that
+ # each shard of the fused layer has the same scheme.
+ if proj_name in FUSED_LAYER_NAME_MAPPING:
+ shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name]
+
+ # Convert fused_name --> [shard_names]
+ shard_names = [
+ layer_name.replace(proj_name, shard_proj_name)
+ for shard_proj_name in shard_proj_names
+ ]
+
+ # Layer should be ignored if shards are ignored.
+ should_ignore_layer = None
+ for shard_name in shard_names:
+ should_ignore_shard = check_equal_or_regex_match(
+ layer_name=shard_name, targets=ignore)
+
+ # If shard_idx=0, set layer ignore to match shard.
+ if should_ignore_layer is None:
+ should_ignore_layer = should_ignore_shard
+
+ # If shard_idx=1+ confirm scheme matches prior shards.
+ elif should_ignore_shard != should_ignore_layer:
+ raise ValueError(f"Found a different quantization schemes for "
+ f"{shard_proj_names} in {layer_name}. vLLM "
+ "requires all to use the same scheme.")
+
+ # Unfused layers like down_proj and o_proj will match
+ # the safetensors checkpoint already.
+ else:
+ should_ignore_layer = check_equal_or_regex_match(layer_name=layer_name,
+ targets=ignore)
+
+ assert should_ignore_layer is not None
+ return should_ignore_layer
+
+
+def check_equal_or_regex_match(layer_name: str,
+ targets: Iterable[str]) -> bool:
+ """
+    Checks whether layer_name exactly matches, or (when a target starts
+    with 're:') is a regex match for, any target in the list.
+ """
+ for target in targets:
+ if _is_equal_or_regex_match(layer_name, target):
+ return True
+ return False
+
+
+def _is_equal_or_regex_match(value: str,
+ target: str,
+ check_contains: bool = False) -> bool:
+ """
+ Checks whether a value is exactly equal or a regex match for target
+ if target starts with 're:'. If check_contains is set to True,
+ additionally checks if the target string is contained within the value.
+ """
+
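+    # e.g. a target of "re:.*k_proj" is treated as the regex ".*k_proj",
+    # while a plain target such as "lm_head" must match the value exactly.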
+ if target.startswith("re:"):
+ pattern = target[3:]
+ if re.match(pattern, value):
+ return True
+ elif check_contains:
+ if target.lower() in value.lower():
+ return True
+ elif target == value:
+ return True
+ return False
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 089062ab53fc3..91225c0ddc917 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -13,8 +13,6 @@
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
- get_compressed_tensors_cache_scale)
from vllm.model_executor.layers.sampler import (SamplerOutput,
SamplingMetadata, get_sampler)
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
@@ -390,12 +388,15 @@ def load_weights(self, weights: Iterable[Tuple[str,
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
continue
- if scale_name := get_compressed_tensors_cache_scale(name):
- # Loading kv cache scales for compressed-tensors quantization
+ if (self.quant_config is not None and
+ (scale_name := self.quant_config.get_cache_scale(name))):
+ # Loading kv cache scales for quark and
+ # compressed-tensors quantization
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
- loaded_weight = loaded_weight[0]
+ loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+ loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 8d61ece289412..6517422697c04 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -437,6 +437,20 @@ def load_weights(self, weights: Iterable[Tuple[str,
params_dict = dict(self.named_parameters())
loaded_params: Set[str] = set()
for name, loaded_weight in weights:
+
+ if (self.quant_config is not None and
+ (scale_name := self.quant_config.get_cache_scale(name))):
+ # Loading kv cache scales for quark and
+ # compressed-tensors quantization
+ param = params_dict[scale_name]
+ weight_loader = getattr(param, "weight_loader",
+ default_weight_loader)
+ loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+ loaded_weight[0])
+ weight_loader(param, loaded_weight)
+ loaded_params.add(scale_name)
+ continue
+
for param_name, shard_name, shard_id in stacked_params_mapping:
if shard_name not in name:
continue
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index 3932d8b52a9d1..ff1f1c2a939ff 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -83,7 +83,7 @@ def __init__(
# Define custom weight loader for dbrx model
def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
- weight_name: str):
+ weight_name: str, param_name: str):
tp_rank = get_tensor_model_parallel_rank()
param_data = param.data
shard_size = self.intermediate_size
@@ -91,25 +91,37 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
# DBRX uses GLU for each experts.
# GLU has 3 linear layers: w1, v1 and w2.
if weight_name.endswith("w1"):
- loaded_weight = torch.reshape(
- loaded_weight,
- [-1, self.intermediate_size * self.tp_size, self.d_model],
- )
- param_data[:, 0:shard_size, :] = loaded_weight[:, shard, :]
+ if param_name.endswith("weight"):
+ loaded_weight = torch.reshape(
+ loaded_weight,
+ [-1, self.intermediate_size * self.tp_size, self.d_model],
+ )
+ param_data[:, 0:shard_size, :] = loaded_weight[:, shard, :]
+ elif param_name.endswith("weight_scale"):
+ param_data[:, 0] = loaded_weight
+ else:
+ param_data = loaded_weight
if weight_name.endswith("v1"):
- loaded_weight = torch.reshape(
- loaded_weight,
- [-1, self.intermediate_size * self.tp_size, self.d_model],
- )
- param_data[:,
- shard_size:2 * shard_size, :] = loaded_weight[:,
- shard, :]
+ if param_name.endswith("weight"):
+ loaded_weight = torch.reshape(
+ loaded_weight,
+ [-1, self.intermediate_size * self.tp_size, self.d_model],
+ )
+ param_data[:, shard_size:2 *
+ shard_size, :] = loaded_weight[:, shard, :]
+ elif param_name.endswith("weight_scale"):
+ param_data[:, 1] = loaded_weight
+ else:
+ param_data[:] = loaded_weight
if weight_name.endswith("w2"):
- loaded_weight = torch.reshape(
- loaded_weight,
- [-1, self.intermediate_size * self.tp_size, self.d_model],
- ).transpose(1, 2)
- param_data[:] = loaded_weight[:, :, shard]
+ if param_name.endswith("weight"):
+ loaded_weight = torch.reshape(
+ loaded_weight,
+ [-1, self.intermediate_size * self.tp_size, self.d_model],
+ ).transpose(1, 2)
+ param_data[:] = loaded_weight[:, :, shard]
+ else:
+ param_data[:] = loaded_weight
class DbrxMoE(nn.Module):
@@ -430,14 +442,29 @@ def sample(
def load_weights(self, weights: Iterable[Tuple[str,
torch.Tensor]]) -> Set[str]:
-
expert_params_mapping = [(
- "w13_weight" if weight_name in ["w1", "v1"] else "w2_weight",
+ "w13" if weight_name in ["w1", "v1"] else "w2",
f"mlp.{weight_name}",
) for weight_name in ["w1", "v1", "w2"]]
params_dict = dict(self.named_parameters(remove_duplicate=False))
loaded_params: Set[str] = set()
+
for name, loaded_weight in weights:
+ if (self.quant_config is not None and
+ (scale_name := self.quant_config.get_cache_scale(name))):
+ # Loading kv cache scales for quark and
+ # compressed-tensors quantization
+ param = params_dict[scale_name]
+ weight_loader = getattr(param, "weight_loader",
+ default_weight_loader)
+ loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+ loaded_weight[0])
+ weight_loader(param, loaded_weight)
+ loaded_params.add(scale_name)
+ continue
+
+ if name.endswith(("w1", "w2", "v1")):
+ name = name + "_weight"
for param_name, weight_name in expert_params_mapping:
if weight_name not in name:
continue
@@ -446,8 +473,9 @@ def load_weights(self, weights: Iterable[Tuple[str,
continue
param = params_dict[name]
weight_loader = param.weight_loader
- weight_loader(param, loaded_weight, weight_name)
+ weight_loader(param, loaded_weight, weight_name, name)
break
+
else:
# Remapping the name of FP8 kv-scale.
name = maybe_remap_kv_scale_name(name, params_dict)
@@ -456,6 +484,9 @@ def load_weights(self, weights: Iterable[Tuple[str,
if is_pp_missing_parameter(name, self):
continue
+ name = maybe_remap_kv_scale_name(name, params_dict)
+ if name is None:
+ continue
param = params_dict[name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index ad15f835b1609..ac679d6ff43c7 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -39,8 +39,6 @@
RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
- get_compressed_tensors_cache_scale)
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -439,6 +437,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
self.config = config
self.lora_config = lora_config
+ self.quant_config = quant_config
self.transformer = ExaoneModel(
vllm_config=vllm_config,
@@ -532,12 +531,15 @@ def load_weights(self, weights: Iterable[Tuple[str,
# processed with quantization, LoRA, fine-tuning, etc.
if self.config.tie_word_embeddings and "lm_head.weight" in name:
continue
- if scale_name := get_compressed_tensors_cache_scale(name):
- # Loading kv cache scales for compressed-tensors quantization
+ if (self.quant_config is not None and
+ (scale_name := self.quant_config.get_cache_scale(name))):
+ # Loading kv cache scales for quark and
+ # compressed-tensors quantization
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
- loaded_weight = loaded_weight[0]
+ loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+ loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index 698b9a5b6b1d6..f0dc7693974be 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -31,8 +31,6 @@
RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
- get_compressed_tensors_cache_scale)
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -254,6 +252,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config
self.config = config
+ self.quant_config = quant_config
self.embed_tokens = VocabParallelEmbedding(
config.vocab_size,
@@ -329,7 +328,8 @@ def load_weights(self, weights: Iterable[Tuple[str,
params_dict = dict(self.named_parameters())
loaded_params: Set[str] = set()
for name, loaded_weight in weights:
- if scale_name := get_compressed_tensors_cache_scale(name):
+ if (self.quant_config is not None and
+ (scale_name := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for compressed-tensors quantization
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 4829578a56959..56343ca9a71ac 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -313,6 +313,20 @@ def load_weights(self, weights: Iterable[Tuple[str,
for name, loaded_weight in weights:
if "attn.bias" in name or "attn.masked_bias" in name:
continue
+
+ if (self.quant_config is not None and
+ (scale_name := self.quant_config.get_cache_scale(name))):
+ # Loading kv cache scales for quark and
+ # compressed-tensors quantization
+ param = params_dict[scale_name]
+ weight_loader = getattr(param, "weight_loader",
+ default_weight_loader)
+ loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+ loaded_weight[0])
+ weight_loader(param, loaded_weight)
+ loaded_params.add(scale_name)
+ continue
+
for (param_name, weight_name, shard_id) in stacked_params_mapping:
if weight_name not in name:
continue
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index 3e95926fd1e22..67e04b57658bc 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -39,8 +39,6 @@
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
- get_compressed_tensors_cache_scale)
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -371,6 +369,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
self.config = config
self.lora_config = lora_config
+ self.quant_config = quant_config
self.model = GraniteModel(vllm_config=vllm_config,
prefix=maybe_prefix(prefix, "model"))
@@ -474,12 +473,15 @@ def load_weights(self, weights: Iterable[Tuple[str,
# processed with quantization, LoRA, fine-tuning, etc.
if self.config.tie_word_embeddings and "lm_head.weight" in name:
continue
- if scale_name := get_compressed_tensors_cache_scale(name):
- # Loading kv cache scales for compressed-tensors quantization
+ if (self.quant_config is not None and
+ (scale_name := self.quant_config.get_cache_scale(name))):
+ # Loading kv cache scales for quark and
+ # compressed-tensors quantization
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
- loaded_weight = loaded_weight[0]
+ loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+ loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index e8732c57fad49..4667f275ecd33 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -38,8 +38,6 @@
RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
- get_compressed_tensors_cache_scale)
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -306,6 +304,7 @@ def __init__(self,
lora_config = vllm_config.lora_config
self.config = config
+ self.quant_config = quant_config
self.padding_idx = config.pad_token_id
lora_vocab = (lora_config.lora_extra_vocab_size *
(lora_config.max_loras or 1)) if lora_config else 0
@@ -396,12 +395,15 @@ def load_weights(self, weights: Iterable[Tuple[str,
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
continue
- if scale_name := get_compressed_tensors_cache_scale(name):
- # Loading kv cache scales for compressed-tensors quantization
+ if (self.quant_config is not None and
+ (scale_name := self.quant_config.get_cache_scale(name))):
+ # Loading kv cache scales for quark and
+ # compressed-tensors quantization
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
- loaded_weight = loaded_weight[0]
+ loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+ loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index a5b364fe5ec85..2c8895e842996 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -347,6 +347,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
lora_config = vllm_config.lora_config
self.config = config
self.lora_config = lora_config
+ self.quant_config = quant_config
self.model = MixtralModel(vllm_config=vllm_config,
prefix=maybe_prefix(prefix, "model"))
@@ -428,6 +429,19 @@ def load_weights(self, weights: Iterable[Tuple[str,
if "rotary_emb.inv_freq" in name:
continue
+ if (self.quant_config is not None and
+ (scale_name := self.quant_config.get_cache_scale(name))):
+ # Loading kv cache scales for quark and
+ # compressed-tensors quantization
+ param = params_dict[scale_name]
+ weight_loader = getattr(param, "weight_loader",
+ default_weight_loader)
+ loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+ loaded_weight[0])
+ weight_loader(param, loaded_weight)
+ loaded_params.add(scale_name)
+ continue
+
for (param_name, weight_name, shard_id) in stacked_params_mapping:
if weight_name not in name:
continue
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index b2368ffff5412..bd261f31499cb 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -1116,6 +1116,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config
+ self.quant_config = quant_config
self.vocab_size = config.text_config.vocab_size
self.hidden_size = config.text_config.hidden_size
self.max_num_tiles = config.vision_config.max_num_tiles
@@ -1429,6 +1430,18 @@ def load_weights(self, weights: Iterable[Tuple[str,
name = name.replace('patch_embedding.weight',
'patch_embedding._linear.weight')
loaded_weight = loaded_weight.view(loaded_weight.shape[0], -1)
+ if (self.quant_config is not None and
+ (scale_name := self.quant_config.get_cache_scale(name))):
+ # Loading kv cache scales for quark and
+ # compressed-tensors quantization
+ param = params_dict[scale_name]
+ weight_loader = getattr(param, "weight_loader",
+ default_weight_loader)
+ loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+ loaded_weight[0])
+ weight_loader(param, loaded_weight)
+ updated_params.add(scale_name)
+ continue
for (param_name, weight_name, shard_id) in stacked_params_mapping:
if weight_name not in name:
continue
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index 8cc62d5c803cc..e7875e6fb889f 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -405,6 +405,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
self.config = config
self.lora_config = lora_config
+ self.quant_config = quant_config
self.model = NemotronModel(vllm_config=vllm_config,
prefix=maybe_prefix(prefix, "model"))
@@ -489,6 +490,18 @@ def load_weights(self, weights: Iterable[Tuple[str,
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
continue
+ if (self.quant_config is not None and
+ (scale_name := self.quant_config.get_cache_scale(name))):
+ # Loading kv cache scales for quark and
+ # compressed-tensors quantization
+ param = params_dict[scale_name]
+ weight_loader = getattr(param, "weight_loader",
+ default_weight_loader)
+ loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+ loaded_weight[0])
+ weight_loader(param, loaded_weight)
+ loaded_params.add(scale_name)
+ continue
for (param_name, weight_name, shard_id) in stacked_params_mapping:
if weight_name not in name:
continue
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index 1febd62f2f705..dc76818e22cbb 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -546,6 +546,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
lora_config = vllm_config.lora_config
self.config = config
self.lora_config = lora_config
+ self.quant_config = vllm_config.quant_config
self.model = PhiMoEModel(vllm_config=vllm_config,
prefix=maybe_prefix(prefix, "model"))
@@ -623,6 +624,19 @@ def load_weights(self, weights: Iterable[Tuple[str,
if "rotary_emb.inv_freq" in name:
continue
+ if (self.quant_config is not None and
+ (scale_name := self.quant_config.get_cache_scale(name))):
+ # Loading kv cache scales for quark and
+ # compressed-tensors quantization
+ param = params_dict[scale_name]
+ weight_loader = getattr(param, "weight_loader",
+ default_weight_loader)
+ loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+ loaded_weight[0])
+ weight_loader(param, loaded_weight)
+ loaded_params.add(scale_name)
+ continue
+
for param_name, weight_name, shard_id in stacked_params_mapping:
if weight_name not in name:
continue
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 0a99c87470850..b9c259ad73c40 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -279,6 +279,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
))
self.config = config
+ self.quant_config = quant_config
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
@@ -364,6 +365,18 @@ def load_weights(self, weights: Iterable[Tuple[str,
for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
continue
+ if (self.quant_config is not None and
+ (scale_name := self.quant_config.get_cache_scale(name))):
+ # Loading kv cache scales for quark and
+ # compressed-tensors quantization
+ param = params_dict[scale_name]
+ weight_loader = getattr(param, "weight_loader",
+ default_weight_loader)
+ loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+ loaded_weight[0])
+ weight_loader(param, loaded_weight)
+ loaded_params.add(scale_name)
+ continue
for (param_name, weight_name, shard_id) in stacked_params_mapping:
if weight_name not in name:
continue
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index e83d316f74de2..b27d2b10850f5 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -39,8 +39,6 @@
RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
- get_compressed_tensors_cache_scale)
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -409,6 +407,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
lora_config = vllm_config.lora_config
self.config = config
self.lora_config = lora_config
+ self.quant_config = quant_config
self.model = SolarModel(
vllm_config=vllm_config,
@@ -491,12 +490,15 @@ def load_weights(self, weights: Iterable[Tuple[str,
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
continue
- if scale_name := get_compressed_tensors_cache_scale(name):
- # Loading kv cache scales for compressed-tensors quantization
+ if (self.quant_config is not None and
+ (scale_name := self.quant_config.get_cache_scale(name))):
+ # Loading kv cache scales for quark and
+ # compressed-tensors quantization
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
- loaded_weight = loaded_weight[0]
+ loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+ loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py
index fc5a3e7fba674..a9ce8af15d3bb 100644
--- a/vllm/model_executor/parameter.py
+++ b/vllm/model_executor/parameter.py
@@ -56,8 +56,14 @@ def __init__(self, data: torch.Tensor, weight_loader: Callable):
def weight_loader(self):
return self._weight_loader
+ def _is_1d_and_scalar(self, loaded_weight: torch.Tensor):
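+        # True when the parameter is a one-element 1-D tensor and the
+        # checkpoint provides a 0-dim scalar; copying the scalar into the
+        # parameter is then still safe despite the shape mismatch.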
+ cond1 = self.data.ndim == 1 and self.data.numel() == 1
+ cond2 = loaded_weight.ndim == 0 and loaded_weight.numel() == 1
+ return (cond1 and cond2)
+
def _assert_and_load(self, loaded_weight: torch.Tensor):
- assert self.data.shape == loaded_weight.shape
+ assert (self.data.shape == loaded_weight.shape
+ or self._is_1d_and_scalar(loaded_weight))
self.data.copy_(loaded_weight)
def load_column_parallel_weight(self, loaded_weight: torch.Tensor):
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 67a9e816cb658..5ef56406e1935 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -70,7 +70,7 @@ class RocmPlatform(Platform):
supported_quantization: list[str] = [
"awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
- "fbgemm_fp8", "gguf"
+ "fbgemm_fp8", "gguf", "quark"
]
@classmethod
From 57e729e87478d734e8d0075e35aeb4c9bd440e77 Mon Sep 17 00:00:00 2001
From: maang-h <55082429+maang-h@users.noreply.github.com>
Date: Thu, 16 Jan 2025 00:07:45 +0800
Subject: [PATCH 28/67] [Doc]: Update `OpenAI-Compatible Server` documents
(#12082)
---
vllm/engine/arg_utils.py | 16 ++++----
vllm/entrypoints/openai/cli_args.py | 60 ++++++++++++++---------------
2 files changed, 38 insertions(+), 38 deletions(-)
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index c31b206d6f60e..03a8959a7d9ff 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -238,7 +238,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
choices=get_args(TaskOption),
help='The task to use the model for. Each vLLM instance only '
'supports one task, even if the same model can be used for '
- 'multiple tasks. When the model only supports one task, "auto" '
+ 'multiple tasks. When the model only supports one task, ``"auto"`` '
'can be used to select it; otherwise, you must specify explicitly '
'which task to use.')
parser.add_argument(
@@ -250,7 +250,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
parser.add_argument(
'--skip-tokenizer-init',
action='store_true',
- help='Skip initialization of tokenizer and detokenizer')
+ help='Skip initialization of tokenizer and detokenizer.')
parser.add_argument(
'--revision',
type=nullable_str,
@@ -401,7 +401,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
parser.add_argument(
'--worker-use-ray',
action='store_true',
- help='Deprecated, use --distributed-executor-backend=ray.')
+ help='Deprecated, use ``--distributed-executor-backend=ray``.')
parser.add_argument('--pipeline-parallel-size',
'-pp',
type=int,
@@ -430,7 +430,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
choices=[8, 16, 32, 64, 128],
help='Token block size for contiguous chunks of '
'tokens. This is ignored on neuron devices and '
- 'set to max-model-len. On CUDA devices, '
+ 'set to ``--max-model-len``. On CUDA devices, '
'only block sizes up to 32 are supported. '
'On HPU devices, block size defaults to 128.')
@@ -439,12 +439,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
action=argparse.BooleanOptionalAction,
default=EngineArgs.enable_prefix_caching,
help="Enables automatic prefix caching. "
- "Use --no-enable-prefix-caching to disable explicitly.",
+ "Use ``--no-enable-prefix-caching`` to disable explicitly.",
)
parser.add_argument('--disable-sliding-window',
action='store_true',
help='Disables sliding window, '
- 'capping to sliding window size')
+ 'capping to sliding window size.')
parser.add_argument('--use-v2-block-manager',
action='store_true',
default=True,
@@ -861,7 +861,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"of the provided names. The model name in the model "
"field of a response will be the first name in this "
"list. If not specified, the model name will be the "
- "same as the `--model` argument. Noted that this name(s) "
+ "same as the ``--model`` argument. Noted that this name(s) "
"will also be used in `model_name` tag content of "
"prometheus metrics, if multiple names provided, metrics "
"tag will take the first one.")
@@ -881,7 +881,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
default=None,
help="Valid choices are " +
",".join(ALLOWED_DETAILED_TRACE_MODULES) +
- ". It makes sense to set this only if --otlp-traces-endpoint is"
+ ". It makes sense to set this only if ``--otlp-traces-endpoint`` is"
" set. If set, it will collect detailed traces for the specified "
"modules. This involves use of possibly costly and or blocking "
"operations and hence might have a performance impact.")
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 22206ef8dbfe6..35445449463e9 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -79,29 +79,29 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
parser.add_argument("--host",
type=nullable_str,
default=None,
- help="host name")
- parser.add_argument("--port", type=int, default=8000, help="port number")
+ help="Host name.")
+ parser.add_argument("--port", type=int, default=8000, help="Port number.")
parser.add_argument(
"--uvicorn-log-level",
type=str,
default="info",
choices=['debug', 'info', 'warning', 'error', 'critical', 'trace'],
- help="log level for uvicorn")
+ help="Log level for uvicorn.")
parser.add_argument("--allow-credentials",
action="store_true",
- help="allow credentials")
+ help="Allow credentials.")
parser.add_argument("--allowed-origins",
type=json.loads,
default=["*"],
- help="allowed origins")
+ help="Allowed origins.")
parser.add_argument("--allowed-methods",
type=json.loads,
default=["*"],
- help="allowed methods")
+ help="Allowed methods.")
parser.add_argument("--allowed-headers",
type=json.loads,
default=["*"],
- help="allowed headers")
+ help="Allowed headers.")
parser.add_argument("--api-key",
type=nullable_str,
default=None,
@@ -115,10 +115,10 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
action=LoRAParserAction,
help="LoRA module configurations in either 'name=path' format"
"or JSON format. "
- "Example (old format): 'name=path' "
+ "Example (old format): ``'name=path'`` "
"Example (new format): "
- "'{\"name\": \"name\", \"local_path\": \"path\", "
- "\"base_model_name\": \"id\"}'")
+ "``{\"name\": \"name\", \"local_path\": \"path\", "
+ "\"base_model_name\": \"id\"}``")
parser.add_argument(
"--prompt-adapters",
type=nullable_str,
@@ -132,7 +132,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
default=None,
help="The file path to the chat template, "
"or the template in single-line form "
- "for the specified model")
+ "for the specified model.")
parser.add_argument(
'--chat-template-content-format',
type=str,
@@ -141,38 +141,39 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
help='The format to render message content within a chat template.'
'\n\n'
'* "string" will render the content as a string. '
- 'Example: "Hello World"\n'
+ 'Example: ``"Hello World"``\n'
'* "openai" will render the content as a list of dictionaries, '
'similar to OpenAI schema. '
- 'Example: [{"type": "text", "text": "Hello world!"}]')
+ 'Example: ``[{"type": "text", "text": "Hello world!"}]``')
parser.add_argument("--response-role",
type=nullable_str,
default="assistant",
help="The role name to return if "
- "`request.add_generation_prompt=true`.")
+ "``request.add_generation_prompt=true``.")
parser.add_argument("--ssl-keyfile",
type=nullable_str,
default=None,
- help="The file path to the SSL key file")
+ help="The file path to the SSL key file.")
parser.add_argument("--ssl-certfile",
type=nullable_str,
default=None,
- help="The file path to the SSL cert file")
+ help="The file path to the SSL cert file.")
parser.add_argument("--ssl-ca-certs",
type=nullable_str,
default=None,
- help="The CA certificates file")
+ help="The CA certificates file.")
parser.add_argument(
"--ssl-cert-reqs",
type=int,
default=int(ssl.CERT_NONE),
- help="Whether client certificate is required (see stdlib ssl module's)"
+ help="Whether client certificate is required (see stdlib ssl module's)."
)
parser.add_argument(
"--root-path",
type=nullable_str,
default=None,
- help="FastAPI root_path when app is behind a path based routing proxy")
+ help="FastAPI root_path when app is behind a path based routing proxy."
+ )
parser.add_argument(
"--middleware",
type=nullable_str,
@@ -182,15 +183,15 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"We accept multiple --middleware arguments. "
"The value should be an import path. "
"If a function is provided, vLLM will add it to the server "
- "using @app.middleware('http'). "
+ "using ``@app.middleware('http')``. "
"If a class is provided, vLLM will add it to the server "
- "using app.add_middleware(). ")
+ "using ``app.add_middleware()``. ")
parser.add_argument(
"--return-tokens-as-token-ids",
action="store_true",
- help="When --max-logprobs is specified, represents single tokens as "
- "strings of the form 'token_id:{token_id}' so that tokens that "
- "are not JSON-encodable can be identified.")
+ help="When ``--max-logprobs`` is specified, represents single tokens "
+ " as strings of the form 'token_id:{token_id}' so that tokens "
+ "that are not JSON-encodable can be identified.")
parser.add_argument(
"--disable-frontend-multiprocessing",
action="store_true",
@@ -205,9 +206,8 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"--enable-auto-tool-choice",
action="store_true",
default=False,
- help=
- "Enable auto tool choice for supported models. Use --tool-call-parser"
- " to specify which parser to use")
+ help="Enable auto tool choice for supported models. Use "
+ "``--tool-call-parser`` to specify which parser to use.")
valid_tool_parsers = ToolParserManager.tool_parsers.keys()
parser.add_argument(
@@ -219,7 +219,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
help=
"Select the tool call parser depending on the model that you're using."
" This is used to parse the model-generated tool call into OpenAI API "
- "format. Required for --enable-auto-tool-choice.")
+ "format. Required for ``--enable-auto-tool-choice``.")
parser.add_argument(
"--tool-parser-plugin",
@@ -228,7 +228,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
help=
"Special the tool parser plugin write to parse the model-generated tool"
" into OpenAI API format, the name register in this plugin can be used "
- "in --tool-call-parser.")
+ "in ``--tool-call-parser``.")
parser = AsyncEngineArgs.add_cli_args(parser)
@@ -243,7 +243,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"--disable-fastapi-docs",
action='store_true',
default=False,
- help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint"
+ help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint."
)
parser.add_argument(
"--enable-prompt-tokens-details",
From edce722eaa5e9f0b97bea611531e3341ec2e2e71 Mon Sep 17 00:00:00 2001
From: Joe Runde
Date: Wed, 15 Jan 2025 09:31:01 -0700
Subject: [PATCH 29/67] [Bugfix] use right truncation for non-generative tasks
(#12050)
Signed-off-by: Joe Runde
---
tests/entrypoints/llm/test_encode.py | 7 +++++++
vllm/config.py | 4 ++++
vllm/transformers_utils/tokenizer_group/__init__.py | 3 ++-
3 files changed, 13 insertions(+), 1 deletion(-)
diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py
index 41163809237e9..3906ad766e0b6 100644
--- a/tests/entrypoints/llm/test_encode.py
+++ b/tests/entrypoints/llm/test_encode.py
@@ -105,3 +105,10 @@ def test_multiple_pooling_params(llm: LLM):
# pooling_params is None, default params should be applied
outputs = llm.encode(PROMPTS, pooling_params=None)
assert len(PROMPTS) == len(outputs)
+
+
+@pytest.mark.skip_global_cleanup
+def test_right_side_truncation(llm: LLM):
+ # Embedding models should truncate the end of the prompt
+ tokenizer = llm.get_tokenizer()
+ assert tokenizer.truncation_side == "right"
diff --git a/vllm/config.py b/vllm/config.py
index 65cb0d85f172a..4ffc13a05e026 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -357,6 +357,10 @@ def __init__(self,
supported_tasks, task = self._resolve_task(task, self.hf_config)
self.supported_tasks = supported_tasks
self.task: Final = task
+ if self.task in ("draft", "generate"):
+ self.truncation_side = "left"
+ else:
+ self.truncation_side = "right"
self.pooler_config = self._init_pooler_config(override_pooler_config)
self.logits_processor_pattern = logits_processor_pattern
diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py
index d400276796996..09569c564a58d 100644
--- a/vllm/transformers_utils/tokenizer_group/__init__.py
+++ b/vllm/transformers_utils/tokenizer_group/__init__.py
@@ -24,7 +24,8 @@ def init_tokenizer_from_configs(model_config: ModelConfig,
max_input_length=None,
tokenizer_mode=model_config.tokenizer_mode,
trust_remote_code=model_config.trust_remote_code,
- revision=model_config.tokenizer_revision)
+ revision=model_config.tokenizer_revision,
+ truncation_side=model_config.truncation_side)
return get_tokenizer_group(parallel_config.tokenizer_pool_config,
**init_kwargs)
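The rule this bugfix encodes is small enough to show directly: generative (and draft) tasks keep the end of the prompt and therefore truncate on the left, while embedding-style tasks truncate on the right. A minimal sketch with a HuggingFace tokenizer, assuming hub access; the model name is only an example:

```python
from transformers import AutoTokenizer


def truncation_side_for_task(task: str) -> str:
    # Generative (and draft) tasks keep the end of the prompt, so truncate on
    # the left; embedding/classification-style tasks truncate on the right.
    return "left" if task in ("draft", "generate") else "right"


tokenizer = AutoTokenizer.from_pretrained(
    "sentence-transformers/all-MiniLM-L6-v2",  # example embedding model
    truncation_side=truncation_side_for_task("embed"),
)
assert tokenizer.truncation_side == "right"
```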
From 70755e819e0ae5d963dab7d81321bdfaef6d955a Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Wed, 15 Jan 2025 11:29:00 -0800
Subject: [PATCH 30/67] [V1][Core] Autotune encoder cache budget (#11895)
Signed-off-by: Roger Wang
---
vllm/config.py | 15 ++++--
vllm/multimodal/registry.py | 29 ++++++++--
vllm/v1/core/encoder_cache_manager.py | 78 ++++++++++++++++++++++++++-
vllm/v1/core/scheduler.py | 26 ++++++---
vllm/v1/engine/core.py | 9 ++--
vllm/v1/worker/gpu_model_runner.py | 60 +++++++++++----------
6 files changed, 167 insertions(+), 50 deletions(-)
diff --git a/vllm/config.py b/vllm/config.py
index 4ffc13a05e026..e64883368a751 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1387,13 +1387,15 @@ class SchedulerConfig:
is_multimodal_model: bool = False
- # FIXME(woosuk & ywang96): Below are placeholder values. We need to
- # calculate the actual values from the configurations.
- # Multimodal encoder run compute budget, only used in V1
- max_num_encoder_input_tokens = 16384
+ # NOTE: The following multimodal encoder budget will be initialized to
+ # max_num_batched_tokens and overridden in case max multimodal embedding
+ # size is larger.
+ # TODO (ywang96): Make these configurable.
+ # Multimodal encoder compute budget, only used in V1
+ max_num_encoder_input_tokens: int = field(default=None) # type: ignore
# Multimodal encoder cache size, only used in V1
- encoder_cache_size = 16384
+ encoder_cache_size: int = field(default=None) # type: ignore
# Whether to perform preemption by swapping or
# recomputation. If not specified, we determine the mode as follows:
@@ -1467,6 +1469,9 @@ def __post_init__(self) -> None:
_MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
)
+ self.max_num_encoder_input_tokens = self.max_num_batched_tokens
+ self.encoder_cache_size = self.max_num_batched_tokens
+
if self.enable_chunked_prefill:
logger.info(
"Chunked prefill is enabled with max_num_batched_tokens=%d.",
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 2961f7c76ca12..aaf7ff34ca573 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -252,11 +252,8 @@ def get_max_tokens_per_item_by_modality(
model_config: "ModelConfig",
) -> Mapping[str, int]:
"""
- Get the maximum number of tokens per data item from each modality
- for profiling the memory usage of a model.
-
- Note:
- This is currently directly used only in V1.
+ Get the maximum number of tokens per data item from each modality based
+ on underlying model configuration.
"""
if self.has_processor(model_config):
tokenizer = cached_get_tokenizer(
@@ -272,6 +269,28 @@ def get_max_tokens_per_item_by_modality(
for key, plugin in self._plugins.items()
}
+ def get_max_tokens_per_item_by_nonzero_modality(
+ self,
+ model_config: "ModelConfig",
+ ) -> Mapping[str, int]:
+ """
+ Get the maximum number of tokens per data item from each modality based
+ on underlying model configuration, excluding modalities that user
+ explicitly disabled via `limit_mm_per_prompt`.
+
+ Note:
+ This is currently directly used only in V1 for profiling the memory
+ usage of a model.
+ """
+ limits_per_plugin = self._limits_by_model[model_config]
+
+ return {
+ key: max_tokens_per_mm_item
+ for key, max_tokens_per_mm_item in
+ self.get_max_tokens_per_item_by_modality(model_config).items()
+ if limits_per_plugin[key] > 0
+ }
+
def get_max_tokens_by_modality(
self,
model_config: "ModelConfig",
diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py
index 845bd5ea05e3c..0cd8c806a3e47 100644
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -1,7 +1,14 @@
-from typing import Dict, List, Set, Tuple
+from typing import TYPE_CHECKING, Dict, List, Set, Tuple
+from vllm.logger import init_logger
+from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.v1.request import Request
+if TYPE_CHECKING:
+ from vllm.config import ModelConfig, SchedulerConfig
+
+logger = init_logger(__name__)
+
class EncoderCacheManager:
@@ -46,3 +53,72 @@ def get_freed_ids(self) -> List[Tuple[str, int]]:
freed = self.freed
self.freed = []
return freed
+
+
+def compute_encoder_budget(
+ model_config: "ModelConfig",
+ scheduler_config: "SchedulerConfig",
+) -> Tuple[int, int]:
+ """Compute the encoder cache budget based on the model and scheduler
+ configurations.
+
+ Args:
+ model_config: Model configuration.
+ scheduler_config: Scheduler configuration.
+
+ Returns:
+ - Compute budget for encoder execution, in unit of number of tokens
+ in the input sequence.
+ - Space budget for encoder cache size, in unit of number of tokens
+ in the input sequence.
+ """
+
+ if not model_config.is_multimodal_model:
+ return 0, 0
+
+ # TODO: handle encoder-decoder models once we support them.
+ (
+ encoder_compute_budget,
+ encoder_cache_size,
+ ) = _compute_encoder_budget_multimodal(model_config, scheduler_config)
+
+ return encoder_compute_budget, encoder_cache_size
+
+
+def _compute_encoder_budget_multimodal(
+ model_config: "ModelConfig",
+ scheduler_config: "SchedulerConfig",
+) -> Tuple[int, int]:
+ """Compute the encoder cache budget based on the model and scheduler
+ configurations for a multimodal model.
+
+ Args:
+ model_config: Model configuration.
+ scheduler_config: Scheduler configuration.
+
+ Returns:
+ - Compute budget for encoder execution, in unit of number of tokens
+ in the input sequence.
+ - Space budget for encoder cache size, in unit of number of tokens
+ in the input sequence.
+ """
+
+ max_tokens_by_modality_dict = MULTIMODAL_REGISTRY.get_max_tokens_per_item_by_nonzero_modality( # noqa: E501
+ model_config)
+
+ if not max_tokens_by_modality_dict:
+ logger.warning(
+ "All non-text modalities supported by the model have been "
+ "explicitly disabled via limit_mm_per_prompt. Encoder cache will "
+ "not be initialized.")
+ return 0, 0
+
+ _, max_tokens_per_mm_item = max(max_tokens_by_modality_dict.items(),
+ key=lambda item: item[1])
+
+ encoder_compute_budget = max(scheduler_config.max_num_encoder_input_tokens,
+ max_tokens_per_mm_item)
+ encoder_cache_size = max(scheduler_config.encoder_cache_size,
+ max_tokens_per_mm_item)
+
+ return encoder_compute_budget, encoder_cache_size
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index 45e67c94f8f15..64df21d59fef4 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -3,10 +3,11 @@
from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set,
Tuple, Union)
-from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
+from vllm.config import CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams
-from vllm.v1.core.encoder_cache_manager import EncoderCacheManager
+from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
+ compute_encoder_budget)
from vllm.v1.core.kv_cache_manager import KVCacheManager
from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs
from vllm.v1.metrics.stats import SchedulerStats
@@ -25,6 +26,7 @@ class Scheduler:
def __init__(
self,
scheduler_config: SchedulerConfig,
+ model_config: ModelConfig,
cache_config: CacheConfig,
lora_config: Optional[LoRAConfig],
) -> None:
@@ -69,16 +71,24 @@ def __init__(
self.running_reqs_data: Dict[str, RunningRequestData] = {}
# Encoder-related.
+ # Calculate encoder cache size if applicable
+ # NOTE: For now we use the same budget for both compute and space.
+ # This can be changed when we make encoder cache for embedding caching
+ # across requests.
+ encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
+ model_config=model_config,
+ scheduler_config=scheduler_config,
+ )
+
# NOTE(woosuk): Here, "encoder" includes the vision encoder (and
# projector if needed). Currently, we assume that the encoder also
# has the Transformer architecture (e.g., ViT).
- self.max_num_encoder_input_tokens = self.scheduler_config.max_num_encoder_input_tokens #noqa: E501
- # NOTE(woosuk): For the models without encoder (e.g., text-only models),
- # the encoder cache will not be initialized and used, regardless of
- # the cache size. This is because the memory space for the encoder cache
- # is preallocated in the profiling run.
+ self.max_num_encoder_input_tokens = encoder_compute_budget
+ # NOTE: For the models without encoder (e.g., text-only models),
+ # the encoder cache will not be initialized because cache size is 0
+ # for these models.
self.encoder_cache_manager = EncoderCacheManager(
- cache_size=self.scheduler_config.encoder_cache_size)
+ cache_size=encoder_cache_size)
def schedule(self) -> "SchedulerOutput":
# NOTE(woosuk) on the scheduling algorithm:
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index e7f90d3c62142..ef616229aa57b 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -54,9 +54,12 @@ def __init__(
vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks
# Setup scheduler.
- self.scheduler = Scheduler(vllm_config.scheduler_config,
- vllm_config.cache_config,
- vllm_config.lora_config)
+ self.scheduler = Scheduler(
+ scheduler_config=vllm_config.scheduler_config,
+ model_config=vllm_config.model_config,
+ cache_config=vllm_config.cache_config,
+ lora_config=vllm_config.lora_config,
+ )
self.mm_input_mapper_server = MMInputMapperServer(
vllm_config.model_config)
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index fb87dc5a8222a..de83640b27cd6 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -20,6 +20,7 @@
is_pin_memory_available)
from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend,
FlashAttentionMetadata)
+from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
from vllm.v1.engine.mm_input_mapper import MMInputMapperClient
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.sample.metadata import SamplingMetadata
@@ -88,8 +89,12 @@ def __init__(
self.mm_input_mapper_profiling = MMInputMapperClient(self.model_config)
self.mm_input_mapper_profiling.use_cache = False
- self.max_num_encoder_input_tokens = self.scheduler_config.max_num_encoder_input_tokens # noqa: E501
- self.encoder_cache_size = self.scheduler_config.encoder_cache_size
+ encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
+ model_config=model_config,
+ scheduler_config=scheduler_config,
+ )
+ self.max_num_encoder_input_tokens = encoder_compute_budget
+ self.encoder_cache_size = encoder_cache_size
# Lazy initialization
# self.model: nn.Module # Set after load_model
@@ -721,44 +726,30 @@ def profile_run(self) -> None:
]
# Profile with multimodal encoder & encoder cache.
- if self.is_multimodal_model:
-
- # Create dummy batch of multimodal inputs.
- dummy_request_data = self.input_registry.dummy_data_for_profiling(
- model_config=self.model_config,
- seq_len=self.max_num_tokens,
- mm_registry=self.mm_registry,
- )
- dummy_mm_data = dummy_request_data.multi_modal_data
+ # TODO: handle encoder-decoder models once we support them.
+ if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0
+ and self.encoder_cache_size > 0):
# NOTE: Currently model is profiled with a single non-text
# modality with the max possible input tokens even when
# it supports multiple.
- max_tokens_by_modality_dict = self.mm_registry.get_max_tokens_per_item_by_modality( # noqa: E501
+ max_tokens_by_modality_dict = MULTIMODAL_REGISTRY.get_max_tokens_per_item_by_nonzero_modality( # noqa: E501
self.model_config)
-
dummy_data_modality, max_tokens_per_mm_item = max(
max_tokens_by_modality_dict.items(), key=lambda item: item[1])
# Check how many items of this modality can be supported by
- # the encoder cache budget.
- encoder_cache_budget = min(self.max_num_encoder_input_tokens,
- self.encoder_cache_size)
- max_num_mm_items_encoder_budget = encoder_cache_budget // \
- max_tokens_per_mm_item
-
- # TODO: Allow users to set encoder_cache_budget in case this
- # happens.
- assert max_num_mm_items_encoder_budget > 0, (
- f"Encoder cache budget={encoder_cache_budget} is too small to "
- f"support the maximum possible size of multimodal embeddings"
- f"={max_tokens_per_mm_item}.")
+ # the encoder budget.
+ encoder_budget = min(self.max_num_encoder_input_tokens,
+ self.encoder_cache_size)
+
+ max_num_mm_items_encoder_budget = cdiv(encoder_budget,
+ max_tokens_per_mm_item)
# Check how many items of this modality can be supported by
# the decoder budget.
- max_mm_items_per_req = max(
- self.mm_registry.get_mm_limits_per_prompt(
- self.model_config).values())
+ max_mm_items_per_req = self.mm_registry.get_mm_limits_per_prompt(
+ self.model_config)[dummy_data_modality]
# NOTE: We do not consider max_num_batched_tokens on purpose
# because the multimodal embeddings can be generated in advance
@@ -769,6 +760,19 @@ def profile_run(self) -> None:
max_num_mm_items = min(max_num_mm_items_encoder_budget,
max_num_mm_items_decoder_budget)
+ logger.info(
+ "Encoder cache will be initialized with a budget of %s tokens,"
+ " and profiled with %s %s items of the maximum feature size.",
+ encoder_budget, max_num_mm_items, dummy_data_modality)
+
+ # Create dummy batch of multimodal inputs.
+ dummy_request_data = self.input_registry.dummy_data_for_profiling(
+ model_config=self.model_config,
+ seq_len=self.max_num_tokens,
+ mm_registry=self.mm_registry,
+ )
+ dummy_mm_data = dummy_request_data.multi_modal_data
+
# Dummy data definition in V0 may contain multiple multimodal items
# (e.g, multiple images) for a single request, therefore here we
# always replicate first item by max_num_mm_items times since in V1
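The budget rule added here can be summarized as: start from max_num_batched_tokens and raise the budget when a single multimodal item could exceed it, or return zero when every non-text modality is disabled. A self-contained sketch under that reading, with made-up token counts:

```python
from typing import Dict, Tuple


def compute_encoder_budget_sketch(
    max_num_batched_tokens: int,
    max_tokens_per_item_by_modality: Dict[str, int],
) -> Tuple[int, int]:
    """Return (compute_budget, cache_size) in tokens."""
    if not max_tokens_per_item_by_modality:
        # All non-text modalities disabled -> no encoder cache.
        return 0, 0
    # The largest single multimodal item must always fit in the budget.
    max_tokens_per_mm_item = max(max_tokens_per_item_by_modality.values())
    compute_budget = max(max_num_batched_tokens, max_tokens_per_mm_item)
    cache_size = max(max_num_batched_tokens, max_tokens_per_mm_item)
    return compute_budget, cache_size


# Example: an image encoder that can emit up to 9,000 tokens per image.
print(compute_encoder_budget_sketch(8192, {"image": 9000}))  # (9000, 9000)
```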
From ebd8c669efa54a218eb83735fd7ba40922f5f3ad Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath
Date: Thu, 16 Jan 2025 01:29:42 +0530
Subject: [PATCH 31/67] [Bugfix] Fix _get_lora_device for HQQ marlin (#12090)
Signed-off-by: Varun Sundar Rabindranath
Co-authored-by: Varun Sundar Rabindranath
---
vllm/lora/layers.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index a933ccaecf15e..dd981ffce8833 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -51,6 +51,9 @@ def _get_lora_device(base_layer: nn.Module) -> torch.device:
# marlin
elif hasattr(base_layer, "B"):
return base_layer.B.device
+ # HQQ marlin
+ elif hasattr(base_layer, "W_q"):
+ return base_layer.W_q.device
else:
raise ValueError(f"Unsupported base layer: {base_layer}")
From cd9d06fb8d1f89fc1bcc9305bc20d57c6d8b73d8 Mon Sep 17 00:00:00 2001
From: tvirolai-amd
Date: Wed, 15 Jan 2025 23:46:03 +0200
Subject: [PATCH 32/67] Allow hip sources to be directly included when
compiling for rocm. (#12087)
---
cmake/utils.cmake | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 40430dae10c5b..15b09395a889f 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -58,8 +58,8 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
#
set(SRCS ${ORIG_SRCS})
set(CXX_SRCS ${ORIG_SRCS})
- list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$")
- list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$")
+ list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)|(hip)$")
+ list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)|(hip)$")
#
# Generate ROCm/HIP source file names from CUDA file names.
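To make the effect of the widened filter concrete, here is a rough Python illustration of the same partitioning that the two `list(FILTER ...)` calls perform, using hypothetical file names; `.hip` sources now land in the pass-through (C++) bucket instead of the list of files to hipify:

```python
import re

# Mirrors the (unanchored) CMake regex: matches ".cc" or "cpp" anywhere in the
# name, or "hip" at the end of the name.
pattern = re.compile(r"\.(cc)|(cpp)|(hip)$")

sources = ["attention.cu", "cache_kernels.cu", "utils.cpp", "custom_op.hip"]

to_hipify = [s for s in sources if not pattern.search(s)]
passthrough = [s for s in sources if pattern.search(s)]

print(to_hipify)    # ['attention.cu', 'cache_kernels.cu']
print(passthrough)  # ['utils.cpp', 'custom_op.hip']
```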
From fa0050db08660535368ec5ea41d313bdeb69909d Mon Sep 17 00:00:00 2001
From: Elfie Guo <164945471+elfiegg@users.noreply.github.com>
Date: Wed, 15 Jan 2025 20:31:27 -0800
Subject: [PATCH 33/67] [Core] Default to using per_token quantization for fp8
when cutlass is supported. (#8651)
Signed-off-by: mgoin
Co-authored-by: Michael Goin
Co-authored-by: mgoin
---
vllm/model_executor/layers/quantization/fp8.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index a1be45a49e94a..4969ee559522e 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -355,7 +355,8 @@ def apply(self,
input_scale=layer.input_scale,
bias=bias,
cutlass_fp8_supported=self.cutlass_fp8_supported,
- use_per_token_if_dynamic=False)
+ # Default to using per_token quantization if cutlass is supported
+ use_per_token_if_dynamic=self.cutlass_fp8_supported)
class Fp8MoEMethod(FusedMoEMethodBase):
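For intuition about what the flag toggles, a small torch sketch of per-tensor versus per-token dynamic fp8 scale shapes (scale computation only; the quantized GEMM itself runs in the CUTLASS kernels):

```python
import torch

x = torch.randn(4, 8)   # (num_tokens, hidden_size) activations
fp8_max = 448.0         # max representable value of torch.float8_e4m3fn

# Per-tensor dynamic quantization: one scale for the whole activation tensor.
per_tensor_scale = x.abs().max() / fp8_max                       # shape: []

# Per-token dynamic quantization: one scale per row (token).
per_token_scale = x.abs().amax(dim=-1, keepdim=True) / fp8_max   # shape: (4, 1)

print(per_tensor_scale.shape, per_token_scale.shape)
```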
From f8ef146f03da8993fb3bf5638b28bef6e931fc51 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Thu, 16 Jan 2025 15:53:43 +0800
Subject: [PATCH 34/67] [Doc] Add documentation for specifying model
architecture (#12105)
---
docs/source/serving/offline_inference.md | 53 ++++++++++++++++++++++++
1 file changed, 53 insertions(+)
diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md
index 94703a1c32ade..1f5a54f755f13 100644
--- a/docs/source/serving/offline_inference.md
+++ b/docs/source/serving/offline_inference.md
@@ -31,6 +31,59 @@ Please refer to the above pages for more details about each API.
This section lists the most common options for running the vLLM engine.
For a full list, refer to the [Engine Arguments](#engine-args) page.
+### Model resolution
+
+vLLM loads HuggingFace-compatible models by inspecting the `architectures` field in `config.json` of the model repository
+and finding the corresponding implementation that is registered to vLLM.
+Nevertheless, our model resolution may fail for the following reasons:
+
+- The `config.json` of the model repository lacks the `architectures` field.
+- Unofficial repositories refer to a model using alternative names which are not recorded in vLLM.
+- The same architecture name is used for multiple models, creating ambiguity as to which model should be loaded.
+
+In those cases, vLLM may throw an error like:
+
+```text
+Traceback (most recent call last):
+...
+ File "vllm/model_executor/models/registry.py", line xxx, in inspect_model_cls
+ for arch in architectures:
+TypeError: 'NoneType' object is not iterable
+```
+
+or:
+
+```text
+ File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported
+ raise ValueError(
+ValueError: Model architectures [''] are not supported for now. Supported architectures: [...]
+```
+
+:::{note}
+The above error is distinct from the following similar but different error:
+
+```text
+ File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported
+ raise ValueError(
+ValueError: Model architectures [''] failed to be inspected. Please check the logs for more details.
+```
+
+This error means that vLLM failed to import the model file. Usually, it is related to missing dependencies or outdated
+binaries in the vLLM build. Please read the logs carefully to determine the real cause of the error.
+:::
+
+To fix this, explicitly specify the model architecture by passing `config.json` overrides to the `hf_overrides` option.
+For example:
+
+```python
+model = LLM(
+ model="cerebras/Cerebras-GPT-1.3B",
+ hf_overrides={"architectures": ["GPT2LMHeadModel"]}, # GPT-2
+)
+```
+
+Our [list of supported models](#supported-models) shows the model architectures that are recognized by vLLM.
+
### Reducing memory usage
Large models might cause your machine to run out of memory (OOM). Here are some options that help alleviate this problem.
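As a complement to the `hf_overrides` example above, a hypothetical pre-flight check that inspects the same `architectures` field vLLM's model resolution relies on, assuming hub access:

```python
from transformers import AutoConfig

# Inspect the `architectures` field of the repo's config.json before handing
# the model to LLM(...).
config = AutoConfig.from_pretrained("cerebras/Cerebras-GPT-1.3B")
print(config.architectures)  # e.g. a list of class names, or None if absent

if not config.architectures:
    print("config.json has no usable `architectures`; "
          "pass hf_overrides={'architectures': [...]} as shown above.")
```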
From 9aa1519f089e8dc2c2bd1b4c74a8ce47d386f0a9 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Thu, 16 Jan 2025 04:59:06 -0500
Subject: [PATCH 35/67] Various cosmetic/comment fixes (#12089)
Signed-off-by: mgoin
---
.../compressed_tensors/schemes/compressed_tensors_24.py | 2 +-
vllm/model_executor/models/aria.py | 3 +--
vllm/model_executor/models/commandr.py | 3 +--
vllm/model_executor/models/dbrx.py | 3 +--
vllm/model_executor/models/exaone.py | 3 +--
vllm/model_executor/models/gpt_j.py | 3 +--
vllm/model_executor/models/granite.py | 3 +--
vllm/model_executor/models/llama.py | 7 +++----
vllm/model_executor/models/mixtral.py | 3 +--
vllm/model_executor/models/mllama.py | 3 +--
vllm/model_executor/models/nemotron.py | 3 +--
vllm/model_executor/models/phimoe.py | 3 +--
vllm/model_executor/models/qwen2.py | 3 +--
vllm/model_executor/models/solar.py | 3 +--
14 files changed, 16 insertions(+), 29 deletions(-)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
index bc697ef93b34b..21e6fe7a22616 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
@@ -42,7 +42,7 @@ def create_weights(self, layer: torch.nn.Module, input_size: int,
if not sparse_cutlass_supported():
raise ValueError(
- "Sparse CUTLASS not supported. vLLM must be built with"
+ "Sparse CUTLASS not supported. vLLM must be built with "
"CUDA 12.2 or later to use this feature")
self.output_dtype = params_dtype
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 91225c0ddc917..5b97eced62df0 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -390,8 +390,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
continue
if (self.quant_config is not None and
(scale_name := self.quant_config.get_cache_scale(name))):
- # Loading kv cache scales for quark and
- # compressed-tensors quantization
+ # Loading kv cache quantization scales
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 6517422697c04..989056bf5c155 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -440,8 +440,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
if (self.quant_config is not None and
(scale_name := self.quant_config.get_cache_scale(name))):
- # Loading kv cache scales for quark and
- # compressed-tensors quantization
+ # Loading kv cache quantization scales
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index ff1f1c2a939ff..b2aa3c0709bd4 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -452,8 +452,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
for name, loaded_weight in weights:
if (self.quant_config is not None and
(scale_name := self.quant_config.get_cache_scale(name))):
- # Loading kv cache scales for quark and
- # compressed-tensors quantization
+ # Loading kv cache quantization scales
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index ac679d6ff43c7..eab3bf0756fca 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -533,8 +533,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
continue
if (self.quant_config is not None and
(scale_name := self.quant_config.get_cache_scale(name))):
- # Loading kv cache scales for quark and
- # compressed-tensors quantization
+ # Loading kv cache quantization scales
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 56343ca9a71ac..08298cc0db36f 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -316,8 +316,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
if (self.quant_config is not None and
(scale_name := self.quant_config.get_cache_scale(name))):
- # Loading kv cache scales for quark and
- # compressed-tensors quantization
+ # Loading kv cache quantization scales
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index 67e04b57658bc..ddd2d7a16b242 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -475,8 +475,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
continue
if (self.quant_config is not None and
(scale_name := self.quant_config.get_cache_scale(name))):
- # Loading kv cache scales for quark and
- # compressed-tensors quantization
+ # Loading kv cache quantization scales
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 4667f275ecd33..a5bd418801f2c 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -105,9 +105,9 @@ def __init__(self,
max_position_embeddings: int = 8192,
quant_config: Optional[QuantizationConfig] = None,
bias: bool = False,
+ bias_o_proj: bool = False,
cache_config: Optional[CacheConfig] = None,
- prefix: str = "",
- bias_o_proj: bool = False) -> None:
+ prefix: str = "") -> None:
super().__init__()
layer_idx = extract_layer_index(prefix)
self.hidden_size = hidden_size
@@ -397,8 +397,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
continue
if (self.quant_config is not None and
(scale_name := self.quant_config.get_cache_scale(name))):
- # Loading kv cache scales for quark and
- # compressed-tensors quantization
+ # Loading kv cache quantization scales
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 2c8895e842996..da415cdae96ed 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -431,8 +431,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
if (self.quant_config is not None and
(scale_name := self.quant_config.get_cache_scale(name))):
- # Loading kv cache scales for quark and
- # compressed-tensors quantization
+ # Loading kv cache quantization scales
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index bd261f31499cb..2554281610a30 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -1432,8 +1432,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
loaded_weight = loaded_weight.view(loaded_weight.shape[0], -1)
if (self.quant_config is not None and
(scale_name := self.quant_config.get_cache_scale(name))):
- # Loading kv cache scales for quark and
- # compressed-tensors quantization
+ # Loading kv cache quantization scales
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index e7875e6fb889f..2340283b69665 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -492,8 +492,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
continue
if (self.quant_config is not None and
(scale_name := self.quant_config.get_cache_scale(name))):
- # Loading kv cache scales for quark and
- # compressed-tensors quantization
+ # Loading kv cache quantization scales
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index dc76818e22cbb..881c09ea9db99 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -626,8 +626,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
if (self.quant_config is not None and
(scale_name := self.quant_config.get_cache_scale(name))):
- # Loading kv cache scales for quark and
- # compressed-tensors quantization
+ # Loading kv cache quantization scales
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index b9c259ad73c40..d015f60c6d065 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -367,8 +367,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
continue
if (self.quant_config is not None and
(scale_name := self.quant_config.get_cache_scale(name))):
- # Loading kv cache scales for quark and
- # compressed-tensors quantization
+ # Loading kv cache quantization scales
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index b27d2b10850f5..37c5a4b5713b8 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -492,8 +492,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
continue
if (self.quant_config is not None and
(scale_name := self.quant_config.get_cache_scale(name))):
- # Loading kv cache scales for quark and
- # compressed-tensors quantization
+ # Loading kv cache quantization scales
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
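All of the re-commented blocks follow one pattern: ask the quant config whether a checkpoint weight name maps to a kv-cache scale parameter and, if so, route it through that parameter's weight loader. A schematic sketch of that pattern; the names here are placeholders rather than the exact vLLM interfaces:

```python
from typing import Dict, Iterable, Optional, Tuple

import torch


def load_kv_cache_scales_sketch(
    params_dict: Dict[str, torch.nn.Parameter],
    weights: Iterable[Tuple[str, torch.Tensor]],
    get_cache_scale,  # callable mimicking quant_config.get_cache_scale
) -> None:
    for name, loaded_weight in weights:
        # Loading kv cache quantization scales
        scale_name: Optional[str] = get_cache_scale(name)
        if scale_name is not None:
            param = params_dict[scale_name]
            weight_loader = getattr(param, "weight_loader", None)
            if weight_loader is not None:
                weight_loader(param, loaded_weight)
            else:
                param.data.copy_(loaded_weight)
            continue
        # ... regular weight-loading path for everything else ...
```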
From dd7c9ad87074b68d201208a196e0a4b2b5ecc27a Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Thu, 16 Jan 2025 18:11:54 +0800
Subject: [PATCH 36/67] [Bugfix] Remove hardcoded `head_size=256` for Deepseek
v2 and v3 (#12067)
Signed-off-by: Isotr0py <2037008807@qq.com>
---
tests/kernels/test_attention.py | 6 +++---
vllm/config.py | 9 ++++++---
vllm/model_executor/models/deepseek_v2.py | 24 +++++++----------------
vllm/model_executor/models/deepseek_v3.py | 24 +++++++----------------
4 files changed, 23 insertions(+), 40 deletions(-)
diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py
index 3e3c0668198ad..124d5d297a574 100644
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -31,9 +31,9 @@
NUM_PREFILL_SEQS = [3] # Arbitrary values for testing
NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing
-# FlashAttention forward only supports head dimension at most 128
-# https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62
-HEAD_SIZES = [64, 80, 120, 256]
+# This should be sync with get_supported_head_sizes() in
+# vllm.attention.ops.paged_attn.PagedAttention
+HEAD_SIZES = [32, 64, 80, 96, 112, 120, 128, 192, 256]
BLOCK_SIZES = [16, 32]
USE_ALIBI = [False, True]
diff --git a/vllm/config.py b/vllm/config.py
index e64883368a751..2fe674b857e16 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -733,9 +733,12 @@ def get_head_size(self) -> int:
if hasattr(self.hf_text_config,
"model_type") and (self.hf_text_config.model_type
in ('deepseek_v2', 'deepseek_v3')):
- # FlashAttention supports only head_size 32, 64, 128, 256,
- # we need to pad head_size 192 to 256
- return 256
+ qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim",
+ 0)
+ qk_nope_head_dim = getattr(self.hf_text_config, "qk_nope_head_dim",
+ 0)
+ if qk_rope_head_dim and qk_nope_head_dim:
+ return qk_rope_head_dim + qk_nope_head_dim
if self.is_attention_free:
return 0
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index d83cafaf998ab..af6810a140b43 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -262,14 +262,8 @@ def __init__(
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
self.scaling = self.scaling * mscale * mscale
- # self.attn = Attention(self.num_heads,
- # self.qk_head_dim,
- # self.scaling,
- # num_kv_heads=self.num_heads)
-
- # TODO, support head_size 192
self.attn = Attention(self.num_local_heads,
- 256,
+ self.qk_head_dim,
self.scaling,
num_kv_heads=self.num_local_heads,
cache_config=cache_config,
@@ -319,18 +313,14 @@ def forward(
k = torch.empty_like(q)
k[..., :self.qk_nope_head_dim] = k_nope
k[..., self.qk_nope_head_dim:] = k_pe
- q = torch.nn.functional.pad(q, [0, 256 - self.qk_head_dim],
- value=0).view(-1,
- self.num_local_heads * 256)
- k = torch.nn.functional.pad(k, [0, 256 - self.qk_head_dim],
- value=0).view(-1,
- self.num_local_heads * 256)
- v = torch.nn.functional.pad(v, [0, 256 - self.v_head_dim],
- value=0).view(-1,
- self.num_local_heads * 256)
+ # pad v with zeros up to qk_head_dim for alignment
+ v = torch.nn.functional.pad(
+ v, [0, self.qk_head_dim - self.v_head_dim],
+ value=0).view(-1, self.num_local_heads * self.qk_head_dim)
attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
attn_output = attn_output.view(
- -1, self.num_local_heads, 256)[..., :self.v_head_dim].reshape(
+ -1, self.num_local_heads,
+ self.qk_head_dim)[..., :self.v_head_dim].reshape(
-1, self.num_local_heads * self.v_head_dim)
output, _ = self.o_proj(attn_output)
return output
diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py
index ca79b14c55fea..0b44f0d062c40 100644
--- a/vllm/model_executor/models/deepseek_v3.py
+++ b/vllm/model_executor/models/deepseek_v3.py
@@ -269,14 +269,8 @@ def __init__(
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
self.scaling = self.scaling * mscale * mscale
- # self.attn = Attention(self.num_heads,
- # self.qk_head_dim,
- # self.scaling,
- # num_kv_heads=self.num_heads)
-
- # TODO, support head_size 192
self.attn = Attention(self.num_local_heads,
- 256,
+ self.qk_head_dim,
self.scaling,
num_kv_heads=self.num_local_heads,
cache_config=cache_config,
@@ -326,18 +320,14 @@ def forward(
k = torch.empty_like(q)
k[..., :self.qk_nope_head_dim] = k_nope
k[..., self.qk_nope_head_dim:] = k_pe
- q = torch.nn.functional.pad(q, [0, 256 - self.qk_head_dim],
- value=0).view(-1,
- self.num_local_heads * 256)
- k = torch.nn.functional.pad(k, [0, 256 - self.qk_head_dim],
- value=0).view(-1,
- self.num_local_heads * 256)
- v = torch.nn.functional.pad(v, [0, 256 - self.v_head_dim],
- value=0).view(-1,
- self.num_local_heads * 256)
+ # pad v with zeros up to qk_head_dim for alignment
+ v = torch.nn.functional.pad(
+ v, [0, self.qk_head_dim - self.v_head_dim],
+ value=0).view(-1, self.num_local_heads * self.qk_head_dim)
attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
attn_output = attn_output.view(
- -1, self.num_local_heads, 256)[..., :self.v_head_dim].reshape(
+ -1, self.num_local_heads,
+ self.qk_head_dim)[..., :self.v_head_dim].reshape(
-1, self.num_local_heads * self.v_head_dim)
output, _ = self.o_proj(attn_output)
return output
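A toy numeric check of the new head-size rule, using DeepSeek-V2-style dimensions as assumed example values: the attention head size becomes qk_rope_head_dim + qk_nope_head_dim, and only v is zero-padded up to it and sliced back afterwards.

```python
import torch
import torch.nn.functional as F

# Illustrative dimensions in the DeepSeek-V2 style (example values only).
qk_rope_head_dim = 64
qk_nope_head_dim = 128
v_head_dim = 128
qk_head_dim = qk_rope_head_dim + qk_nope_head_dim   # 192, no padding to 256

num_tokens, num_heads = 4, 2
v = torch.randn(num_tokens, num_heads, v_head_dim)

# Pad v with zeros up to qk_head_dim so q, k and v share one head size.
v_padded = F.pad(v, [0, qk_head_dim - v_head_dim], value=0)
print(v_padded.shape)  # torch.Size([4, 2, 192])

# After attention, the extra columns are sliced away again.
attn_out = v_padded[..., :v_head_dim]
print(attn_out.shape)  # torch.Size([4, 2, 128])
```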
From 40bb71fd83b0569ada0b38964cdee5cd731d501c Mon Sep 17 00:00:00 2001
From: Nir David <124874956+nirda7@users.noreply.github.com>
Date: Thu, 16 Jan 2025 13:17:19 +0200
Subject: [PATCH 37/67] Fix weights load device use (#686)
Some hardware backends (like TPU) change the device in the device_config after
its initial setting (None) in their worker's init_device function, so we cannot
fix the device for loading the unquantized weights at initialization time.
---
vllm/engine/arg_utils.py | 10 +++-------
vllm/model_executor/model_loader/loader.py | 12 ++++++------
vllm/worker/hpu_model_runner.py | 1 +
3 files changed, 10 insertions(+), 13 deletions(-)
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 8692d7d49234d..85ec168af6a75 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1026,13 +1026,11 @@ def create_model_config(self) -> ModelConfig:
logits_processor_pattern=self.logits_processor_pattern,
generation_config=self.generation_config)
- def create_load_config(self, load_device=None) -> LoadConfig:
- if load_device is None:
- load_device = DeviceConfig(device=self.device).device
+ def create_load_config(self) -> LoadConfig:
return LoadConfig(
load_format=self.load_format,
download_dir=self.download_dir,
- device=load_device,
+ device=self.weights_load_device,
model_loader_extra_config=self.model_loader_extra_config,
ignore_patterns=self.ignore_patterns,
)
@@ -1245,9 +1243,7 @@ def create_engine_config(self,
self.model_loader_extra_config[
"qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path
- load_device = device_config.device if self.weights_load_device is \
- None else self.weights_load_device
- load_config = self.create_load_config(load_device)
+ load_config = self.create_load_config()
prompt_adapter_config = PromptAdapterConfig(
max_prompt_adapters=self.max_prompt_adapters,
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 5eb47076d044c..01295f0002b89 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -51,7 +51,7 @@
from vllm.platforms import current_platform
from vllm.transformers_utils.s3_utils import glob as s3_glob
from vllm.transformers_utils.utils import is_s3
-from vllm.utils import is_fake_hpu, is_pin_memory_available
+from vllm.utils import is_pin_memory_available
@contextmanager
@@ -367,17 +367,17 @@ def download_model(self, model_config: ModelConfig) -> None:
def load_model(self, vllm_config: VllmConfig) -> nn.Module:
device_config = vllm_config.device_config
+ load_config = vllm_config.load_config
model_config = vllm_config.model_config
- target_device = torch.device(device_config.device)
+ load_device = device_config.device if load_config.device is None else \
+ load_config.device
+ target_device = torch.device(load_device)
with set_default_torch_dtype(model_config.dtype):
- target_device = torch.device(
- device_config.device) if is_fake_hpu() else torch.device(
- self.load_config.device)
with target_device:
model = _initialize_model(vllm_config=vllm_config)
- logger.info("Loading weights on %s...", self.load_config.device)
+ logger.info("Loading weights on %s...", load_device)
weights_to_load = {name for name, _ in model.named_parameters()}
loaded_weights = model.load_weights(
self._get_all_weights(model_config, model))
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 2dfff9414977b..8b3d7ba9c422c 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -606,6 +606,7 @@ def __init__(
if is_fake_hpu():
self.device_config.device = torch.device('cpu')
self.device_config.device_type = 'cpu'
+ self.load_config.device = None
self.device = self.device_config.device
self.enforce_eager = self.model_config.enforce_eager
self.max_num_seqs = self.scheduler_config.max_num_seqs
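The resolution order after this patch reduces to: use load_config.device when it is set, otherwise fall back to the runtime device_config (which the HPU runner now forces for fake-HPU runs by clearing load_config.device). A condensed sketch; the config classes below only loosely mirror the real ones:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class DeviceConfigSketch:
    device: str                   # e.g. "cuda", "hpu", "cpu"


@dataclass
class LoadConfigSketch:
    device: Optional[str] = None  # --weights-load-device, if given


def resolve_load_device(device_config: DeviceConfigSketch,
                        load_config: LoadConfigSketch) -> str:
    # Defer to the (possibly late-initialized) runtime device only when no
    # explicit weights-load device was requested.
    return (device_config.device
            if load_config.device is None else load_config.device)


print(resolve_load_device(DeviceConfigSketch("hpu"), LoadConfigSketch()))       # hpu
print(resolve_load_device(DeviceConfigSketch("hpu"), LoadConfigSketch("cpu")))  # cpu
```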
From aaaac6c6258f9fd91360d985b4c25b6761c7b1bf Mon Sep 17 00:00:00 2001
From: Agata Dobrzyniewicz
Date: Thu, 16 Jan 2025 13:24:33 +0200
Subject: [PATCH 38/67] format
---
vllm/worker/hpu_model_runner.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 135af74fd686c..303ccf7cfcf88 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -884,7 +884,8 @@ def _prepare_prompt(
if seq_group_metadata.multi_modal_data:
positions = input_positions[0]
mm_data, placeholder_maps = MultiModalPlaceholderMap \
- .from_seq_group(seq_group_metadata, range(positions[0], positions[0] + len(positions)))
+ .from_seq_group(seq_group_metadata,
+ range(positions[0], positions[0] + len(positions)))
if self.mm_registry.has_processor(self.model_config):
mm_kwargs = mm_data
@@ -1041,8 +1042,7 @@ def _prepare_prompt(
num_prefill_tokens=num_prefill_tokens,
num_decode_tokens=0,
slot_mapping=slot_mapping,
- multi_modal_placeholder_index_maps=placeholder_index_maps
- )
+ multi_modal_placeholder_index_maps=placeholder_index_maps)
multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)
for t in multi_modal_kwargs:
if torch.is_tensor(multi_modal_kwargs[t]):
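For reference, the reformatted call derives the placeholder range directly from the prompt's position list; a tiny sketch of that arithmetic with a made-up positions list:

```python
# input_positions holds one list of token positions per prompt in the batch.
input_positions = [[5, 6, 7, 8, 9]]   # e.g. context_len=5, seq_len=10

positions = input_positions[0]
# Range covered by this prefill chunk, as passed to
# MultiModalPlaceholderMap.from_seq_group(...) above.
positions_range = range(positions[0], positions[0] + len(positions))
print(list(positions_range))          # [5, 6, 7, 8, 9]
```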
From bf53e0c70b0fe17087914cc770fd801e0bf02137 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Thu, 16 Jan 2025 19:58:53 +0800
Subject: [PATCH 39/67] Support torchrun and SPMD-style offline inference
(#12071)
Signed-off-by: youkaichao
---
.buildkite/test-pipeline.yaml | 1 +
.../offline_inference/torchrun_example.py | 64 +++++++++++++++
tests/distributed/test_torchrun_example.py | 56 +++++++++++++
tests/engine/test_multiproc_workers.py | 2 +-
vllm/config.py | 7 +-
vllm/engine/arg_utils.py | 2 +-
vllm/engine/llm_engine.py | 5 ++
vllm/executor/ray_distributed_executor.py | 6 +-
vllm/executor/uniproc_executor.py | 81 ++++++++++++++++++-
vllm/lora/layers.py | 4 +-
.../model_executor/layers/logits_processor.py | 16 ++--
vllm/v1/executor/multiproc_executor.py | 2 +-
vllm/worker/worker.py | 3 -
vllm/worker/worker_base.py | 29 ++++---
14 files changed, 248 insertions(+), 30 deletions(-)
create mode 100644 examples/offline_inference/torchrun_example.py
create mode 100644 tests/distributed/test_torchrun_example.py
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 74b287c7adbfa..00fed96c1ac8c 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -463,6 +463,7 @@ steps:
- vllm/worker/worker.py
- vllm/worker/model_runner.py
commands:
+ - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py
new file mode 100644
index 0000000000000..b6de73eb7266e
--- /dev/null
+++ b/examples/offline_inference/torchrun_example.py
@@ -0,0 +1,64 @@
+"""
+experimental support for tensor-parallel inference with torchrun,
+see https://github.com/vllm-project/vllm/issues/11400 for
+the motivation and use case for this example.
+run the script with `torchrun --nproc-per-node=2 torchrun_example.py`,
+the argument 2 should match the `tensor_parallel_size` below.
+see `tests/distributed/test_torchrun_example.py` for the unit test.
+"""
+
+from vllm import LLM, SamplingParams
+
+# Create prompts, the same across all ranks
+prompts = [
+ "Hello, my name is",
+ "The president of the United States is",
+ "The capital of France is",
+ "The future of AI is",
+]
+
+# Create sampling parameters, the same across all ranks
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# Use `distributed_executor_backend="external_launcher"` so that
+# this llm engine/instance only creates one worker.
+llm = LLM(
+ model="facebook/opt-125m",
+ tensor_parallel_size=2,
+ distributed_executor_backend="external_launcher",
+)
+
+outputs = llm.generate(prompts, sampling_params)
+
+# all ranks will have the same outputs
+for output in outputs:
+ prompt = output.prompt
+ generated_text = output.outputs[0].text
+ print(f"Prompt: {prompt!r}, "
+ f"Generated text: {generated_text!r}")
+"""
+Further tips:
+
+1. to communicate control messages across all ranks, use the cpu group,
+a PyTorch ProcessGroup with GLOO backend.
+
+```python
+from vllm.distributed.parallel_state import get_world_group
+cpu_group = get_world_group().cpu_group
+torch_rank = dist.get_rank(group=cpu_group)
+if torch_rank == 0:
+ # do something for rank 0, e.g. saving the results to disk.
+```
+
+2. to communicate data across all ranks, use the model's device group,
+a PyTorch ProcessGroup with NCCL backend.
+```python
+from vllm.distributed.parallel_state import get_world_group
+device_group = get_world_group().device_group
+```
+
+3. to access the model directly in every rank, use the following code:
+```python
+llm.llm_engine.model_executor.driver_worker.worker.model_runner.model
+```
+"""
diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py
new file mode 100644
index 0000000000000..7aa03d7f0402a
--- /dev/null
+++ b/tests/distributed/test_torchrun_example.py
@@ -0,0 +1,56 @@
+# unit test for `examples/offline_inference/torchrun_example.py`
+
+import random
+
+import torch.distributed as dist
+
+from vllm import LLM, SamplingParams
+from vllm.distributed.parallel_state import get_world_group
+
+# Create prompts
+prompts = [
+ "Hello, my name is",
+ "The president of the United States is",
+ "The capital of France is",
+ "The future of AI is",
+]
+
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# set different `gpu_memory_utilization` and `swap_space` for different ranks,
+# to test if all ranks agree on the same kv cache configuration.
+llm = LLM(model="facebook/opt-125m",
+ tensor_parallel_size=2,
+ distributed_executor_backend="external_launcher",
+ gpu_memory_utilization=random.uniform(0.7, 0.9),
+ swap_space=random.randint(1, 4))
+
+outputs = llm.generate(prompts, sampling_params)
+
+cpu_group = get_world_group().cpu_group
+
+torch_rank = dist.get_rank(group=cpu_group)
+
+
+def test_consistent_across_ranks(obj):
+ if torch_rank == 0:
+ dist.broadcast_object_list([obj], src=0, group=cpu_group)
+ else:
+ container = [None]
+ dist.broadcast_object_list(container, src=0, group=cpu_group)
+ assert container[0] == obj
+
+
+test_consistent_across_ranks(
+ llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
+test_consistent_across_ranks(
+ llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)
+
+# all ranks should have the same outputs
+for output in outputs:
+ prompt = output.prompt
+ generated_text = output.outputs[0].text
+ test_consistent_across_ranks(prompt)
+ test_consistent_across_ranks(generated_text)
+ print(f"Rank {torch_rank}, Prompt: {prompt!r}, "
+ f"Generated text: {generated_text!r}")
diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py
index db70a808c008b..04505fcaae24b 100644
--- a/tests/engine/test_multiproc_workers.py
+++ b/tests/engine/test_multiproc_workers.py
@@ -22,7 +22,7 @@ def worker_method(self, worker_input: Any) -> Tuple[int, Any]:
# simulate error case
raise worker_input
- return self.rank, input
+ return self.rpc_rank, input
def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]:
diff --git a/vllm/config.py b/vllm/config.py
index 2fe674b857e16..a5f2161068d2a 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1338,14 +1338,15 @@ def _verify_args(self) -> None:
from vllm.executor.executor_base import ExecutorBase
from vllm.platforms import current_platform
if self.distributed_executor_backend not in (
- "ray", "mp", "uni", None) and not (isinstance(
+ "ray", "mp", "uni",
+ "external_launcher", None) and not (isinstance(
self.distributed_executor_backend, type) and issubclass(
self.distributed_executor_backend, ExecutorBase)):
raise ValueError(
"Unrecognized distributed executor backend "
f"{self.distributed_executor_backend}. Supported "
- "values are 'ray', 'mp' 'uni', or custom ExecutorBase"
- " subclass.")
+ "values are 'ray', 'mp' 'uni', 'external_launcher' or"
+ " custom ExecutorBase subclass.")
if self.use_ray:
from vllm.executor import ray_utils
ray_utils.assert_ray_available()
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 03a8959a7d9ff..a4f4c9558d056 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -388,7 +388,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
# Parallel arguments
parser.add_argument(
'--distributed-executor-backend',
- choices=['ray', 'mp'],
+ choices=['ray', 'mp', 'uni', 'external_launcher'],
default=EngineArgs.distributed_executor_backend,
help='Backend to use for distributed model '
'workers, either "ray" or "mp" (multiprocessing). If the product '
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 49a1e9f505d9f..5d19ce03d5b58 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -457,6 +457,11 @@ def _get_executor_cls(cls,
# JAX-style, single-process, multi-device executor.
from vllm.executor.uniproc_executor import UniProcExecutor
executor_class = UniProcExecutor
+ elif distributed_executor_backend == "external_launcher":
+ # executor with external launcher
+ from vllm.executor.uniproc_executor import ( # noqa
+ ExecutorWithExternalLauncher)
+ executor_class = ExecutorWithExternalLauncher
else:
from vllm.executor.uniproc_executor import UniProcExecutor
executor_class = UniProcExecutor
diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py
index edceece4b68dc..3baeb63918a62 100644
--- a/vllm/executor/ray_distributed_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -172,7 +172,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
scheduling_strategy=scheduling_strategy,
**ray_remote_kwargs,
)(RayWorkerWrapper).remote(vllm_config=self.vllm_config,
- rank=rank)
+ rpc_rank=rank)
else:
worker = ray.remote(
num_cpus=0,
@@ -181,7 +181,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
scheduling_strategy=scheduling_strategy,
**ray_remote_kwargs,
)(RayWorkerWrapper).remote(vllm_config=self.vllm_config,
- rank=rank)
+ rpc_rank=rank)
worker_metadata.append(
RayWorkerMetaData(worker=worker, created_rank=rank))
rank += 1
@@ -204,7 +204,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
# as the resource holder for the driver process.
self.driver_dummy_worker = worker
self.driver_worker = RayWorkerWrapper(
- vllm_config=self.vllm_config, rank=0)
+ vllm_config=self.vllm_config, rpc_rank=0)
worker_metadata.pop(i)
break
diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py
index da1d77343cf3b..27b83e95ba95b 100644
--- a/vllm/executor/uniproc_executor.py
+++ b/vllm/executor/uniproc_executor.py
@@ -1,5 +1,10 @@
+import os
from typing import Any, Dict, List, Optional, Tuple
+import torch
+import torch.distributed as dist
+
+import vllm.envs as envs
from vllm.executor.executor_base import ExecutorBase
from vllm.logger import init_logger
from vllm.utils import get_distributed_init_method, get_ip, get_open_port
@@ -16,7 +21,7 @@ def _init_executor(self) -> None:
"""Initialize the worker and load the model.
"""
self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config,
- rank=0)
+ rpc_rank=0)
distributed_init_method = get_distributed_init_method(
get_ip(), get_open_port())
local_rank = 0
@@ -55,3 +60,77 @@ def check_health(self) -> None:
UniProcExecutorAsync = UniProcExecutor
+
+
+class ExecutorWithExternalLauncher(UniProcExecutor):
+ """An executor that uses external launchers to launch engines,
+ specially designed for torchrun-compatible launchers, for
+ offline inference with tensor parallelism.
+
+ see https://github.com/vllm-project/vllm/issues/11400 for
+ the motivation, and examples/offline_inference/torchrun_example.py
+ for the usage example.
+
+ The key idea: although it is tensor-parallel inference, we only
+ create one worker per executor, users will launch multiple
+ engines with torchrun-compatible launchers, and all these engines
+ work together to process the same prompts. When scheduling is
+ deterministic, all the engines will generate the same outputs,
+ and they don't need to synchronize the states with each other.
+ """
+ uses_ray: bool = False
+
+ def _init_executor(self) -> None:
+ """Initialize the worker and load the model.
+ """
+ assert self.vllm_config.parallel_config.pipeline_parallel_size == 1, \
+ ("ExecutorWithExternalLauncher does not "
+ "support pipeline parallelism.")
+ assert self.vllm_config.scheduler_config.delay_factor == 0.0, \
+ ("ExecutorWithExternalLauncher needs deterministic "
+ "execution, so it"
+ "does not support delay_factor in scheduling")
+ assert not envs.VLLM_USE_V1, \
+ ("V1 architecture cannot guarantee deterministic execution, "
+ "so it is not supported in ExecutorWithExternalLauncher.")
+ self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config,
+ rpc_rank=0)
+ # engines are launched in torchrun-compatible launchers
+ # so we can use the env:// method.
+ # required env vars:
+ # - RANK
+ # - MASTER_ADDR
+ # - MASTER_PORT
+ distributed_init_method = "env://"
+ rank = int(os.environ["RANK"])
+ local_rank = rank
+ is_driver_worker = True
+ kwargs = dict(
+ vllm_config=self.vllm_config,
+ local_rank=local_rank,
+ rank=rank,
+ distributed_init_method=distributed_init_method,
+ is_driver_worker=is_driver_worker,
+ )
+ self.collective_rpc("init_worker", args=([kwargs], ))
+ self.collective_rpc("init_device")
+ self.collective_rpc("load_model")
+
+ def determine_num_available_blocks(self) -> Tuple[int, int]:
+ """
+ Determine the number of available KV blocks.
+ Add an additional all_reduce to take the minimum across all ranks.
+ Note that even with the same `gpu_memory_utilization` and
+ `swap_space`, the available memory can still differ between ranks
+ because NCCL may consume different amounts of memory on each rank.
+ The reduction is therefore needed so that all ranks end up with the
+ same KV cache configuration.
+ """
+ a, b = super().determine_num_available_blocks()
+ from vllm.distributed.parallel_state import get_world_group
+ cpu_group = get_world_group().cpu_group
+ a_tensor = torch.tensor([a], device="cpu", dtype=torch.int64)
+ b_tensor = torch.tensor([b], device="cpu", dtype=torch.int64)
+ dist.all_reduce(a_tensor, group=cpu_group, op=dist.ReduceOp.MIN)
+ dist.all_reduce(b_tensor, group=cpu_group, op=dist.ReduceOp.MIN)
+ return a_tensor.item(), b_tensor.item()
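Editor's sketch (not part of the patch), illustrating the MIN-reduction pattern used by `determine_num_available_blocks` above: each rank measures a possibly different value, and a CPU all_reduce with ReduceOp.MIN leaves every rank holding the same number. It assumes the processes were started by a torchrun-compatible launcher so that `env://` initialization works; all names are illustrative.

    import os

    import torch
    import torch.distributed as dist

    dist.init_process_group(backend="gloo", init_method="env://")
    # Pretend each rank measured a different number of free KV blocks.
    local_blocks = 1000 + int(os.environ["RANK"])
    t = torch.tensor([local_blocks], dtype=torch.int64)
    dist.all_reduce(t, op=dist.ReduceOp.MIN)
    # Every rank now agrees on the minimum, so they build identical KV caches.
    assert t.item() == 1000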
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index dd981ffce8833..e6f26d2b74b2f 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -940,8 +940,8 @@ def soft_cap(self):
return self.base_layer.soft_cap
@property
- def use_gather(self):
- return self.base_layer.use_gather
+ def use_all_gather(self):
+ return self.base_layer.use_all_gather
@property
def org_vocab_size(self):
diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py
index 2bc7e458494f7..42decde1d0f79 100644
--- a/vllm/model_executor/layers/logits_processor.py
+++ b/vllm/model_executor/layers/logits_processor.py
@@ -6,6 +6,7 @@
import torch.nn as nn
import vllm.envs as envs
+from vllm.config import get_current_vllm_config
from vllm.distributed import (tensor_model_parallel_all_gather,
tensor_model_parallel_gather)
from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -44,8 +45,10 @@ def __init__(self,
self.soft_cap = soft_cap
# Whether to use gather or all-gather to gather the logits.
- self.use_gather = not current_platform.is_tpu(
- ) and not envs.VLLM_USE_V1
+ parallel_config = get_current_vllm_config().parallel_config
+ self.use_all_gather = current_platform.is_tpu() \
+ or envs.VLLM_USE_V1 \
+ or parallel_config.distributed_executor_backend == "external_launcher" # noqa
def forward(
self,
@@ -88,16 +91,17 @@ def _get_logits(
logits = lm_head.linear_method.apply(lm_head,
hidden_states,
bias=embedding_bias)
- if self.use_gather:
- # None may be returned for rank > 0
- logits = tensor_model_parallel_gather(logits)
- else:
+
+ if self.use_all_gather:
# Gather is not supported for some devices such as TPUs.
# Use all-gather instead.
# NOTE(woosuk): Here, the outputs of every device should not be None
# because XLA requires strict SPMD among all devices. Every device
# should execute the same operations after gathering the logits.
logits = tensor_model_parallel_all_gather(logits)
+ else:
+ # None may be returned for rank > 0
+ logits = tensor_model_parallel_gather(logits)
# Remove paddings in vocab (if any).
if logits is not None:
logits = logits[..., :self.org_vocab_size]
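Editor's sketch (not from the patch) of why the `external_launcher` backend needs all-gather here: with a plain gather only the driver rank receives the full logits (other ranks get None), but with the external launcher every process acts as its own driver and must be able to sample locally. An illustrative standalone example, assuming a torchrun-style launch and a gloo group:

    import torch
    import torch.distributed as dist

    dist.init_process_group(backend="gloo", init_method="env://")
    rank, world = dist.get_rank(), dist.get_world_size()
    shard = torch.full((2,), float(rank))              # this rank's vocab shard
    gathered = [torch.empty_like(shard) for _ in range(world)]
    dist.all_gather(gathered, shard)                   # every rank gets all shards
    full_logits = torch.cat(gathered)                  # usable on every rank
    print(f"rank {rank}: {full_logits.tolist()}")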
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index cee0fcc0bad68..e111ac7ee8183 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -246,7 +246,7 @@ def __init__(
ready_path: str,
):
self.rank = rank
- wrapper = WorkerWrapperBase(vllm_config=vllm_config, rank=rank)
+ wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank)
# TODO: move `init_worker` to executor level as a collective rpc call
all_kwargs: List[Dict] = [
{} for _ in range(vllm_config.parallel_config.world_size)
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index a3e377ef2b19d..43eeb287d64eb 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -55,9 +55,6 @@ def __init__(
self.rank = rank
self.distributed_init_method = distributed_init_method
self.is_driver_worker = is_driver_worker
- if is_driver_worker:
- assert rank % self.parallel_config.tensor_parallel_size == 0, \
- "Driver worker should be rank 0 of tensor parallel group."
if self.model_config.trust_remote_code:
# note: lazy import to avoid importing torch before initializing
from vllm.utils import init_cached_hf_modules
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 7c14b8344b49e..d464b614b12f1 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -461,7 +461,8 @@ def _execute_model_spmd(
class WorkerWrapperBase:
"""
- The whole point of this class is to lazily initialize the worker.
+ This class represents one process in an executor/engine. It is responsible
+ for lazily initializing the worker and handling the worker's lifecycle.
We first instantiate the WorkerWrapper, which remembers the worker module
and class name. Then, when we call `update_environment_variables`, and the
real initialization happens in `init_worker`.
@@ -470,9 +471,19 @@ class WorkerWrapperBase:
def __init__(
self,
vllm_config: VllmConfig,
- rank: int = 0,
+ rpc_rank: int = 0,
) -> None:
- self.rank = rank
+ """
+ Initialize the worker wrapper with the given vllm_config and rpc_rank.
+ Note: rpc_rank is the rank of the worker within the executor. In most
+ cases it is also the rank of the worker in the distributed group, but
+ the two can differ when multiple executors work together.
+ For example, in SPMD-style offline inference with TP=2, users can
+ launch 2 engines/executors, each with only 1 worker. All workers then
+ have rpc_rank=0, but they have different ranks in the TP group.
+ """
+ self.rpc_rank = rpc_rank
self.vllm_config = vllm_config
self.worker: Optional[WorkerBase] = None
if vllm_config.model_config is not None:
@@ -485,16 +496,16 @@ def __init__(
def adjust_rank(self, rank_mapping: Dict[int, int]) -> None:
"""
- Adjust the rank based on the given mapping.
+ Adjust the rpc_rank based on the given mapping.
It is only used during the initialization of the executor,
- to adjust the rank of workers after we create all workers.
+ to adjust the rpc_rank of workers after we create all workers.
"""
- if self.rank in rank_mapping:
- self.rank = rank_mapping[self.rank]
+ if self.rpc_rank in rank_mapping:
+ self.rpc_rank = rank_mapping[self.rpc_rank]
def update_environment_variables(self, envs_list: List[Dict[str,
str]]) -> None:
- envs = envs_list[self.rank]
+ envs = envs_list[self.rpc_rank]
key = 'CUDA_VISIBLE_DEVICES'
if key in envs and key in os.environ:
# overwriting CUDA_VISIBLE_DEVICES is desired behavior
@@ -507,7 +518,7 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None:
Here we inject some common logic before initializing the worker.
Arguments are passed to the worker class constructor.
"""
- kwargs = all_kwargs[self.rank]
+ kwargs = all_kwargs[self.rpc_rank]
enable_trace_function_call_for_thread(self.vllm_config)
# see https://github.com/NVIDIA/nccl/issues/1234
From 92e793d91a1a4e982662ecca0096e5edcafd21c6 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Thu, 16 Jan 2025 20:19:52 +0800
Subject: [PATCH 40/67] [core] LLM.collective_rpc interface and RLHF example
(#12084)
Signed-off-by: youkaichao
---
.buildkite/test-pipeline.yaml | 4 +
examples/offline_inference/rlhf.py | 191 +++++++++++++++++++++++++++++
vllm/__init__.py | 39 ++++++
vllm/entrypoints/llm.py | 25 ++++
vllm/plugins/__init__.py | 31 -----
vllm/worker/worker_base.py | 15 ++-
6 files changed, 270 insertions(+), 35 deletions(-)
create mode 100644 examples/offline_inference/rlhf.py
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 00fed96c1ac8c..7442de245bd80 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -126,11 +126,15 @@ steps:
- tests/distributed
- tests/spec_decode/e2e/test_integration_dist_tp4
- tests/compile
+ - examples/offline_inference/rlhf.py
commands:
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
+ # TODO: create a dedicated test section for multi-GPU example tests
+ # when we have multiple distributed example tests
+ - python3 ../examples/offline_inference/rlhf.py
- label: Metrics, Tracing Test # 10min
num_gpus: 2
diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py
new file mode 100644
index 0000000000000..3bc303dad277f
--- /dev/null
+++ b/examples/offline_inference/rlhf.py
@@ -0,0 +1,191 @@
+"""
+A simple demonstration of RLHF with vLLM, inspired by
+the OpenRLHF framework (https://github.com/OpenRLHF/OpenRLHF).
+It follows a design in which training processes and inference processes
+are separate and live on different GPUs.
+Training processes send prompts to inference processes to generate data,
+and synchronize the model weights by broadcasting them
+from the training process to the inference process.
+Note that this is a simple demonstration with one training instance and one
+inference instance. In practice, there could be multiple training instances
+and multiple inference instances. For the full implementation, please refer
+to the OpenRLHF framework.
+"""
+import os
+
+import ray
+import torch
+from ray.util.placement_group import placement_group
+from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+from transformers import AutoModelForCausalLM
+
+from vllm import LLM, SamplingParams, configure_as_vllm_process
+from vllm.utils import get_ip, get_open_port
+from vllm.worker.worker import Worker
+
+
+def stateless_init_process_group(master_address, master_port, rank, world_size,
+ device):
+ """
+ vLLM provides `StatelessProcessGroup` to create a process group
+ without touching the global process group in torch.distributed.
+ It is recommended to create a `StatelessProcessGroup` and then use it to
+ initialize the data-plane communication (NCCL) between the external
+ processes (training processes) and the vLLM workers.
+ """
+ from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
+ from vllm.distributed.utils import StatelessProcessGroup
+ pg = StatelessProcessGroup.create(host=master_address,
+ port=master_port,
+ rank=rank,
+ world_size=world_size)
+ pynccl = PyNcclCommunicator(pg, device=device)
+ return pynccl
+
+
+class MyWorker(Worker):
+ """
+ The `MyWorker` class inherits from `Worker` to provide custom functions.
+ For simplicity, we define the `MyWorker` class in this self-contained
+ script. Normally, we should define the `MyWorker` class in a separate
+ file and pass the qualified name of the class to the `worker_cls`
+ parameter.
+ """
+
+ def init_weight_update_group(self, master_address, master_port,
+ rank_offset, world_size):
+ from vllm.distributed.parallel_state import get_world_group
+ rank = get_world_group().rank + rank_offset
+ self.model_update_group = stateless_init_process_group(
+ master_address,
+ master_port,
+ rank,
+ world_size,
+ self.device,
+ )
+
+ def update_weight(self, name, dtype, shape):
+ weight = torch.empty(shape, dtype=dtype, device="cuda")
+ self.model_update_group.broadcast(weight,
+ src=0,
+ stream=torch.cuda.current_stream())
+
+ self.model_runner.model.load_weights(weights=[(name, weight)])
+
+ del weight
+
+ def check_weights_changed(self):
+ """
+ Check whether the weights have been updated to all zeros.
+ """
+ weights_updated = True
+ for name, p in self.model_runner.model.named_parameters():
+ weights_updated = weights_updated and torch.allclose(
+ p, torch.zeros_like(p))
+ return weights_updated
+
+
+class MyLLM(LLM):
+
+ def __init__(self, *args, **kwargs):
+ # A hack to make the script work:
+ # stop Ray from manipulating CUDA_VISIBLE_DEVICES
+ # at the top level.
+ del os.environ["CUDA_VISIBLE_DEVICES"]
+ super().__init__(*args, **kwargs)
+
+
+"""
+Start the training process; here we use Hugging Face Transformers
+as an example to hold a model on GPU 0.
+
+It is important for all processes outside of vLLM to call
+`configure_as_vllm_process` so that they set the same common environment
+variables as the vLLM workers.
+"""
+configure_as_vllm_process()
+
+train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
+train_model.to("cuda:0")
+"""
+Start the inference process; here we use vLLM to hold a model on GPU 1 and
+GPU 2. For details on how to use Ray, please refer to the Ray
+documentation at https://docs.ray.io/en/latest/ .
+"""
+os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
+ray.init()
+
+pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2)
+ray.get(pg_inference.ready())
+scheduling_inference = PlacementGroupSchedulingStrategy(
+ placement_group=pg_inference,
+ placement_group_capture_child_tasks=True,
+ placement_group_bundle_index=0,
+)
+"""
+Launch the vLLM inference engine.
+Here we use `enforce_eager` to reduce the start-up time.
+"""
+llm = ray.remote(
+ num_cpus=0,
+ num_gpus=0,
+ scheduling_strategy=scheduling_inference,
+)(MyLLM).remote(
+ model="facebook/opt-125m",
+ enforce_eager=True,
+ worker_cls=MyWorker,
+ tensor_parallel_size=2,
+ distributed_executor_backend="ray",
+)
+
+# Generate texts from the prompts.
+prompts = [
+ "Hello, my name is",
+ "The president of the United States is",
+ "The capital of France is",
+ "The future of AI is",
+]
+
+sampling_params = SamplingParams(temperature=0)
+
+outputs = ray.get(llm.generate.remote(prompts, sampling_params))
+
+for output in outputs:
+ prompt = output.prompt
+ generated_text = output.outputs[0].text
+ print(f"Prompt: {prompt!r}, "
+ f"Generated text: {generated_text!r}")
+
+# set up the communication between the training process
+# and the inference engine.
+master_address = get_ip()
+master_port = get_open_port()
+
+handle = llm.collective_rpc.remote("init_weight_update_group",
+ args=(master_address, master_port, 1, 3))
+model_update_group = stateless_init_process_group(master_address, master_port,
+ 0, 3, torch.device("cuda:0"))
+ray.get(handle)
+
+# simulate training, modify the weights of the model.
+for name, p in train_model.named_parameters():
+ p.data.zero_()
+
+# sync weight from the training process to the inference engine.
+for name, p in train_model.named_parameters():
+ handle = llm.collective_rpc.remote("update_weight",
+ args=(name, p.dtype, p.shape))
+ model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream())
+ ray.get(handle)
+
+# check if the weights are updated.
+assert all(ray.get(llm.collective_rpc.remote("check_weights_changed")))
+
+# Use the updated model to generate texts; they will be nonsense
+# because the weights are all zeros.
+outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
+for output in outputs_updated:
+ prompt = output.prompt
+ generated_text = output.outputs[0].text
+ print(f"Prompt: {prompt!r}, "
+ f"Generated text: {generated_text!r}")
diff --git a/vllm/__init__.py b/vllm/__init__.py
index 45252b93e3d54..a533dba561c00 100644
--- a/vllm/__init__.py
+++ b/vllm/__init__.py
@@ -17,6 +17,44 @@
from .version import __version__, __version_tuple__
+
+def configure_as_vllm_process():
+ """
+ Set some common configuration and environment variables that should be
+ set for all processes created by vLLM and for all processes
+ that interact with vLLM workers.
+ """
+ import os
+
+ import torch
+
+ # see https://github.com/NVIDIA/nccl/issues/1234
+ os.environ['NCCL_CUMEM_ENABLE'] = '0'
+
+ # see https://github.com/vllm-project/vllm/issues/10480
+ os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
+ # see https://github.com/vllm-project/vllm/issues/10619
+ torch._inductor.config.compile_threads = 1
+
+ from vllm.platforms import current_platform
+
+ if current_platform.is_xpu():
+ # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158
+ torch._dynamo.config.disable = True
+ elif current_platform.is_hpu():
+ # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1)
+ # does not support torch.compile
+ # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for
+ # torch.compile support
+ is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1'
+ if is_lazy:
+ torch._dynamo.config.disable = True
+ # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only)
+ # requires enabling lazy collectives
+ # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501
+ os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true'
+
+
__all__ = [
"__version__",
"__version_tuple__",
@@ -42,4 +80,5 @@
"AsyncEngineArgs",
"initialize_ray_cluster",
"PoolingParams",
+ "configure_as_vllm_process",
]
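Editor's sketch (assumption-labelled, not part of the patch): any external process that will talk to vLLM workers, such as the training process in the RLHF example above, should call this helper before creating communicators, so that it picks up the same NCCL/Inductor settings as the workers.

    from vllm import configure_as_vllm_process

    # Call this first in an external (e.g. training) process ...
    configure_as_vllm_process()
    # ... and only then build the weight-update process group with the vLLM workers.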
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index acb4db85632a8..b78d5c65a40f8 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -4,6 +4,7 @@
from typing import (Any, ClassVar, Dict, List, Optional, Sequence, Tuple, Type,
Union, cast, overload)
+import cloudpickle
from tqdm import tqdm
from typing_extensions import deprecated
@@ -186,6 +187,13 @@ def __init__(
if "disable_log_stats" not in kwargs:
kwargs["disable_log_stats"] = True
+ if "worker_cls" in kwargs:
+ worker_cls = kwargs["worker_cls"]
+ # if worker_cls is not a qualified string name,
+ # we serialize it using cloudpickle to avoid pickling issues
+ if isinstance(worker_cls, type):
+ kwargs["worker_cls"] = cloudpickle.dumps(worker_cls)
+
if compilation_config is not None:
if isinstance(compilation_config, (int, dict)):
compilation_config_instance = CompilationConfig.from_cli(
@@ -455,6 +463,23 @@ def generate(
outputs = self._run_engine(use_tqdm=use_tqdm)
return self.engine_class.validate_outputs(outputs, RequestOutput)
+ def collective_rpc(self,
+ method: str,
+ timeout: Optional[float] = None,
+ args: Tuple = (),
+ kwargs: Optional[Dict] = None) -> List[Any]:
+ """
+ Run a method on all workers, with homogeneous arguments.
+ This is the main extension point for the LLM entrypoint.
+ Users can provide a custom worker class through the `worker_cls`
+ argument and implement new methods in that class; the new methods
+ can then be called through this API.
+ It is recommended to use this API only to pass control messages,
+ and to set up separate data-plane communication to pass data.
+ """
+ return self.llm_engine.model_executor.collective_rpc(
+ method, timeout, args, kwargs)
+
def beam_search(
self,
prompts: List[Union[TokensPrompt, TextPrompt]],
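Editor's sketch (not part of the patch) combining the two additions above: passing a custom worker class directly via `worker_cls` (serialized with cloudpickle) and invoking one of its methods on every worker through `collective_rpc`. `EchoWorker` and `echo_rank` are illustrative names only.

    from vllm import LLM
    from vllm.worker.worker import Worker

    class EchoWorker(Worker):
        def echo_rank(self, tag: str):
            # Runs on every worker; collective_rpc gathers the return values.
            return f"{tag}: rank {self.rank}"

    llm = LLM(model="facebook/opt-125m",
              worker_cls=EchoWorker,
              enforce_eager=True)
    print(llm.collective_rpc("echo_rank", args=("hello",)))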
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py
index e5fa4f0e4a2f6..ff54174f634af 100644
--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
@@ -1,9 +1,6 @@
import logging
-import os
from typing import Callable, Dict
-import torch
-
import vllm.envs as envs
logger = logging.getLogger(__name__)
@@ -50,34 +47,6 @@ def load_general_plugins():
processes. They should be designed in a way that they can be loaded
multiple times without causing issues.
"""
-
- # all processes created by vllm will load plugins,
- # and here we can inject some common environment variables
- # for all processes.
-
- # see https://github.com/vllm-project/vllm/issues/10480
- os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
- # see https://github.com/vllm-project/vllm/issues/10619
- torch._inductor.config.compile_threads = 1
-
- from vllm.platforms import current_platform
-
- if current_platform.is_xpu():
- # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158
- torch._dynamo.config.disable = True
- if current_platform.is_hpu():
- # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1)
- # does not support torch.compile
- # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for
- # torch.compile support
- is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1'
- if is_lazy:
- torch._dynamo.config.disable = True
- # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only)
- # requires enabling lazy collectives
- # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501
- os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true'
-
global plugins_loaded
if plugins_loaded:
return
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index d464b614b12f1..bced5b9f44228 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -4,6 +4,7 @@
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
+import cloudpickle
import torch
from vllm.config import ObservabilityConfig, VllmConfig
@@ -521,14 +522,20 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None:
kwargs = all_kwargs[self.rpc_rank]
enable_trace_function_call_for_thread(self.vllm_config)
- # see https://github.com/NVIDIA/nccl/issues/1234
- os.environ['NCCL_CUMEM_ENABLE'] = '0'
+ from vllm import configure_as_vllm_process
+ configure_as_vllm_process()
from vllm.plugins import load_general_plugins
load_general_plugins()
- worker_class = resolve_obj_by_qualname(
- self.vllm_config.parallel_config.worker_cls)
+ if isinstance(self.vllm_config.parallel_config.worker_cls, str):
+ worker_class = resolve_obj_by_qualname(
+ self.vllm_config.parallel_config.worker_cls)
+ else:
+ assert isinstance(self.vllm_config.parallel_config.worker_cls,
+ bytes)
+ worker_class = cloudpickle.loads(
+ self.vllm_config.parallel_config.worker_cls)
self.worker = worker_class(**kwargs)
assert self.worker is not None
From b3a0db23ba43beaf42af919d6e58262d09383457 Mon Sep 17 00:00:00 2001
From: Michal Adamczyk
Date: Thu, 16 Jan 2025 15:06:08 +0100
Subject: [PATCH 41/67] Move scores to float32 in case of running xgrammar on
cpu (#695)
According to:
https://github.com/mlc-ai/xgrammar/blob/c1b64920cad24f44f235778c1c00bb52d57da01a/python/xgrammar/kernels/apply_token_bitmask_inplace_cpu.py#L22
xgrammar only supports float32 logits on CPU.
---
vllm/model_executor/guided_decoding/xgrammar_decoding.py | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
index f10a8fb8e03cf..2d8594cb8aafa 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -298,8 +298,11 @@ def __call__(self, input_ids: list[int],
# token_bitmask is a CPU tensor for use with accept_token and
# fill_next_token_bitmask so we move it to the device of scores
device_type = scores.device.type
+ dtype = scores.dtype
if device_type != "cuda":
- scores = scores.to("cpu").unsqueeze(0)
+ # xgrammar on cpu only supports float32 scores
+ # see: https://github.com/mlc-ai/xgrammar/blob/c1b64920cad24f44f235778c1c00bb52d57da01a/python/xgrammar/kernels/apply_token_bitmask_inplace_cpu.py#L22
+ scores = scores.to("cpu").float().unsqueeze(0)
# Note: In this method, if the tensors have different dimensions
# on CPU device fails, but on GPU it runs without error. Hence the
@@ -307,7 +310,7 @@ def __call__(self, input_ids: list[int],
xgr.apply_token_bitmask_inplace(scores,
self.token_bitmask.to(scores.device))
if device_type != "cuda":
- scores = scores.to(device_type).squeeze()
+ scores = scores.to(dtype).to(device_type).squeeze()
return scores
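Editor's sketch of the dtype round-trip performed above, under the stated assumption that xgrammar's CPU kernel only accepts float32 scores; the bitmask application itself is elided since this is only about the casts:

    import torch

    scores = torch.randn(8, dtype=torch.bfloat16)        # e.g. non-CUDA logits in bf16
    orig_dtype = scores.dtype
    scores32 = scores.to("cpu").float().unsqueeze(0)     # float32 copy for the CPU kernel
    # ... xgr.apply_token_bitmask_inplace(scores32, token_bitmask) would run here ...
    scores = scores32.to(orig_dtype).squeeze()           # restore original dtype and shape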
From 874f7c292a4f4f5dbb89b12426187e5a70f006d6 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Thu, 16 Jan 2025 06:54:06 -0800
Subject: [PATCH 42/67] [Bugfix] Fix max image feature size for
Llava-one-vision (#12104)
Signed-off-by: Roger Wang
---
.../multimodal/processing/test_llava_next.py | 61 ++++++++++++++++++
.../processing/test_llava_onevision.py | 62 +++++++++++++++++++
vllm/model_executor/models/llava_onevision.py | 8 ++-
3 files changed, 129 insertions(+), 2 deletions(-)
diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py
index 1eec35d9c3c72..6de649f87204d 100644
--- a/tests/models/multimodal/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
@@ -13,6 +13,67 @@
from ...utils import build_model_context
+def _validate_image_max_tokens_one(
+ processor: BaseMultiModalProcessor,
+ max_tokens: int,
+ failed_size_excs: list[tuple[ImageSize, Exception]],
+ image_size: ImageSize,
+) -> None:
+ info = processor.info
+ feature_size = info.get_num_image_tokens(image_width=image_size.width,
+ image_height=image_size.height)
+
+ try:
+ assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
+ except Exception as exc:
+ failed_size_excs.append((image_size, exc))
+
+
+@pytest.mark.skip("This test takes around 5 minutes to run. "
+ "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
+def test_processor_max_tokens(model_id):
+ ctx = build_model_context(
+ model_name=model_id,
+ tokenizer_name=model_id,
+ mm_processor_kwargs=None,
+ limit_mm_per_prompt={"image": 1},
+ )
+ processor = MULTIMODAL_REGISTRY.create_processor(
+ ctx.model_config,
+ tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+ )
+ info = processor.info
+
+ seen_aspect_ratios = set[float]()
+ image_sizes = list[ImageSize]()
+
+ # The aspect ratio of the grid layout is between 1 and 2
+ # NOTE: Assumes that feature size calculation is the same if we
+ # swap the width and height of the image
+ for w, h in itertools.product(range(32, 4096), repeat=2):
+ aspect_ratio = w / h
+ if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
+ image_sizes.append(ImageSize(w, h))
+ seen_aspect_ratios.add(aspect_ratio)
+
+ failed_size_excs = list[tuple[ImageSize, Exception]]()
+
+ validate_one = partial(
+ _validate_image_max_tokens_one,
+ processor,
+ info.get_max_image_tokens(), # type: ignore
+ failed_size_excs,
+ )
+ pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+
+ if failed_size_excs:
+ msg = "Found failing image sizes:" \
+ + "\n========\n".join(f"[{size}]\n{exc}"
+ for size, exc in failed_size_excs)
+ raise AssertionError(msg)
+
+
def _validate_image_prompt_replacements_one(
processor: BaseMultiModalProcessor,
num_imgs: int,
diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py
index 94ea604c58b43..806437d35ec87 100644
--- a/tests/models/multimodal/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
@@ -13,6 +13,68 @@
from ...utils import build_model_context
+def _validate_image_max_tokens_one(
+ processor: BaseMultiModalProcessor,
+ max_tokens: int,
+ failed_size_excs: list[tuple[ImageSize, Exception]],
+ image_size: ImageSize,
+) -> None:
+ info = processor.info
+ feature_size = info.get_num_image_tokens(image_width=image_size.width,
+ image_height=image_size.height)
+
+ try:
+ assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
+ except Exception as exc:
+ failed_size_excs.append((image_size, exc))
+
+
+@pytest.mark.skip("This test takes around 5 minutes to run. "
+ "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id",
+ ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+def test_processor_max_tokens(model_id):
+ ctx = build_model_context(
+ model_name=model_id,
+ tokenizer_name=model_id,
+ mm_processor_kwargs=None,
+ limit_mm_per_prompt={"image": 1},
+ )
+ processor = MULTIMODAL_REGISTRY.create_processor(
+ ctx.model_config,
+ tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+ )
+ info = processor.info
+
+ seen_aspect_ratios = set[float]()
+ image_sizes = list[ImageSize]()
+
+ # The aspect ratio of the grid layout is between 1 and 6
+ # NOTE: Assumes that feature size calculation is the same if we
+ # swap the width and height of the image
+ for w, h in itertools.product(range(32, 4096), repeat=2):
+ aspect_ratio = w / h
+ if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
+ image_sizes.append(ImageSize(w, h))
+ seen_aspect_ratios.add(aspect_ratio)
+
+ failed_size_excs = list[tuple[ImageSize, Exception]]()
+
+ validate_one = partial(
+ _validate_image_max_tokens_one,
+ processor,
+ info.get_max_image_tokens(), # type: ignore
+ failed_size_excs,
+ )
+ pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+
+ if failed_size_excs:
+ msg = "Found failing image sizes:" \
+ + "\n========\n".join(f"[{size}]\n{exc}"
+ for size, exc in failed_size_excs)
+ raise AssertionError(msg)
+
+
def _validate_image_prompt_replacements_one(
processor: BaseMultiModalProcessor,
num_imgs: int,
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 78a47e64d9afc..c9283e0c5ba20 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -19,8 +19,8 @@
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
NestedTensors)
-from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems,
- VideoProcessorItems)
+from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
+ VideoEmbeddingItems, VideoProcessorItems)
from vllm.multimodal.processing import PromptReplacement
from vllm.multimodal.profiling import ProcessorInputs
from vllm.sequence import IntermediateTensors
@@ -145,6 +145,10 @@ def _get_num_unpadded_features(
return (unpadded_features, newline_features)
+ def get_image_size_with_most_features(self) -> ImageSize:
+ # NOTE: This hardcoded value is found via processor tests
+ return ImageSize(width=1153, height=944)
+
def _get_num_frame_tokens(
self,
*,
From 5fd24ec02e0365f96301ac73a31ef06976c256e8 Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath
Date: Thu, 16 Jan 2025 21:21:40 +0530
Subject: [PATCH 43/67] [misc] Add LoRA kernel micro benchmarks (#11579)
---
benchmarks/kernels/benchmark_lora.py | 1147 ++++++++++++++++++++++++++
benchmarks/kernels/utils.py | 210 +++++
2 files changed, 1357 insertions(+)
create mode 100644 benchmarks/kernels/benchmark_lora.py
create mode 100644 benchmarks/kernels/utils.py
diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py
new file mode 100644
index 0000000000000..e1f613e1da509
--- /dev/null
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -0,0 +1,1147 @@
+import argparse
+import copy
+import json
+import pickle
+import time
+from dataclasses import dataclass
+from enum import Enum, auto
+from itertools import product
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+from utils import ArgPool, Bench, CudaGraphBenchParams
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand
+from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice
+from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink
+from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand
+from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink
+from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
+from vllm.utils import FlexibleArgumentParser
+
+DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
+DEFAULT_TP_SIZES = [1]
+DEFAULT_BATCH_SIZES = [
+ 1, 16, 32, 64, 128, 192, 256, 320, 384, 448, 512, 640, 768, 896, 1024,
+ 2048, 3072, 4096, 5120, 6144, 7168, 8192
+]
+DEFAULT_HIDDEN_SIZES = [1024, 2048, 4096, 8192, 16384]
+DEFAULT_LORA_RANKS = [16]
+DEFAULT_NUM_LORAS = [1, 2, 3, 4]
+DEFAULT_SORT_BY_LORA_IDS = [False, True]
+DEFAULT_SEQ_LENGTHS = [1]
+DEFAULT_EXPAND_FN_ADD_INPUTS = [True, False]
+
+
+# Utilities
+def dtype_to_str(dtype: torch.dtype):
+ if dtype == torch.float16:
+ return "f16"
+ if dtype == torch.bfloat16:
+ return "bf16"
+ if dtype == torch.float32:
+ return "f32"
+ raise ValueError(f"Unsupported dtype {dtype}")
+
+
+def make_rand_lora_weight_tensor(k: int,
+ n: int,
+ num_loras: int,
+ dtype: torch.dtype,
+ device: str = "cuda") -> torch.Tensor:
+
+ # LoRA weights column major
+ return torch.rand((num_loras, n, k), dtype=dtype).to(device)
+
+
+def make_rand_tensors(
+ a_shape: Tuple[int],
+ b_shape: Tuple[int],
+ c_shape: Tuple[int],
+ a_dtype: torch.dtype,
+ b_dtype: torch.dtype,
+ c_dtype: torch.dtype,
+ num_slices: int,
+ device: str = "cuda",
+) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]:
+ """
+ Make LoRA input/output matrices.
+ """
+ A = torch.rand(a_shape, dtype=a_dtype).to(device)
+
+ # LoRA weights column major
+ Bs = [
+ torch.rand(b_shape, dtype=b_dtype).to(device)
+ for _ in range(num_slices)
+ ]
+
+ C = torch.zeros(c_shape, dtype=c_dtype).to(device)
+ return A, Bs, C
+
+
+def make_prompt_lora_mapping(num_prompts: int, num_active_loras: int,
+ sort_by_lora_id: bool,
+ device: str) -> torch.Tensor:
+ """
+ All prompts are mapped to a LoRA ID in the range [0, num_active_loras),
+ where 0 refers to the first LoRA, 1 refers to the second LoRA, and so on.
+ """
+ assert num_active_loras > 0
+
+ if not sort_by_lora_id:
+ return torch.randint(0,
+ num_active_loras, (num_prompts, ),
+ dtype=torch.long)
+
+ # Divide LoRAs equally and in order.
+ part_size = num_prompts // num_active_loras
+ part_size = max(part_size, 1)
+
+ lora_id = 0
+ prompt_lora_mapping = []
+ while len(prompt_lora_mapping) < num_prompts:
+ prompt_lora_mapping.extend([lora_id] * part_size)
+ lora_id = lora_id + 1 if lora_id + 1 < num_active_loras else lora_id
+ return torch.tensor(prompt_lora_mapping[:num_prompts],
+ dtype=torch.long,
+ device=device)
+
+
+def make_token_lora_mapping(num_tokens: int, num_prompts: int,
+ prompt_lora_mapping: torch.Tensor,
+ seq_len_tensor: torch.Tensor, device: str):
+ """
+ Make token_lora_mapping from prompt_lora_mapping and seq_lens_tensor
+ """
+ assert prompt_lora_mapping.shape[0] == num_prompts
+
+ # token to lora index mapping
+ token_lora_mapping = [0] * num_tokens
+ current_offset = 0
+ for b_id in range(num_prompts):
+ lora_index = prompt_lora_mapping[b_id].item()
+ s = current_offset
+ e = s + seq_len_tensor[b_id].item()
+ token_lora_mapping[s:e] = [lora_index] * (e - s)
+ current_offset += seq_len_tensor[b_id].item()
+
+ return torch.tensor(token_lora_mapping, dtype=torch.long, device=device)
+
+
+def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor,
+ lora_weights: List[torch.Tensor],
+ seq_lens_cpu: torch.Tensor,
+ prompt_lora_mapping_cpu: torch.Tensor, scaling: float,
+ add_inputs: Optional[bool]):
+ """
+ Torch group gemm reference implementation to test correctness of
+ benchmarking operations.
+ """
+ batches = seq_lens_cpu.size(0)
+ out_list = []
+ current_offset = 0
+ for lora_index, b_length in zip(range(batches), seq_lens_cpu):
+ x = input[current_offset:b_length + current_offset, :]
+ current_offset += b_length
+ w = lora_weights[prompt_lora_mapping_cpu[lora_index]]
+ result = torch.nn.functional.linear(x, w)
+ result *= scaling
+ out_list.append(result)
+
+ cat_result = torch.cat(out_list, dim=0)
+
+ if add_inputs:
+ ref_out += cat_result
+ else:
+ ref_out.copy_(cat_result)
+
+
+class OpType(Enum):
+ """
+ LoRA ops to benchmark and their properties.
+ """
+ SGMV_SHRINK = auto()
+ BGMV_SHRINK = auto()
+ SGMV_EXPAND = auto()
+ BGMV_EXPAND = auto()
+ BGMV_EXPAND_SLICE = auto()
+
+ @staticmethod
+ def from_str(s: str) -> "OpType":
+ if s.lower() == 'sgmv_shrink':
+ return OpType.SGMV_SHRINK
+ if s.lower() == 'sgmv_expand':
+ return OpType.SGMV_EXPAND
+ if s.lower() == 'bgmv_shrink':
+ return OpType.BGMV_SHRINK
+ if s.lower() == 'bgmv_expand':
+ return OpType.BGMV_EXPAND
+ if s.lower() == "bgmv_expand_slice":
+ return OpType.BGMV_EXPAND_SLICE
+ raise ValueError(f"Unrecognized str {s} to convert to OpType")
+
+ def is_shrink_fn(self) -> bool:
+ return self in [OpType.SGMV_SHRINK, OpType.BGMV_SHRINK]
+
+ def is_expand_fn(self) -> bool:
+ return self in [OpType.SGMV_EXPAND, OpType.BGMV_EXPAND]
+
+ def is_prefill_op(self) -> bool:
+ return self in [OpType.SGMV_SHRINK, OpType.SGMV_EXPAND]
+
+ def is_decode_op(self) -> bool:
+ return self in [
+ OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE
+ ]
+
+ def is_expand_slice_fn(self) -> bool:
+ return self in [OpType.BGMV_EXPAND_SLICE]
+
+ def num_slices(self) -> List[int]:
+ if self in [OpType.SGMV_EXPAND, OpType.SGMV_SHRINK]:
+ # SGMV kernels support slices
+ return [1, 2, 3]
+ if self in [OpType.BGMV_SHRINK, OpType.BGMV_EXPAND]:
+ return [1]
+ if self in [OpType.BGMV_EXPAND_SLICE]:
+ return [2, 3]
+ raise ValueError(f"Unrecognized OpType {self}")
+
+ def mkn(self, batch_size: int, seq_length: int, hidden_size: int,
+ lora_rank: int) -> Tuple[int, int, int]:
+ num_tokens = batch_size * seq_length
+ if self.is_shrink_fn():
+ m = num_tokens
+ k = hidden_size
+ n = lora_rank
+ else:
+ assert self.is_expand_fn() or self.is_expand_slice_fn()
+ m = num_tokens
+ k = lora_rank
+ n = hidden_size
+ return m, k, n
+
+ def matmul_dtypes(
+ self, op_dtype: torch.dtype
+ ) -> Tuple[torch.dtype, torch.dtype, torch.dtype]:
+ """
+ return a type, b type and c type for A x B = C
+ """
+ if self.is_shrink_fn():
+ return op_dtype, op_dtype, torch.float32
+ else:
+ assert self.is_expand_fn() or self.is_expand_slice_fn()
+ return torch.float32, op_dtype, op_dtype
+
+ def matmul_shapes(
+ self, batch_size: int, seq_length: int, hidden_size: int,
+ lora_rank: int, num_loras: int,
+ num_slices: int) -> Tuple[Tuple[int], Tuple[int], Tuple[int]]:
+ """
+ Given num_slices, return the shapes of the A, B, and C matrices
+ in A x B = C, for the op_type
+ """
+ m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank)
+
+ b_shape = (num_loras, n, k) # col-major
+ if self == OpType.SGMV_SHRINK:
+ # SGMV shrink supports num_slices inherently in the kernel
+ return ((m, k), b_shape, (num_slices, m, n))
+ if self == OpType.SGMV_EXPAND:
+ # SGMV expand supports num_slices inherently in the kernel
+ return ((num_slices, m, k), b_shape, (m, n * num_slices))
+ if self == OpType.BGMV_SHRINK:
+ return ((m, k), b_shape, (m, n))
+ if self == OpType.BGMV_EXPAND:
+ return ((m, k), b_shape, (m, n))
+ if self == OpType.BGMV_EXPAND_SLICE:
+ return ((num_slices, m, k), b_shape, (m, n * num_slices))
+
+ raise ValueError(f"Unrecognized op_type {self}")
+
+ def bench_fn(self) -> Callable:
+
+ def emulate_bgmv_expand_slice(kwargs_list: List[Dict[str, Any]]):
+ for x in kwargs_list:
+ bgmv_expand_slice(**x)
+
+ if self == OpType.SGMV_SHRINK:
+ return sgmv_shrink
+ if self == OpType.SGMV_EXPAND:
+ return sgmv_expand
+ if self == OpType.BGMV_SHRINK:
+ return bgmv_shrink
+ if self == OpType.BGMV_EXPAND:
+ return bgmv_expand
+ if self == OpType.BGMV_EXPAND_SLICE:
+ return emulate_bgmv_expand_slice
+ raise ValueError(f"Unrecognized optype {self}")
+
+ def run_ref_group_gemm(self, output: torch.Tensor, input: torch.Tensor,
+ lora_weights: List[torch.Tensor],
+ **kwargs) -> Callable:
+ """Each benchmark operation expected the input, lora_weights and outputs
+ in a slightly different format. Refer to self.matmul_shapes().
+ run_ref_group_gemm accounts for those differences in executing a
+ reference group gemm for correctness testing.
+ """
+ w_dtype = lora_weights[0].dtype
+ num_slices = len(lora_weights)
+ if self == OpType.SGMV_SHRINK:
+ for slice_idx in range(num_slices):
+ ref_group_gemm(ref_out=output[slice_idx, :],
+ input=input,
+ lora_weights=lora_weights[slice_idx],
+ **kwargs)
+ if self == OpType.SGMV_EXPAND:
+ hidden_size = lora_weights[0].shape[1]
+ for slice_idx in range(num_slices):
+ slice_offset = slice_idx * hidden_size
+ ref_group_gemm(
+ ref_out=output[:, slice_offset:slice_offset + hidden_size],
+ input=input[slice_idx].clone().to(dtype=w_dtype),
+ lora_weights=lora_weights[slice_idx],
+ **kwargs)
+ if self == OpType.BGMV_SHRINK:
+ assert num_slices == 1
+ ref_group_gemm(ref_out=output,
+ input=input,
+ lora_weights=lora_weights[0],
+ **kwargs)
+ if self == OpType.BGMV_EXPAND:
+ assert num_slices == 1
+ ref_group_gemm(ref_out=output,
+ input=input.clone().to(dtype=w_dtype),
+ lora_weights=lora_weights[0],
+ **kwargs)
+ if self == OpType.BGMV_EXPAND_SLICE:
+ hidden_size = lora_weights[0].shape[1]
+ for slice_idx in range(num_slices):
+ slice_offset = slice_idx * hidden_size
+ ref_group_gemm(
+ ref_out=output[:, slice_offset:slice_offset + hidden_size],
+ input=input[slice_idx].clone().to(dtype=w_dtype),
+ lora_weights=lora_weights[slice_idx],
+ **kwargs)
+ raise ValueError(f"Unrecognized optype {self}")
+
+
+@dataclass
+class BenchmarkContext:
+ """
+ LoRA benchmark context
+ """
+ batch_size: int
+ hidden_size: int
+ num_loras: int
+ num_active_loras: int
+ lora_rank: int
+ sort_by_lora_id: bool
+ dtype: torch.dtype
+ seq_length: Optional[int] = None
+ num_slices: Optional[int] = None # num_slices for slice based ops
+
+ def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
+ ctx = copy.copy(self)
+ ctx.seq_length = seq_length
+ return ctx
+
+ def with_num_slices(self, num_slices: int) -> "BenchmarkContext":
+ ctx = copy.copy(self)
+ ctx.num_slices = num_slices
+ return ctx
+
+ def bench_label(self) -> str:
+ return f"lora-{self.dtype}"
+
+ def bench_sublabel(self, op_type: OpType) -> str:
+ m, k, n = op_type.mkn(self.batch_size, self.seq_length,
+ self.hidden_size, self.lora_rank)
+ desc = {
+ 'bs': self.batch_size,
+ 'sl': self.seq_length,
+ 'm': m,
+ 'k': k,
+ 'n': n,
+ 'num_loras': self.num_loras,
+ 'sort_by_lora': self.sort_by_lora_id,
+ 'num_slices': self.num_slices,
+ }
+ return json.dumps(desc)
+
+
+@dataclass
+class BenchmarkTensors:
+ """
+ Input/Output tensors used for benchmarks
+ """
+ # matmul tensors
+ input: torch.Tensor
+ lora_weights_lst: List[torch.Tensor]
+ output: torch.Tensor
+ # metadata tensors
+ seq_lens: torch.Tensor
+ seq_start_loc: torch.Tensor
+ prompt_lora_mapping: torch.Tensor
+ token_lora_mapping: torch.Tensor
+
+ def io_types(self) -> str:
+ return (f"{dtype_to_str(self.input.dtype)}x"
+ f"{dtype_to_str(self.lora_weights_lst[0].dtype)}=>"
+ f"{dtype_to_str(self.output.dtype)}")
+
+ @staticmethod
+ def make(ctx: BenchmarkContext,
+ op_type: OpType,
+ device: str = "cuda") -> "BenchmarkTensors":
+
+ # Make input / output matmul tensors.
+ a_shape, b_shape, c_shape = op_type.matmul_shapes(
+ ctx.batch_size, ctx.seq_length, ctx.hidden_size, ctx.lora_rank,
+ ctx.num_loras, ctx.num_slices)
+ a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype)
+ input_tensor, lora_weights, output_tensor = \
+ make_rand_tensors(a_shape, b_shape, c_shape, a_type, b_type, c_type,
+ num_slices = ctx.num_slices)
+
+ # Make metadata tensors.
+ # Keep the metadata tensors on the CPU for further processing if needed.
+ # The tensors get moved to the GPU before benchmarking.
+ assert ctx.num_active_loras <= ctx.num_loras
+ total_tokens = ctx.batch_size * ctx.seq_length
+
+ # Prepare seq lens tensor
+ seq_len_tensor = torch.randint(ctx.seq_length, ctx.seq_length + 1,
+ (ctx.batch_size, ))
+ # Prepare seq_start_loc tensor
+ seq_start_loc_tensor = torch.cumsum(torch.tensor(
+ [0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
+ dim=0)
+ assert total_tokens == seq_len_tensor.sum()
+ # Prepare prompt lora indices tensor
+ prompt_lora_indices_tensor = make_prompt_lora_mapping(
+ ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu")
+ # Prepare token lora indices tensor
+ token_lora_indices_tensor = make_token_lora_mapping(
+ total_tokens, ctx.batch_size, prompt_lora_indices_tensor,
+ seq_len_tensor, "cpu")
+
+ return BenchmarkTensors(input_tensor, lora_weights, output_tensor,
+ seq_len_tensor, seq_start_loc_tensor,
+ prompt_lora_indices_tensor,
+ token_lora_indices_tensor)
+
+ def sanity_check(self) -> None:
+ """
+ Fails assertions when non-conformity is detected.
+ """
+ num_tokens = self.input.shape[-2]
+ # check metadata tensors
+ assert torch.sum(self.seq_lens) == num_tokens
+ num_seqs = self.seq_lens.shape[0]
+ assert self.seq_start_loc.shape[0] == num_seqs
+ assert self.prompt_lora_mapping.shape[0] == num_seqs
+ assert self.token_lora_mapping.shape[0] == num_tokens
+
+ def to_device(self, device: str):
+ """
+ Transfer tensors to device if the tensors aren't already on the device
+ """
+
+ def to_device(tensor: torch.Tensor):
+ if tensor.device != device:
+ tensor = tensor.to(device=device)
+ return tensor
+
+ self.input = to_device(self.input)
+ self.output = to_device(self.output)
+ self.seq_lens = to_device(self.seq_lens)
+ self.seq_start_loc = to_device(self.seq_start_loc)
+ self.prompt_lora_mapping = to_device(self.prompt_lora_mapping)
+ self.token_lora_mapping = to_device(self.token_lora_mapping)
+ for i in range(len(self.lora_weights_lst)):
+ self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i])
+
+ def metadata(self) -> Tuple[int, int, int, int]:
+ """
+ Return num_seqs, num_tokens, max_seq_len and num_slices
+ """
+ num_seqs = self.seq_lens.shape[0]
+ num_tokens = self.token_lora_mapping.shape[0]
+ max_seq_len = torch.max(self.seq_lens).item()
+ num_slices = len(self.lora_weights_lst)
+ return num_seqs, num_tokens, max_seq_len, num_slices
+
+ def convert_to_sgmv_benchmark_tensors(self):
+ """
+ For sgmv punica kernels, when consecutive sequences have the
+ same LoRA ID, we just merge them together.
+ This happens in punica.py::compute_metadata
+ """
+
+ # Collapse seq_lens and seq_start_loc
+ _, seq_lens = torch.unique_consecutive(self.token_lora_mapping,
+ return_counts=True)
+ cum_result = torch.cumsum(seq_lens, dim=0)
+ seq_start_loc = torch.zeros_like(seq_lens)
+ seq_start_loc[1:].copy_(cum_result[:-1])
+
+ # Collapse prompt mapping
+ prompt_lora_mapping = torch.unique_consecutive(
+ self.prompt_lora_mapping)
+
+ assert torch.sum(seq_lens) == torch.sum(self.seq_lens), \
+ f"dont match - new {torch.sum(seq_lens)} vs {torch.sum(self.seq_lens)}"
+
+ self.prompt_lora_mapping = prompt_lora_mapping.to(
+ dtype=self.prompt_lora_mapping.dtype)
+ self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype)
+ self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype)
+
+ def as_sgmv_shrink_kwargs(self) -> Dict[str, Any]:
+ self.convert_to_sgmv_benchmark_tensors()
+ self.sanity_check()
+ self.to_device(self.input.device)
+
+ num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
+
+ # Sanity check matrix shapes.
+ i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
+ 0].shape, self.output.shape
+ # Expected input shape [num_tokens, hidden_size]
+ assert len(i_shape) == 2
+ assert i_shape[0] == num_tokens
+ hidden_size = i_shape[1]
+ # Expected lora weight shape [num_loras, lora_rank, hidden_size]
+ assert len(lw_shape) == 3
+ assert lw_shape[2] == hidden_size
+ lora_rank = lw_shape[1]
+ # Expected output shape [num_slices, num_tokens, lora_rank]
+ assert len(o_shape) == 3
+ assert o_shape == (num_slices, num_tokens, lora_rank)
+
+ return {
+ 'inputs': self.input,
+ 'lora_a_weights': self.lora_weights_lst,
+ 'output_tensor': self.output,
+ 'b_seq_start_loc': self.seq_start_loc,
+ 'seq_len_tensor': self.seq_lens,
+ 'lora_indices_tensor': self.prompt_lora_mapping,
+ 'batches': num_seqs,
+ 'max_seq_length': max_seq_len,
+ 'token_nums': num_tokens,
+ 'scaling': 1.0,
+ }
+
+ def as_sgmv_expand_kwargs(self, add_inputs: bool) -> Dict[str, Any]:
+
+ self.convert_to_sgmv_benchmark_tensors()
+ self.sanity_check()
+ self.to_device(self.input.device)
+
+ num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
+
+ # Sanity check matrix shapes.
+ i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
+ 0].shape, self.output.shape
+ # Expected input shape : [num_slices, num_tokens, lora_rank]
+ assert len(i_shape) == 3
+ assert i_shape[0] == num_slices
+ assert i_shape[1] == num_tokens
+ lora_rank = i_shape[2]
+ # Expected lora weight shape : [num_lora, hidden_size, lora_rank]
+ assert len(lw_shape) == 3
+ assert lw_shape[2] == lora_rank
+ hidden_size = lw_shape[1]
+ # Expected output shape : [num_tokens, hidden_size * num_slices]
+ assert len(o_shape) == 2
+ assert o_shape == (num_tokens, hidden_size * num_slices)
+
+ return {
+ 'inputs': self.input,
+ 'lora_b_weights': self.lora_weights_lst,
+ 'output_tensor': self.output,
+ 'b_seq_start_loc': self.seq_start_loc,
+ 'seq_len_tensor': self.seq_lens,
+ 'lora_indices_tensor': self.prompt_lora_mapping,
+ 'batches': num_seqs,
+ 'max_seq_length': max_seq_len,
+ 'token_nums': num_tokens,
+ 'offset_start': 0,
+ 'add_inputs': add_inputs,
+ }
+
+ def as_bgmv_shrink_kwargs(self) -> Dict[str, Any]:
+ assert len(self.lora_weights_lst) == 1
+ self.to_device(self.input.device)
+
+ _, num_tokens, _, _ = self.metadata()
+ # Sanity check shapes
+ i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
+ 0].shape, self.output.shape
+ # Expected input shape [num_tokens, hidden_size]
+ assert len(i_shape) == 2
+ assert i_shape[0] == num_tokens
+ hidden_size = i_shape[1]
+ # Expected lora weight shape [num_loras, lora_rank, hidden_size]
+ assert len(lw_shape) == 3
+ assert lw_shape[2] == hidden_size
+ lora_rank = lw_shape[1]
+ # Expected output shape [num_tokens, lora_rank]
+ assert len(o_shape) == 2
+ assert o_shape == (num_tokens, lora_rank)
+
+ return {
+ 'inputs': self.input,
+ 'lora_a_weights': self.lora_weights_lst[0],
+ 'output_tensor': self.output,
+ 'lora_indices_tensor': self.token_lora_mapping,
+ 'scaling': 1.0
+ }
+
+ def as_bgmv_expand_kwargs(self, add_inputs: bool):
+ assert len(self.lora_weights_lst) == 1
+ self.to_device(self.input.device)
+
+ _, num_tokens, _, _ = self.metadata()
+ # Sanity check shapes
+ i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
+ 0].shape, self.output.shape
+ # Expected input shape [num_tokens, lora_rank]
+ assert len(i_shape) == 2
+ assert i_shape[0] == num_tokens
+ lora_rank = i_shape[1]
+ # Expected lora weight shape [num_loras, hidden_size, lora_rank]
+ assert len(lw_shape) == 3
+ assert lw_shape[2] == lora_rank
+ hidden_size = lw_shape[1]
+ # Expected output shape [num_tokens, hidden_size]
+ assert len(o_shape) == 2
+ assert o_shape == (num_tokens, hidden_size)
+
+ return {
+ 'inputs': self.input,
+ 'lora_b_weights': self.lora_weights_lst[0],
+ 'output_tensor': self.output,
+ 'lora_indices_tensor': self.token_lora_mapping,
+ 'add_inputs': add_inputs
+ }
+
+ def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> Dict[str, Any]:
+
+ _, num_tokens, _, num_slices = self.metadata()
+ # Sanity check shapes
+ i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
+ 0].shape, self.output.shape
+ # Expected input shape [num_slices, num_tokens, lora_rank]
+ assert len(i_shape) == 3
+ assert i_shape[0] == num_slices
+ assert i_shape[1] == num_tokens
+ lora_rank = i_shape[2]
+ # Expected lora weight shape [num_loras, hidden_size, lora_rank]
+ assert len(lw_shape) == 3
+ assert lw_shape[2] == lora_rank
+ hidden_size = lw_shape[1]
+ # Expected output shape [num_tokens, hidden_size * num_slices]
+ assert len(o_shape) == 2
+ assert o_shape == (num_tokens, hidden_size * num_slices)
+
+ self.to_device(self.input.device)
+
+ kwargs_list = []
+ for i in range(num_slices):
+ kwargs_list.append({
+ 'inputs': self.input[i],
+ 'lora_b_weights': self.lora_weights_lst[i],
+ 'output_tensor': self.output,
+ 'lora_indices_tensor': self.token_lora_mapping,
+ 'slice_offset': i * hidden_size,
+ 'slice_size': hidden_size,
+ 'add_inputs': add_inputs,
+ })
+ return {'kwargs_list': kwargs_list}
+
+ def bench_fn_kwargs(self,
+ op_type: OpType,
+ add_inputs: Optional[bool] = None) -> Dict[str, Any]:
+ if op_type.is_shrink_fn():
+ assert add_inputs is None
+ else:
+ assert add_inputs is not None
+
+ if op_type == OpType.SGMV_SHRINK:
+ return self.as_sgmv_shrink_kwargs()
+ if op_type == OpType.SGMV_EXPAND:
+ return self.as_sgmv_expand_kwargs(add_inputs)
+ if op_type == OpType.BGMV_SHRINK:
+ return self.as_bgmv_shrink_kwargs()
+ if op_type == OpType.BGMV_EXPAND:
+ return self.as_bgmv_expand_kwargs(add_inputs)
+ if op_type == OpType.BGMV_EXPAND_SLICE:
+ return self.as_bgmv_expand_slice_kwargs(add_inputs)
+ raise ValueError(f"Unrecognized optype {self}")
+
+ def test_correctness(self, op_type: OpType,
+ expand_fn_add_inputs: Optional[bool]) -> bool:
+ """
+ Test correctness of op_type implementation against a grouped gemm
+ reference implementation.
+ """
+ seq_lens_cpu = self.seq_lens.to(device="cpu")
+ prompt_lora_mapping_cpu = self.prompt_lora_mapping.to(device="cpu")
+ ref_output = self.output.clone()
+
+ self.output.zero_()
+ op_type.bench_fn()(
+ **self.bench_fn_kwargs(op_type, expand_fn_add_inputs))
+
+ op_type.run_ref_group_gemm(
+ ref_output,
+ self.input,
+ self.lora_weights_lst,
+ seq_lens_cpu=seq_lens_cpu,
+ prompt_lora_mapping_cpu=prompt_lora_mapping_cpu,
+ scaling=1.0,
+ add_inputs=expand_fn_add_inputs)
+
+ rtol, atol = {
+ torch.float16: (6e-2, 6e-2),
+ torch.bfloat16: (6e-2, 6e-2),
+ torch.float32: (1e-2, 1e-2),
+ }[self.output.dtype]
+
+ return torch.allclose(ref_output, self.output, rtol=rtol, atol=atol)
+
+
+def bench_optype(ctx: BenchmarkContext,
+ arg_pool_size: int,
+ op_type: OpType,
+ cuda_graph_nops: Optional[int] = None,
+ expand_fn_add_inputs: Optional[bool] = None,
+ test_correctness: bool = False) -> TMeasurement:
+
+ assert arg_pool_size >= 1
+ if op_type.is_shrink_fn():
+ assert expand_fn_add_inputs is None
+ else:
+ assert expand_fn_add_inputs is not None
+
+ # BenchmarkContext -> BenchmarkTensors
+ bench_tensors : List[BenchmarkTensors] = \
+ [BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)]
+ for bt in bench_tensors:
+ bt.sanity_check()
+
+ # Test correctness of our implementation.
+ if test_correctness:
+ assert all([
+ bt.test_correctness(op_type, expand_fn_add_inputs)
+ for bt in bench_tensors
+ ])
+
+ # BenchmarkTensors -> Dict (kwargs)
+ kwargs_list = [
+ bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs)
+ for bt in bench_tensors
+ ]
+
+ # Clear LoRA optimization hash-maps.
+ _LORA_A_PTR_DICT.clear()
+ _LORA_B_PTR_DICT.clear()
+    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
+ for kwargs in kwargs_list:
+ op_type.bench_fn()(**kwargs)
+ torch.cuda.synchronize()
+
+ # Merge into a single kwargs and qualify arguments as ArgPool
+ kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
+ for _kwargs in kwargs_list:
+ for k, v in _kwargs.items():
+ kwargs[k].values.append(v)
+
+ describe_args = (f"add_inputs={expand_fn_add_inputs}"
+ if expand_fn_add_inputs is not None else "")
+ description = (
+ f"{op_type.name}({describe_args}) ({bench_tensors[0].io_types()})")
+
+ cuda_graph_params = None
+ if cuda_graph_nops:
+ cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops)
+ timer = None
+ with Bench(cuda_graph_params,
+ ctx.bench_label(), ctx.bench_sublabel(op_type), description,
+ op_type.bench_fn(), **kwargs) as bench:
+ timer = bench.run()
+ return timer
+
+
+def bench_torch_mm(ctx: BenchmarkContext,
+ arg_pool_size: int,
+ op_type: OpType,
+ cuda_graph_nops: Optional[int] = None) -> TMeasurement:
+ """
+ Benchmark basic torch.mm as a roofline.
+
+ When all the input tokens have the same LoRA ID, the LoRA kernels are just
+ a matmul. This torch.mm benchmark serves as a roofline for that case.
+
+    The input op_type is used to determine the m, k, n dimensions for the matmul.
+ """
+
+ batch_size, hidden_size, lora_rank, seq_length, dtype = (ctx.batch_size,
+ ctx.hidden_size,
+ ctx.lora_rank,
+ ctx.seq_length,
+ ctx.dtype)
+
+ m, k, n = op_type.mkn(batch_size, seq_length, hidden_size, lora_rank)
+ # For a fairer comparison.
+ n = n * ctx.num_slices
+
+ # Get matmul input and output tensors for A x B = C
+ As, Bs, Cs = [], [], []
+ for _ in range(arg_pool_size):
+ As.append(torch.rand((m, k), dtype=dtype).to("cuda"))
+ Bs.append(torch.rand((n, k), dtype=dtype).to("cuda").t())
+ Cs.append(torch.rand((m, n), dtype=dtype).to("cuda"))
+
+ # Make torch.mm kwargs
+ mm_kwargs = {'input': ArgPool(As), 'mat2': ArgPool(Bs), 'out': ArgPool(Cs)}
+
+ description = (
+ f"single-lora roofline using torch.mm ({dtype_to_str(dtype)}"
+ f"x{dtype_to_str(dtype)}"
+ f"=>{dtype_to_str(dtype)})")
+ cuda_graph_params = None
+ if cuda_graph_nops:
+ cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops)
+ with Bench(cuda_graph_params, ctx.bench_label(),
+ ctx.bench_sublabel(op_type), description, torch.mm,
+ **mm_kwargs) as bench:
+ return bench.run()
+
+
+# runner
+def use_cuda_graph_recommendation() -> str:
+ return """
+    Triton kernels have a significant launch overhead when
+    launched directly via Python. This overhead is more noticeable
+    for small problem sizes. For these cases, it is recommended
+ to use the script with `--cuda-graph-nops N` to benchmark N
+ consecutive invocations of the benchmarking operations from
+ inside a CUDA Graph. Note that the returned measurement is for N
+ invocations of the operation.
+ """
+
+
+def print_timers(timers: List[TMeasurement],
+ args: Optional[argparse.Namespace] = None):
+ compare = TBenchmark.Compare(timers)
+ compare.print()
+
+ if args and args.cuda_graph_nops:
+        print(
+            f"Note: The timings reported above are for {args.cuda_graph_nops} "
+            "consecutive invocations of the benchmarking functions. "
+            f"Please divide by {args.cuda_graph_nops} for single-invocation "
+            "timings.")
+
+ print("Note on Comparison with torch.mm : The torch.mm numbers are "
+ "benchmark numbers of a simple matmul emulating the single lora "
+ "case. It is provided as a roofline for comparing our LoRA Kernel "
+ "implementations. It is expected that the LoRA kernels will be "
+ "slower than torch.mm in cases where num_loras is big. But for "
+ "small num_loras the goal should be to match the torch.mm numbers.")
+
+
+def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]):
+
+ if args.cuda_graph_nops is not None:
+ assert args.cuda_graph_nops > 0
+ print(f"Benchmarking {args.cuda_graph_nops} invocations inside a CUDA "
+ "Graph")
+ else:
+ print(f"CUDA Graphs not enabled.\n{use_cuda_graph_recommendation()}")
+
+ timers = []
+ for bench_ctx in bench_ctxs:
+ for seq_len in args.seq_lengths:
+ bench_ops: List[OpType] = []
+ if seq_len == 1:
+ # bench all decode ops
+ bench_ops = [op for op in args.op_types if op.is_decode_op()]
+ else:
+ # bench all prefill ops
+ bench_ops = [op for op in args.op_types if op.is_prefill_op()]
+
+ seq_len_timers = []
+ for bench_op in bench_ops:
+ for num_slices in bench_op.num_slices():
+ _ctx = bench_ctx.with_seq_length(seq_len).with_num_slices(
+ num_slices)
+ # Benchmark torch.mm as a roofline
+ seq_len_timers.append(
+ bench_torch_mm(_ctx, args.arg_pool_size, bench_op,
+ args.cuda_graph_nops))
+
+ # Benchmark bench_op
+ expand_fn_add_inputs = [
+ None
+ ] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs
+ for add_input_arg in expand_fn_add_inputs:
+ seq_len_timers.append(
+ bench_optype(_ctx, args.arg_pool_size, bench_op,
+ args.cuda_graph_nops, add_input_arg,
+ args.test_correctness))
+
+ print_timers(seq_len_timers)
+ timers.extend(seq_len_timers)
+
+ # Result stdout dump
+ print("== All Results ====")
+ print_timers(timers, args)
+
+ if args.output_directory:
+ # Result file dump
+ od = Path(args.output_directory)
+ if not od.exists():
+ od.mkdir()
+
+ timestamp = int(time.time())
+ pkl_file = od / f"lora_bench-{timestamp}.pkl"
+ print(f"Writing benchmarks to {pkl_file}")
+ with open(pkl_file, "wb") as f:
+ pickle.dump(timers, f)
+
+
+def as_benchmark_contexts(hidden_sizes: List[int], lora_ranks: List[int],
+ args: argparse.Namespace) -> List[BenchmarkContext]:
+
+ ctxs: List[BenchmarkContext] = []
+ for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product( # noqa
+ args.batch_sizes, list(hidden_sizes), lora_ranks, args.num_loras,
+ args.sort_by_lora_id):
+ ctxs.append(
+ BenchmarkContext(
+ batch_size=batch_size,
+ hidden_size=hidden_size,
+ lora_rank=lora_rank,
+ num_loras=num_loras,
+ num_active_loras=args.num_active_loras
+ if args.num_active_loras else num_loras,
+ # To be filled based on the OpType to benchmark
+ seq_length=None,
+ sort_by_lora_id=sort_by_lora_id,
+ dtype=args.dtype,
+ # To be filled based on the OpType to benchmark
+ num_slices=None))
+
+ return ctxs
+
+
+def run_list_bench(args: argparse.Namespace):
+ print(args)
+
+ print("List bench :\n"
+ f" Hidden Sizes {args.hidden_sizes}"
+ f" LoRA Ranks {args.lora_ranks}")
+
+ # Get all benchmarking contexts
+ bench_contexts: List[BenchmarkContext] = as_benchmark_contexts(
+ hidden_sizes=args.hidden_sizes, lora_ranks=args.lora_ranks, args=args)
+
+ run(args, bench_contexts)
+
+
+def run_range_bench(args: argparse.Namespace):
+ print(args)
+
+ hidden_sizes = list(
+ range(args.hidden_sizes_start, args.hidden_sizes_end + 1,
+ args.hidden_sizes_increment))
+ lora_ranks = list(
+ range(args.lora_ranks_start, args.lora_ranks_end + 1,
+ args.lora_ranks_increment))
+
+ print("Range bench :\n"
+ f" Hidden Sizes {hidden_sizes}"
+ f" LoRA Ranks {lora_ranks}")
+
+ # Get all benchmarking contexts
+ bench_contexts: List[BenchmarkContext] = as_benchmark_contexts(
+ hidden_sizes=hidden_sizes, lora_ranks=lora_ranks, args=args)
+
+ run(args, bench_contexts)
+
+
+def run_model_bench(args: argparse.Namespace):
+ print(args)
+
+ def hidden_sizes_from_model(model: str, tp_size: int) -> set[int]:
+ hidden_sizes = set()
+ for KN, tp_split_dim in WEIGHT_SHAPES[model]:
+ KN[tp_split_dim] = KN[tp_split_dim] // tp_size
+ hidden_sizes.add(KN[1])
+ return hidden_sizes
+
+ # Get all hidden sizes
+ hidden_sizes: set[int] = set()
+ for model_name, tp_size in product(args.models, args.tp_sizes):
+ hidden_sizes = hidden_sizes.union(
+ hidden_sizes_from_model(model_name, tp_size))
+
+ print("Model bench :\n"
+ f" Hidden Sizes {hidden_sizes}"
+ f" LoRA Ranks {args.lora_ranks}")
+
+ # Get all benchmarking contexts
+ bench_contexts: List[BenchmarkContext] = as_benchmark_contexts(
+ hidden_sizes=hidden_sizes, lora_ranks=args.lora_ranks, args=args)
+
+ run(args, bench_contexts)
+
+
+if __name__ == '__main__':
+
+ def to_torch_dtype(dt):
+ if dt == "torch.float16":
+ return torch.float16
+ if dt == "torch.bfloat16":
+ return torch.bfloat16
+ raise ValueError("unsupported dtype")
+
+ def get_bool(s: str) -> bool:
+ return s.lower() in ['true', '1']
+
+ def add_common_command_args(p: argparse.ArgumentParser):
+ p.add_argument(
+ "--dtype",
+ type=to_torch_dtype,
+ required=True,
+ help="Available options are ['torch.float16', 'torch.bfloat16']")
+
+ p.add_argument(
+ "--arg-pool-size",
+ type=int,
+ default=32,
+ help="Run profiles with a pool of input/output/meta tensors instead"
+ "of simply reusing the same tensors for all runs. A bigger arg-pool"
+ "mitigates hardware caching effects during benchmarking.")
+
+ p.add_argument(
+ "--cuda-graph-nops",
+ type=int,
+ help=("when set profiling is done using cudagraph, "
+ "with the given number of operations in a graph."
+ "Note that the measurement returned is the time "
+ "taken for N consecutive executions of the benchmarking "
+ "functions, where N is the value of this argument."))
+ p.add_argument("--num-loras",
+ nargs="+",
+ type=int,
+ default=DEFAULT_NUM_LORAS)
+ p.add_argument("--num-active-loras",
+ type=int,
+ default=None,
+ help="Active LoRAs. When None, all LoRAs are active")
+ p.add_argument("--sort-by-lora-id",
+ nargs="+",
+ type=get_bool,
+ default=DEFAULT_SORT_BY_LORA_IDS)
+ p.add_argument("--op-types",
+ nargs="+",
+ type=OpType.from_str,
+ default=list(OpType))
+ p.add_argument('--seq-lengths',
+ nargs="+",
+ type=int,
+ default=DEFAULT_SEQ_LENGTHS)
+ p.add_argument("--batch-sizes",
+ nargs="+",
+ type=int,
+ default=DEFAULT_BATCH_SIZES)
+ p.add_argument("--expand-fn-add-inputs",
+ nargs="+",
+ type=get_bool,
+ default=DEFAULT_EXPAND_FN_ADD_INPUTS)
+ p.add_argument(
+ '-o',
+ '--output-directory',
+ type=str,
+ help=("Output directory to store a the list of benchmarking"
+ "TMeasurement objects as a pickle file"))
+
+ p.add_argument(
+ "--test-correctness",
+ action='store_true',
+ help=("When enabled, the benchmarking functions are tested"
+ "for correctness before the actual benchmarking"))
+
+ parser = FlexibleArgumentParser(
+ description=f"""
+Benchmark LoRA kernels:
+ {use_cuda_graph_recommendation()}
+
+ list_bench example:
+ python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
+
+ model_bench example:
+ python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
+
+ range_bench example:
+ python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8
+ """, # noqa: E501
+ formatter_class=argparse.RawTextHelpFormatter)
+
+ subparsers = parser.add_subparsers(dest="cmd", required=True)
+
+ list_parser = subparsers.add_parser("list_bench")
+ list_parser.add_argument("--hidden-sizes",
+ nargs="+",
+ type=int,
+ default=DEFAULT_HIDDEN_SIZES)
+ list_parser.add_argument("--lora-ranks",
+ nargs="+",
+ type=int,
+ default=DEFAULT_LORA_RANKS)
+ add_common_command_args(list_parser)
+ list_parser.set_defaults(func=run_list_bench)
+
+ range_parser = subparsers.add_parser("range_bench")
+ range_parser.add_argument("--hidden-sizes-start", type=int, required=True)
+ range_parser.add_argument("--hidden-sizes-end", type=int, required=True)
+ range_parser.add_argument("--hidden-sizes-increment",
+ type=int,
+ required=True)
+ range_parser.add_argument("--lora-ranks-start", type=int, required=True)
+ range_parser.add_argument("--lora-ranks-end", type=int, required=True)
+ range_parser.add_argument("--lora-ranks-increment",
+ type=int,
+ required=True)
+ add_common_command_args(range_parser)
+ range_parser.set_defaults(func=run_range_bench)
+
+ model_parser = subparsers.add_parser("model_bench")
+ model_parser.add_argument("--models",
+ nargs="+",
+ type=str,
+ default=DEFAULT_MODELS,
+ choices=WEIGHT_SHAPES.keys())
+ model_parser.add_argument("--tp-sizes",
+ nargs="+",
+ type=int,
+ default=DEFAULT_TP_SIZES)
+ model_parser.add_argument("--lora-ranks",
+ nargs="+",
+ type=int,
+ default=DEFAULT_LORA_RANKS)
+ add_common_command_args(model_parser)
+ model_parser.set_defaults(func=run_model_bench)
+
+ args = parser.parse_args()
+ args.func(args)
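For readers who want to inspect the pickled results later, here is a minimal sketch of re-loading and re-printing them. The timestamped path is an illustrative assumption; the file-name pattern and object types follow the script above.

```python
import pickle

import torch.utils.benchmark as TBenchmark

# Assumed example path; real files are named lora_bench-<timestamp>.pkl under
# the directory passed via --output-directory.
with open("results/lora_bench-1700000000.pkl", "rb") as f:
    timers = pickle.load(f)  # List[torch.utils.benchmark.Measurement]

# Render the same comparison table that print_timers() prints during the run.
TBenchmark.Compare(timers).print()
```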
diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py
new file mode 100644
index 0000000000000..fee877b6f76fa
--- /dev/null
+++ b/benchmarks/kernels/utils.py
@@ -0,0 +1,210 @@
+import dataclasses
+from typing import Any, Callable, Iterable, Optional
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+
+
+@dataclasses.dataclass
+class CudaGraphBenchParams:
+ num_ops_in_cuda_graph: int
+
+
+@dataclasses.dataclass
+class ArgPool:
+ """
+ When some argument of the benchmarking function is annotated with this type,
+ the benchmarking class (BenchMM) will collapse the argument to a pick a
+ single value from the given list of values, during function invocation.
+ For every invocation during a benchmarking run, it will choose a
+ different value from the list.
+ """
+ values: Iterable[Any]
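+    # Illustrative example: passing ArgPool([t0, t1, t2]) as a kwarg to Bench
+    # makes invocation i use element i % 3, so successive runs touch different
+    # tensors and hardware caching effects are reduced.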
+
+ def __getitem__(self, index):
+ return self.values[index]
+
+
+class Bench:
+
+ class ArgsIterator:
+
+ def __init__(self, args_list, kwargs_list):
+ assert len(args_list) == len(kwargs_list)
+ self.args_list = args_list
+ self.kwargs_list = kwargs_list
+ self.n = len(self.args_list)
+ self.idx = 0
+
+ def __next__(self):
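+            # Note: because of the yield below, calling __next__() returns an
+            # infinite generator that cycles through the (args, kwargs) pool;
+            # callers advance it with next(), as in get_cuda_graph_runner()
+            # and run_eager().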
+ while True:
+ yield (self.args_list[self.idx], self.kwargs_list[self.idx])
+ self.idx += 1
+ self.idx = self.idx % self.n
+
+ def reset(self):
+ self.idx = 0
+
+ @property
+ def n_args(self):
+ return self.n
+
+ def __init__(self, cuda_graph_params: Optional[CudaGraphBenchParams],
+ label: str, sub_label: str, description: str, fn: Callable,
+ *args, **kwargs):
+
+ self.cuda_graph_params = cuda_graph_params
+ self.use_cuda_graph = self.cuda_graph_params is not None
+ self.label = label
+ self.sub_label = sub_label
+ self.description = description
+ self.fn = fn
+
+ # Process args
+ self._args = args
+ self._kwargs = kwargs
+ self.args_list, self.kwargs_list = self.collapse_argpool(
+ *args, **kwargs)
+ self.args_iterator = self.ArgsIterator(self.args_list,
+ self.kwargs_list)
+
+ # Cudagraph runner
+ self.g = None
+ if self.use_cuda_graph:
+ self.g = self.get_cuda_graph_runner()
+
+ # benchmark run params
+ self.min_run_time = 1
+
+ def collapse_argpool(self, *args, **kwargs):
+ argpool_args = [arg for arg in args if isinstance(arg, ArgPool)] + [
+ arg for arg in kwargs.values() if isinstance(arg, ArgPool)
+ ]
+ if len(argpool_args) == 0:
+ return [args], [kwargs]
+
+ # Make sure all argpools are of the same size
+ argpool_size = len(argpool_args[0].values)
+ assert all([argpool_size == len(arg.values) for arg in argpool_args])
+
+ # create copies of the args
+ args_list = []
+ kwargs_list = []
+ for _ in range(argpool_size):
+ args_list.append(args)
+ kwargs_list.append(kwargs.copy())
+
+ for i in range(argpool_size):
+ # collapse args; Just pick the ith value
+ args_list[i] = tuple([
+ arg[i] if isinstance(arg, ArgPool) else arg
+ for arg in args_list[i]
+ ])
+
+ # collapse kwargs
+ kwargs_i = kwargs_list[i]
+ arg_pool_keys = [
+ k for k, v in kwargs_i.items() if isinstance(v, ArgPool)
+ ]
+ for k in arg_pool_keys:
+ # again just pick the ith value
+ kwargs_i[k] = kwargs_i[k][i]
+ kwargs_list[i] = kwargs_i
+
+ return args_list, kwargs_list
+
+ def get_cuda_graph_runner(self):
+ assert self.use_cuda_graph
+ assert self.args_iterator is not None
+
+ num_graph_ops = self.cuda_graph_params.num_ops_in_cuda_graph
+
+ # warmup
+ args_it = self.args_iterator.__next__()
+ for _ in range(2):
+ args, kwargs = next(args_it)
+ self.fn(*args, **kwargs)
+
+ self.args_iterator.reset()
+ args_it = self.args_iterator.__next__()
+ stream = torch.cuda.Stream()
+ with torch.cuda.stream(stream):
+ g = torch.cuda.CUDAGraph()
+ with torch.cuda.graph(g):
+ for _ in range(num_graph_ops):
+ args, kwargs = next(args_it)
+ self.fn(*args, **kwargs)
+ return g
+
+    def run_cudagraph(self) -> TMeasurement:
+ assert self.use_cuda_graph
+ globals = {'g': self.g}
+
+ return TBenchmark.Timer(
+ stmt="g.replay()",
+ globals=globals,
+ label=(
+ f"{self.label}"
+ f" | cugraph {self.cuda_graph_params.num_ops_in_cuda_graph} ops"
+ ),
+ sub_label=self.sub_label,
+ description=self.description,
+ ).blocked_autorange(min_run_time=self.min_run_time)
+
+ def run_eager(self) -> TMeasurement:
+ setup = None
+ stmt = None
+ globals = None
+
+ has_arg_pool = self.args_iterator.n_args > 1
+ if has_arg_pool:
+ setup = '''
+ args_iterator.reset()
+ args_it = args_iterator.__next__()
+ '''
+ stmt = '''
+ args, kwargs = next(args_it)
+ fn(*args, **kwargs)
+ '''
+ globals = {'fn': self.fn, 'args_iterator': self.args_iterator}
+ else:
+ # no arg pool. Just use the args and kwargs directly
+ self.args_iterator.reset()
+ args_it = self.args_iterator.__next__()
+ args, kwargs = next(args_it)
+
+ setup = ""
+ stmt = '''
+ fn(*args, **kwargs)
+ '''
+ globals = {'fn': self.fn, 'args': args, 'kwargs': kwargs}
+
+ return TBenchmark.Timer(
+ stmt=stmt,
+ setup=setup,
+ globals=globals,
+ label=self.label,
+ sub_label=self.sub_label,
+ description=self.description,
+ ).blocked_autorange(min_run_time=self.min_run_time)
+
+ def run(self) -> TMeasurement:
+ timer = None
+ if self.use_cuda_graph: # noqa SIM108
+            timer = self.run_cudagraph()
+ else:
+ timer = self.run_eager()
+ if not timer.meets_confidence() or timer.has_warnings:
+ print("Doesn't meet confidence - re-running bench ...")
+ return self.run()
+ return timer
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ if exc_type:
+ print(f"exc type {exc_type}")
+ print(f"exc value {exc_value}")
+ print(f"exc traceback {traceback}")
From 62b06ba23deaca5a0e7602cd2e3a85aeec57f306 Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Fri, 17 Jan 2025 01:14:48 +0800
Subject: [PATCH 44/67] [Model] Add support for deepseek-vl2-tiny model
(#12068)
Signed-off-by: Isotr0py <2037008807@qq.com>
---
docs/source/models/supported_models.md | 5 ++---
examples/offline_inference/vision_language.py | 2 +-
.../vision_language_multi_image.py | 2 +-
.../vision_language/test_models.py | 20 +++++++++----------
tests/models/registry.py | 3 +--
vllm/model_executor/models/deepseek_vl2.py | 11 +++++++---
6 files changed, 22 insertions(+), 21 deletions(-)
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 85d844f3d3f55..d07cde3db5c6e 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -618,7 +618,7 @@ See [this page](#generative-models) for more information on how to use generativ
* - `DeepseekVLV2ForCausalLM`
- DeepSeek-VL2
- T + I+
- - `deepseek-ai/deepseek-vl2-tiny`(WIP), `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note)
+ - `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note)
-
- ✅︎
- ✅︎
@@ -768,9 +768,8 @@ See [this page](#generative-models) for more information on how to use generativ
+ Multiple items can be inputted per text prompt for this modality.
````{note}
-The `deepseek-ai/deepseek-vl2-tiny` is not supported yet.
-
To use `DeepSeek-VL2` series models, you need to install a fork version `deepseek_vl2` package:
+
```shell
pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git
```
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index ad32b9fe242e9..8bc715a50e0db 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -70,7 +70,7 @@ def run_chameleon(question: str, modality: str):
def run_deepseek_vl2(question: str, modality: str):
assert modality == "image"
- model_name = "deepseek-ai/deepseek-vl2-small"
+ model_name = "deepseek-ai/deepseek-vl2-tiny"
llm = LLM(model=model_name,
max_model_len=4096,
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index c6cf3f30c31cb..33ef5f316f040 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -55,7 +55,7 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData:
def load_deepseek_vl2(question: str, image_urls: List[str]):
- model_name = "deepseek-ai/deepseek-vl2-small"
+ model_name = "deepseek-ai/deepseek-vl2-tiny"
llm = LLM(model=model_name,
max_model_len=4096,
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 7620ed1107e8f..5710303548c34 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -9,6 +9,7 @@
import pytest
from transformers import AutoModelForVision2Seq
+from transformers import __version__ as TRANSFORMERS_VERSION
from transformers.utils import is_flash_attn_2_available
from vllm.platforms import current_platform
@@ -189,30 +190,27 @@
dtype="bfloat16",
),
"deepseek_vl_v2": VLMTestInfo(
- models=["deepseek-ai/deepseek-vl2-small"],
+ models=["deepseek-ai/deepseek-vl2-tiny"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
- dtype="bfloat16",
prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
single_image_prompts=IMAGE_ASSETS.prompts({
- "stop_sign": "\nWhat's the color of the stop sign and car?",
- "cherry_blossom": "\nWhat's the color of the tower?",
+ "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501
+ "cherry_blossom": "\nPlease infer the season with reason in details.", # noqa: E501
}),
- multi_image_prompt="image_1:\nimage_2:\nDescribe the two images shortly.", # noqa: E501
+ multi_image_prompt="image_1:\nimage_2:\nWhich image can we see the car and the tower?", # noqa: E501
vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}}, # noqa: E501
- image_size_factors=[(0.10, 0.15)],
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501
- num_logprobs=5,
+ image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
marks=[
pytest.mark.skipif(
- not is_flash_attn_2_available(),
- reason="Model needs flash-attn for numeric convergence.",
- ),
- large_gpu_mark(min_gb=48),
+ TRANSFORMERS_VERSION >= "4.48.0",
+ reason="HF model is not compatible with transformers>=4.48.0",
+ )
],
),
"fuyu": VLMTestInfo(
diff --git a/tests/models/registry.py b/tests/models/registry.py
index b0f0f9767a90f..938c838617e8b 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -181,8 +181,7 @@ class _HfExamplesInfo:
trust_remote_code=True),
"ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b",
is_available_online=False),
- # TODO(Isotr0py): Use deepseek-vl2-tiny for test after it's supported
- "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-small"), # noqa: E501
+ "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny"), # noqa: E501
"FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
"H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"),
"InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 99fa941c055d2..4553695022169 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -356,13 +356,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
f"Only 2D tile_tag is supported currently, got: {self.tile_tag}"
)
+ if self.text_config.topk_method == "noaux_tc":
+ architectures = ["DeepseekV3ForCausalLM"]
+ elif not self.text_config.use_mla:
+ architectures = ["DeepseekForCausalLM"]
+ else:
+ architectures = ["DeepseekV2ForCausalLM"]
+
self.language_model = init_vllm_registered_model(
vllm_config=vllm_config,
hf_config=self.text_config,
prefix=maybe_prefix(prefix, "language"),
- architectures=["DeepseekV3ForCausalLM"]
- if self.text_config.topk_method == "noaux_tc" else
- ["DeepseekV2ForCausalLM"],
+ architectures=architectures,
)
self.make_empty_intermediate_tensors = (
From d06e824006d1ba4b92871347738ce1b89f658499 Mon Sep 17 00:00:00 2001
From: Chen Zhang
Date: Fri, 17 Jan 2025 04:30:08 +0800
Subject: [PATCH 45/67] [Bugfix] Set enforce_eager automatically for mllama
(#12127)
Signed-off-by: Chen Zhang
---
examples/offline_inference/vision_language.py | 1 -
examples/offline_inference/vision_language_multi_image.py | 1 -
vllm/config.py | 8 +++++---
3 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 8bc715a50e0db..69228bbf22949 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -325,7 +325,6 @@ def run_mllama(question: str, modality: str):
model=model_name,
max_model_len=4096,
max_num_seqs=16,
- enforce_eager=True,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 33ef5f316f040..cf3c5dd4e0a2c 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -186,7 +186,6 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData:
model=model_name,
max_model_len=4096,
max_num_seqs=16,
- enforce_eager=True,
limit_mm_per_prompt={"image": len(image_urls)},
)
diff --git a/vllm/config.py b/vllm/config.py
index a5f2161068d2a..79754bd04102f 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -607,10 +607,12 @@ def _verify_cuda_graph(self) -> None:
self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
self.max_model_len)
- if (self.hf_config.model_type == 'deepseek_v3'
+ MODEL_NOT_SUPPORT_CUDA_GRAPH = ['deepseek_v3', 'mllama']
+ if (self.hf_config.model_type in MODEL_NOT_SUPPORT_CUDA_GRAPH
and not self.enforce_eager):
- logger.warning("CUDA graph is not supported for Deepseek V3 yet, "
- "fallback to the eager mode.")
+ logger.warning(
+ "CUDA graph is not supported for %s yet, fallback to the eager "
+ "mode.", self.hf_config.model_type)
self.enforce_eager = True
def _verify_bnb_config(self) -> None:
From ebc73f2828df48f0ffbb99e52f0e4b394a23dbd3 Mon Sep 17 00:00:00 2001
From: Kuntai Du
Date: Fri, 17 Jan 2025 11:12:41 +0800
Subject: [PATCH 46/67] [Bugfix] Fix a path bug in disaggregated prefill
example script. (#12121)
Signed-off-by: Kuntai Du
---
examples/online_serving/disaggregated_prefill.sh | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/examples/online_serving/disaggregated_prefill.sh b/examples/online_serving/disaggregated_prefill.sh
index 87155273a81d1..2bb2824c6c86f 100644
--- a/examples/online_serving/disaggregated_prefill.sh
+++ b/examples/online_serving/disaggregated_prefill.sh
@@ -3,6 +3,8 @@
# We will launch 2 vllm instances (1 for prefill and 1 for decode),
# and then transfer the KV cache between them.
+set -xe
+
echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧"
sleep 1
@@ -69,7 +71,7 @@ wait_for_server 8200
# instance
# NOTE: the usage of this API is subject to change --- in the future we will
# introduce "vllm connect" to connect between prefill and decode instances
-python3 ../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py &
+python3 ../../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py &
sleep 1
# serve two example requests
From 4db525d198c368348a543b987821761fbfc85bcb Mon Sep 17 00:00:00 2001
From: Sanju C Sudhakaran
Date: Fri, 17 Jan 2025 09:37:02 +0530
Subject: [PATCH 47/67] Clean-up LoRA flow (#518)
This PR cleans up LoRA flow by removing unnecessary functions and
variables.
Removed special handling of `max_num_batched_tokens` for HPU in
`models.py` since we internally handle this in PunicaWrapperHPU
[PR](https://github.com/vllm-project/vllm/blob/d51d66c3252107d5b986d2eab7af1c210dceb708/vllm/lora/punica_wrapper/punica_hpu.py#L17)
Removed `convert_mapping` from `models.py` based on this
[PR](https://github.com/vllm-project/vllm/pull/5036/files#:~:text=def%20convert_mapping)
Co-authored-by: Vivek Goel
---
vllm/lora/layers.py | 1 -
vllm/lora/models.py | 119 +-------------------------------
vllm/worker/hpu_model_runner.py | 9 ---
3 files changed, 2 insertions(+), 127 deletions(-)
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index 538aad5290f85..5b915df175181 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -232,7 +232,6 @@ def set_lora(
def forward(self, x: torch.Tensor) -> torch.Tensor:
added_tokens_mask = x > self.base_layer.org_vocab_size - 1
- embeddings_indices = None
embeddings_indices = self.punica_wrapper.embeddings_indices
indices = embeddings_indices[1].view_as(x)
full_lora_a_embeddings = F.embedding(
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 18028c1570df2..5b7225bdc8f37 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -4,8 +4,7 @@
import os
import re
from dataclasses import dataclass, field
-from typing import (Any, Callable, Dict, List, Optional, Sequence, Tuple, Type,
- Union)
+from typing import Any, Callable, Dict, List, Optional, Sequence, Type, Union
import safetensors.torch
import torch
@@ -30,7 +29,6 @@
from vllm.model_executor.models import SupportsLoRA, supports_multimodal
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper
-from vllm.platforms import current_platform
from vllm.utils import is_pin_memory_available
logger = init_logger(__name__)
@@ -50,116 +48,6 @@ class LongContextLoRAContext:
offsets_by_lora_id: Dict[int, int] = field(default_factory=dict)
-def convert_mapping(
- mapping: LoRAMapping,
- lora_index_to_id: List[Optional[int]],
- max_loras: int,
- vocab_size: int,
- extra_vocab_size: int,
- long_lora_context: Optional[LongContextLoRAContext] = None,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor,
- Optional[torch.Tensor], List[int]]:
- """Converts LoRAMapping to index tensors.
-
- Args:
- mapping: LoRAMapping mapping rows in a batch to LoRA ids.
- lora_index_to_id: List mapping LoRA ids to LoRA indices.
- max_loras: Maximum number of LoRAs.
- vocab_size: Model vocab size.
- extra_vocab_size: Extra vocab size each LoRA can have.
- long_lora_context: Passed if there are long context lora in a batch.
-
- Returns:
- A tuple of tensors:
- base_indices: Tensor of shape [batch_size] mapping batch rows to
- LoRA indices.
- sampler_indices: Tensor of shape [batch_size] mapping requests to
- LoRA indices for sampler. For generation, this will be the
- same as base_indicies. For prefill, this will map requests
- to LoRA indices.
- sampler_indices_padded: Tensor of shape [batch_size] mapping
- requests to LoRA indices for sampler with padding.
- Same as sampler_indicies, but -1 is replaced with
- max_loras.
- embeddings_indices: Tensor of shape [2, batch_size] mapping
- requests to embedding indices. First row is for embeddings
- added by the LoRAs, second row is for the LoRA.lora_a
- embeddings.
- long_lora_indices: Tensor of shape [batch_size] mapping
- requests to RoPE offsets and rot dims for long LoRAs.
- None if long context lora doesn't exist.
- indices_len: List of lengths of the above tensors.
- Used to index into each tensor. It contains length for
- (base_indices, sampler_indices, sampler_indices_padded,
- embeddings_indices, long_lora_indices). If long_lora doesn't
- exist, it only contains first 4 entries.
- """
- index_mapping_indices: List[int] = list(mapping.index_mapping).copy()
- embedding_indices = index_mapping_indices.copy()
- lora_indices = index_mapping_indices.copy()
- long_lora_offsets: Optional[torch.Tensor] = None
- device = "hpu" if current_platform.is_hpu() else "cuda"
- if long_lora_context:
- long_lora_offsets = torch.zeros(len(index_mapping_indices),
- device=device,
- dtype=torch.long)
- prompt_mapping: List[int] = [
- lora_index_to_id.index(x) if x > 0 else -1
- for x in mapping.prompt_mapping
- ]
- lora_idx = None
- for i in range(len(index_mapping_indices)):
- # TODO index can be slow. optimize
- lora_idx = (lora_index_to_id.index(index_mapping_indices[i])
- if index_mapping_indices[i] > 0 else -1)
- embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0
- lora_indices[i] = lora_idx
- if long_lora_context:
- assert long_lora_offsets is not None
- lora_offset: int = long_lora_context.offsets_by_lora_id.get(
- index_mapping_indices[i], 0)
- long_lora_offsets[i] = lora_offset
-
- indices_list: List[Union[List[int], torch.Tensor]] = [
- index_mapping_indices, lora_indices, embedding_indices
- ]
- if long_lora_context:
- assert long_lora_offsets is not None
- indices_list.append(long_lora_offsets)
- indices = torch.tensor(indices_list, dtype=torch.long, device=device)
- prompt_mapping_tensor = torch.tensor(prompt_mapping,
- device=device,
- dtype=torch.long)
- embeddings_indices = torch.stack([
- indices[2] * extra_vocab_size,
- indices[2] * (vocab_size + extra_vocab_size)
- ])
- embeddings_indices[embeddings_indices == -1] = max_loras - 1
- base_indices = indices[1]
- sampler_indices = prompt_mapping_tensor
- sampler_indices_padded = sampler_indices.clone()
- sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1
- sampler_indices_padded = (
- torch.arange(
- 0, len(sampler_indices_padded), device=device, dtype=torch.long) +
- (sampler_indices_padded * len(sampler_indices_padded)))
- long_lora_indices = None
- long_lora_indices_len: Optional[int] = None
- if long_lora_context:
- long_lora_indices = indices[3]
- long_lora_indices_len = long_lora_indices.shape[-1]
- # Contain length of indices tensors. Used to index into each tensor.
- indices_len = [
- base_indices.shape[-1], sampler_indices.shape[-1],
- sampler_indices_padded.shape[-1], embeddings_indices.shape[-1]
- ]
- if long_lora_indices_len is not None:
- indices_len.append(long_lora_indices_len)
-
- return (base_indices, sampler_indices, sampler_indices_padded,
- embeddings_indices, long_lora_indices, indices_len)
-
-
def get_lora_id():
global _GLOBAL_LORA_ID
_GLOBAL_LORA_ID += 1
@@ -440,10 +328,7 @@ def __init__(
self.lora_index_to_id: List[Optional[int]] = [None] * self.lora_slots
self.vocab_size = vocab_size
self.long_lora_context: Optional[LongContextLoRAContext] = None
- punica_max_num_batched_tokens = max_num_batched_tokens
- if current_platform.is_hpu():
- punica_max_num_batched_tokens = 3 * max_num_batched_tokens
- self.punica_wrapper = get_punica_wrapper(punica_max_num_batched_tokens,
+ self.punica_wrapper = get_punica_wrapper(max_num_batched_tokens,
max_batches=self.max_num_seqs,
device=self.device)
# Scaling factor -> offset to the sin_cos_cache to it.
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 8b3d7ba9c422c..3afded0118724 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1914,15 +1914,6 @@ def get_counter_dict(self, cache_config, duration, seq_len,
return counters
-def unwrap_model(model):
- if isinstance(model, torch._dynamo.eval_frame.OptimizedModule):
- return unwrap_model(model._orig_mod)
- else:
- model = list(vars(model)['_modules'].values())[0]
- modules = list(vars(model)['_modules'].values())
- return modules
-
-
class HPUModelRunner(HPUModelRunnerBase[ModelInputForHPUWithSamplingMetadata]):
"""
GPU model runner with sampling step.
From fead53ba78dbcdd4da616308f1ef1b4a312f8897 Mon Sep 17 00:00:00 2001
From: Kunshang Ji
Date: Fri, 17 Jan 2025 12:15:09 +0800
Subject: [PATCH 48/67] [CI]add genai-perf benchmark in nightly benchmark
(#10704)
Signed-off-by: Kunshang Ji
---
.../scripts/run-nightly-benchmarks.sh | 107 ++++++++++++++++++
.../tests/genai-perf-tests.json | 23 ++++
requirements-test.in | 3 +
requirements-test.txt | 67 ++++++++++-
4 files changed, 196 insertions(+), 4 deletions(-)
create mode 100644 .buildkite/nightly-benchmarks/tests/genai-perf-tests.json
diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
index 3f38cf5137535..32bd34c431c89 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -301,6 +301,104 @@ run_serving_tests() {
kill_gpu_processes
}
+run_genai_perf_tests() {
+ # run genai-perf tests
+
+ # $1: a json file specifying genai-perf test cases
+ local genai_perf_test_file
+ genai_perf_test_file=$1
+
+ # Iterate over genai-perf tests
+ jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
+ # get the test name, and append the GPU type back to it.
+ test_name=$(echo "$params" | jq -r '.test_name')
+
+ # if TEST_SELECTOR is set, only run the test cases that match the selector
+ if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+ echo "Skip test case $test_name."
+ continue
+ fi
+
+ # prepend the current serving engine to the test name
+ test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
+
+ # get common parameters
+ common_params=$(echo "$params" | jq -r '.common_parameters')
+ model=$(echo "$common_params" | jq -r '.model')
+ tp=$(echo "$common_params" | jq -r '.tp')
+ dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+ dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+ port=$(echo "$common_params" | jq -r '.port')
+ num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+ reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
+
+ # get client and server arguments
+ server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
+ qps_list=$(echo "$params" | jq -r '.qps_list')
+ qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+ echo "Running over qps list $qps_list"
+
+ # check if there is enough GPU to run the test
+ if [[ $gpu_count -lt $tp ]]; then
+ echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
+ continue
+ fi
+
+ if [[ $reuse_server == "true" ]]; then
+ echo "Reuse previous server for test case $test_name"
+ else
+ kill_gpu_processes
+ bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
+ "$server_params" "$common_params"
+ fi
+
+ if wait_for_server; then
+ echo ""
+ echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
+ else
+ echo ""
+ echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
+ break
+ fi
+
+ # iterate over different QPS
+ for qps in $qps_list; do
+ # remove the surrounding single quote from qps
+ if [[ "$qps" == *"inf"* ]]; then
+ echo "qps was $qps"
+ qps=$num_prompts
+ echo "now qps is $qps"
+ fi
+
+ new_test_name=$test_name"_qps_"$qps
+ backend=$CURRENT_LLM_SERVING_ENGINE
+
+ if [[ "$backend" == *"vllm"* ]]; then
+ backend="vllm"
+ fi
+ #TODO: add output dir.
+ client_command="genai-perf profile \
+ -m $model \
+ --service-kind openai \
+ --backend vllm \
+ --endpoint-type chat \
+ --streaming \
+ --url localhost:$port \
+ --request-rate $qps \
+ --num-prompts $num_prompts \
+ "
+
+ echo "Client command: $client_command"
+
+ eval "$client_command"
+
+ #TODO: process/record outputs
+ done
+ done
+
+ kill_gpu_processes
+
+}
prepare_dataset() {
@@ -328,12 +426,17 @@ main() {
pip install -U transformers
+ pip install -r requirements-dev.txt
+ which genai-perf
+
# check storage
df -h
ensure_installed wget
ensure_installed curl
ensure_installed jq
+ # genai-perf dependency
+ ensure_installed libb64-0d
prepare_dataset
@@ -345,6 +448,10 @@ main() {
# run the test
run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
+ # run genai-perf tests
+ run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
+ mv artifacts/ $RESULTS_FOLDER/
+
# upload benchmark results to buildkite
python3 -m pip install tabulate pandas
python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
new file mode 100644
index 0000000000000..edbe9f2df0ce0
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
@@ -0,0 +1,23 @@
+[
+ {
+ "test_name": "llama8B_tp1_genai_perf",
+ "qps_list": [4,8,16,32],
+ "common_parameters": {
+ "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+ "tp": 1,
+ "port": 8000,
+ "num_prompts": 500,
+ "reuse_server": false
+ },
+ "vllm_server_parameters": {
+ "disable_log_stats": "",
+ "disable_log_requests": "",
+ "gpu_memory_utilization": 0.9,
+ "num_scheduler_steps": 10,
+ "max_num_seqs": 512,
+ "dtype": "bfloat16"
+ },
+ "genai_perf_input_parameters": {
+ }
+ }
+]
\ No newline at end of file
diff --git a/requirements-test.in b/requirements-test.in
index 4b4dc376d1fa5..bc76a91ad5356 100644
--- a/requirements-test.in
+++ b/requirements-test.in
@@ -29,4 +29,7 @@ lm-eval[api]==0.4.4 # required for model evaluation test
bitsandbytes>=0.45.0
buildkite-test-collector==0.1.9
+genai_perf==0.0.8
+tritonclient==2.51.0
+
numpy < 2.0.0
diff --git a/requirements-test.txt b/requirements-test.txt
index f576e42afcbbf..09e009c2e21f4 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -37,7 +37,7 @@ audioread==3.0.1
# via librosa
awscli==1.35.23
# via -r requirements-test.in
-bitsandbytes>=0.45.0
+bitsandbytes==0.45.0
# via -r requirements-test.in
black==24.10.0
# via datamodel-code-generator
@@ -75,6 +75,8 @@ colorama==0.4.6
# tqdm-multiprocess
contourpy==1.3.0
# via matplotlib
+cramjam==2.9.0
+ # via fastparquet
cupy-cuda12x==13.3.0
# via ray
cycler==0.12.1
@@ -109,6 +111,8 @@ email-validator==2.2.0
# via pydantic
evaluate==0.4.3
# via lm-eval
+fastparquet==2024.11.0
+ # via genai-perf
fastrlock==0.8.2
# via cupy-cuda12x
filelock==3.16.1
@@ -130,8 +134,11 @@ fsspec[http]==2024.9.0
# via
# datasets
# evaluate
+ # fastparquet
# huggingface-hub
# torch
+genai-perf==0.0.8
+ # via -r requirements-test.in
genson==1.3.0
# via datamodel-code-generator
h11==0.14.0
@@ -186,6 +193,8 @@ jsonschema==4.23.0
# ray
jsonschema-specifications==2024.10.1
# via jsonschema
+kaleido==0.2.1
+ # via genai-perf
kiwisolver==1.4.7
# via matplotlib
lazy-loader==0.4
@@ -200,6 +209,8 @@ lm-eval[api]==0.4.4
# via -r requirements-test.in
lxml==5.3.0
# via sacrebleu
+markdown-it-py==3.0.0
+ # via rich
markupsafe==3.0.2
# via jinja2
matplotlib==3.9.2
@@ -209,6 +220,8 @@ mbstrdecoder==1.1.3
# dataproperty
# pytablewriter
# typepy
+mdurl==0.1.2
+ # via markdown-it-py
mistral-common[opencv]==1.5.1
# via
# -r requirements-test.in
@@ -249,6 +262,8 @@ numpy==1.26.4
# datasets
# decord
# evaluate
+ # fastparquet
+ # genai-perf
# librosa
# matplotlib
# mistral-common
@@ -256,15 +271,18 @@ numpy==1.26.4
# numexpr
# opencv-python-headless
# pandas
+ # patsy
# peft
# rouge-score
# sacrebleu
# scikit-learn
# scipy
# soxr
+ # statsmodels
# tensorizer
# torchvision
# transformers
+ # tritonclient
nvidia-cublas-cu12==12.4.5.8
# via
# nvidia-cudnn-cu12
@@ -306,30 +324,39 @@ packaging==24.1
# datamodel-code-generator
# datasets
# evaluate
+ # fastparquet
# huggingface-hub
# lazy-loader
# matplotlib
# peft
+ # plotly
# pooch
# pytest
# pytest-rerunfailures
# ray
+ # statsmodels
# transformers
# typepy
pandas==2.2.3
# via
# datasets
# evaluate
+ # fastparquet
+ # genai-perf
+ # statsmodels
pathspec==0.12.1
# via black
pathvalidate==3.2.1
# via pytablewriter
+patsy==1.0.1
+ # via statsmodels
peft==0.13.2
# via
# -r requirements-test.in
# lm-eval
pillow==10.4.0
# via
+ # genai-perf
# matplotlib
# mistral-common
# sentence-transformers
@@ -338,6 +365,8 @@ platformdirs==4.3.6
# via
# black
# pooch
+plotly==5.24.1
+ # via genai-perf
pluggy==1.5.0
# via pytest
pooch==1.8.2
@@ -360,7 +389,9 @@ psutil==6.1.0
py==1.11.0
# via pytest-forked
pyarrow==18.0.0
- # via datasets
+ # via
+ # datasets
+ # genai-perf
pyasn1==0.6.1
# via rsa
pybind11==2.13.6
@@ -373,6 +404,8 @@ pydantic[email]==2.9.2
# mistral-common
pydantic-core==2.23.4
# via pydantic
+pygments==2.18.0
+ # via rich
pyparsing==3.2.0
# via matplotlib
pytablewriter==1.2.0
@@ -381,14 +414,18 @@ pytest==8.3.3
# via
# -r requirements-test.in
# buildkite-test-collector
+ # genai-perf
# pytest-asyncio
# pytest-forked
+ # pytest-mock
# pytest-rerunfailures
# pytest-shard
pytest-asyncio==0.24.0
# via -r requirements-test.in
pytest-forked==1.6.0
# via -r requirements-test.in
+pytest-mock==3.14.0
+ # via genai-perf
pytest-rerunfailures==14.0
# via -r requirements-test.in
pytest-shard==0.1.2
@@ -399,6 +436,8 @@ python-dateutil==2.9.0.post0
# matplotlib
# pandas
# typepy
+python-rapidjson==1.20
+ # via tritonclient
pytz==2024.2
# via
# pandas
@@ -409,9 +448,11 @@ pyyaml==6.0.2
# awscli
# datamodel-code-generator
# datasets
+ # genai-perf
# huggingface-hub
# peft
# ray
+ # responses
# timm
# transformers
ray[adag]==2.40.0
@@ -438,8 +479,13 @@ requests==2.32.3
# mistral-common
# pooch
# ray
+ # responses
# tiktoken
# transformers
+responses==0.25.3
+ # via genai-perf
+rich==13.9.4
+ # via genai-perf
rouge-score==0.1.2
# via lm-eval
rpds-py==0.20.1
@@ -470,6 +516,7 @@ scipy==1.13.1
# librosa
# scikit-learn
# sentence-transformers
+ # statsmodels
sentence-transformers==3.2.1
# via -r requirements-test.in
sentencepiece==0.2.0
@@ -490,6 +537,8 @@ soxr==0.5.0.post1
# via librosa
sqlitedict==2.1.0
# via lm-eval
+statsmodels==0.14.4
+ # via genai-perf
sympy==1.13.1
# via torch
tabledata==1.3.3
@@ -499,7 +548,9 @@ tabulate==0.9.0
tcolorpy==0.1.6
# via pytablewriter
tenacity==9.0.0
- # via lm-eval
+ # via
+ # lm-eval
+ # plotly
tensorizer==2.9.0
# via -r requirements-test.in
threadpoolctl==3.5.0
@@ -540,6 +591,7 @@ tqdm-multiprocess==0.0.11
# via lm-eval
transformers==4.47.0
# via
+ # genai-perf
# lm-eval
# peft
# sentence-transformers
@@ -548,6 +600,10 @@ transformers-stream-generator==0.0.5
# via -r requirements-test.in
triton==3.1.0
# via torch
+tritonclient==2.51.0
+ # via
+ # -r requirements-test.in
+ # genai-perf
typepy[datetime]==1.3.2
# via
# dataproperty
@@ -555,6 +611,7 @@ typepy[datetime]==1.3.2
# tabledata
typing-extensions==4.12.2
# via
+ # bitsandbytes
# huggingface-hub
# librosa
# mistral-common
@@ -563,10 +620,12 @@ typing-extensions==4.12.2
# torch
tzdata==2024.2
# via pandas
-urllib3==1.26.20
+urllib3==2.2.3
# via
# botocore
# requests
+ # responses
+ # tritonclient
word2number==1.1
# via lm-eval
xxhash==3.5.0
From 1475847a14e3693128fcc4f8740493d12074ed93 Mon Sep 17 00:00:00 2001
From: Yuan Tang
Date: Thu, 16 Jan 2025 23:45:36 -0500
Subject: [PATCH 49/67] [Doc] Add instructions on using Podman when SELinux is
active (#12136)
Signed-off-by: Yuan Tang
---
docs/source/deployment/docker.md | 3 +++
1 file changed, 3 insertions(+)
diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md
index 2606e2765c1ae..438be47316f3b 100644
--- a/docs/source/deployment/docker.md
+++ b/docs/source/deployment/docker.md
@@ -42,6 +42,9 @@ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai
By default vLLM will build for all GPU types for widest distribution. If you are just building for the
current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""`
for vLLM to find the current GPU type and build for that.
+
+If you are using Podman instead of Docker, you might need to disable SELinux labeling by
+adding `--security-opt label=disable` when running the `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184).
```
## Building for Arm64/aarch64
From b8bfa46a18abe0bf9f48a29e1e8dd2bc1a79af98 Mon Sep 17 00:00:00 2001
From: Yuan Tang
Date: Thu, 16 Jan 2025 23:54:01 -0500
Subject: [PATCH 50/67] [Bugfix] Fix issues in CPU build Dockerfile (#12135)
Signed-off-by: Yuan Tang
---
Dockerfile.cpu | 6 +++---
setup.py | 10 +++-------
2 files changed, 6 insertions(+), 10 deletions(-)
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index f163edc27cba8..ebe226cf6d148 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -26,10 +26,10 @@ RUN pip install intel_extension_for_pytorch==2.5.0
WORKDIR /workspace
-COPY requirements-build.txt requirements-build.txt
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
+ --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
pip install --upgrade pip && \
pip install -r requirements-build.txt
@@ -37,9 +37,9 @@ FROM cpu-test-1 AS build
WORKDIR /workspace/vllm
-COPY requirements-common.txt requirements-common.txt
-COPY requirements-cpu.txt requirements-cpu.txt
RUN --mount=type=cache,target=/root/.cache/pip \
+ --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
+ --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
pip install -v -r requirements-cpu.txt
COPY . .
diff --git a/setup.py b/setup.py
index 7dfcec7f9f0c5..978625a069778 100644
--- a/setup.py
+++ b/setup.py
@@ -472,13 +472,9 @@ def get_gaudi_sw_version():
def get_vllm_version() -> str:
- # TODO: Revisit this temporary approach: https://github.com/vllm-project/vllm/issues/9182#issuecomment-2404860236
- try:
- version = get_version(
- write_to="vllm/_version.py", # TODO: move this to pyproject.toml
- )
- except LookupError:
- version = "0.0.0"
+ version = get_version(
+ write_to="vllm/_version.py", # TODO: move this to pyproject.toml
+ )
sep = "+" if "+" not in version else "." # dev versions might contain +
From d1adb9b4032dd430bb28b8e91feb8164c3a1ca9c Mon Sep 17 00:00:00 2001
From: Chen Zhang
Date: Fri, 17 Jan 2025 13:33:22 +0800
Subject: [PATCH 51/67] [BugFix] add more `is not None` check in
VllmConfig.__post_init__ (#12138)
Signed-off-by: Chen Zhang
---
vllm/config.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/vllm/config.py b/vllm/config.py
index 79754bd04102f..ac5a4c91b1738 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3174,7 +3174,8 @@ def __post_init__(self):
if self.compilation_config is None:
self.compilation_config = CompilationConfig()
- if envs.VLLM_USE_V1 and not self.model_config.enforce_eager:
+ if envs.VLLM_USE_V1 and self.model_config is not None and \
+ not self.model_config.enforce_eager:
# NOTE(woosuk): Currently, we use inductor because the piecewise
# CUDA graphs do not work properly with the custom CUDA kernels.
# FIXME(woosuk): Disable inductor to reduce the compilation time
From d75ab55f1035309c96814af46da1c5166209854b Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Fri, 17 Jan 2025 14:34:48 +0800
Subject: [PATCH 52/67] [Misc] Add deepseek_vl2 chat template (#12143)
Signed-off-by: Isotr0py <2037008807@qq.com>
---
examples/template_deepseek_vl2.jinja | 23 +++++++++++++++++++++++
tests/entrypoints/test_chat_utils.py | 1 +
2 files changed, 24 insertions(+)
create mode 100644 examples/template_deepseek_vl2.jinja
diff --git a/examples/template_deepseek_vl2.jinja b/examples/template_deepseek_vl2.jinja
new file mode 100644
index 0000000000000..fbf3d320094d5
--- /dev/null
+++ b/examples/template_deepseek_vl2.jinja
@@ -0,0 +1,23 @@
+{%- if messages[0]['role'] == 'system' -%}
+ {%- set system_message = messages[0]['content'] -%}
+ {%- set messages = messages[1:] -%}
+{%- else -%}
+ {% set system_message = '' -%}
+{%- endif -%}
+
+{{ bos_token + system_message }}
+{%- for message in messages -%}
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+ {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+ {%- endif -%}
+
+ {%- if message['role'] == 'user' -%}
+ {{ '<|User|>: ' + message['content'] + '\n' }}
+ {%- elif message['role'] == 'assistant' -%}
+ {{ '<|Assistant|>: ' + message['content'] + eos_token + '\n' }}
+ {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+ {{ '<|Assistant|>: ' }}
+{% endif %}
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 8f242df4a60e3..513b466c10d60 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -754,6 +754,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
("template_chatglm.jinja", "string"),
("template_chatglm2.jinja", "string"),
("template_chatml.jinja", "string"),
+ ("template_deepseek_vl2.jinja", "string"),
("template_falcon_180b.jinja", "string"),
("template_falcon.jinja", "string"),
("template_inkbot.jinja", "string"),
From 8027a724611353d2ff3a504f91c5607e94f635b0 Mon Sep 17 00:00:00 2001
From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com>
Date: Fri, 17 Jan 2025 00:49:16 -0600
Subject: [PATCH 53/67] [ROCm][MoE] moe tuning support for rocm (#12049)
Signed-off-by: Divakar Verma
---
benchmarks/kernels/benchmark_moe.py | 272 +++++++++++++++++++++++-----
1 file changed, 224 insertions(+), 48 deletions(-)
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 8f538c21f7f7e..1d59a01422412 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -1,6 +1,7 @@
import argparse
import time
from datetime import datetime
+from itertools import product
from typing import Any, Dict, List, Tuple, TypedDict
import ray
@@ -11,7 +12,10 @@
from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser, is_navi
+
+FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm(
+) and not is_navi() else torch.float8_e4m3fn
class BenchmarkConfig(TypedDict):
@@ -80,8 +84,8 @@ def benchmark_config(
a1_scale = torch.randn(1, dtype=torch.float32)
a2_scale = torch.randn(1, dtype=torch.float32)
- w1 = w1.to(torch.float8_e4m3fn)
- w2 = w2.to(torch.float8_e4m3fn)
+ w1 = w1.to(FP8_DTYPE)
+ w2 = w2.to(FP8_DTYPE)
input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)
@@ -141,28 +145,172 @@ def run():
return avg
-def get_configs_compute_bound() -> List[Dict[str, int]]:
- # Reduced search space for faster tuning.
- # TODO(woosuk): Increase the search space and use a performance model to
- # prune the search space.
+def get_rocm_tuning_space(use_fp16):
+ block_mn_range = [16, 32, 64, 128, 256]
+ block_k_range = [16, 32, 64, 128, 256]
+ if not use_fp16:
+ block_k_range.remove(16) # BLOCK_K=16 not supported for fp8
+ num_warps_range = [1, 2, 4, 8]
+ group_m_range = [1, 4, 8, 16, 32]
+ num_stage_range = [2]
+ waves_per_eu_range = [0]
+ matrix_instr_nonkdim_range = [16, 32] if use_fp16 else []
+ kpack_range = [1, 2] if use_fp16 else []
+
+ param_ranges = {
+ "BLOCK_SIZE_M": block_mn_range,
+ "BLOCK_SIZE_N": block_mn_range,
+ "BLOCK_SIZE_K": block_k_range,
+ "GROUP_SIZE_M": group_m_range,
+ "num_warps": num_warps_range,
+ "num_stages": num_stage_range,
+ "waves_per_eu": waves_per_eu_range,
+ }
+ if use_fp16:
+ param_ranges["matrix_instr_nonkdim"] = matrix_instr_nonkdim_range
+ param_ranges["kpack"] = kpack_range
+
+ return param_ranges
+
+
+def get_configs_compute_bound(use_fp16) -> List[Dict[str, int]]:
configs: List[BenchmarkConfig] = []
- for num_stages in [2, 3, 4, 5]:
- for block_m in [16, 32, 64, 128, 256]:
- for block_k in [64, 128, 256]:
- for block_n in [32, 64, 128, 256]:
- for num_warps in [4, 8]:
- for group_size in [1, 16, 32, 64]:
- configs.append({
- "BLOCK_SIZE_M": block_m,
- "BLOCK_SIZE_N": block_n,
- "BLOCK_SIZE_K": block_k,
- "GROUP_SIZE_M": group_size,
- "num_warps": num_warps,
- "num_stages": num_stages,
- })
+
+ if current_platform.is_rocm():
+ param_ranges = get_rocm_tuning_space(use_fp16)
+ else:
+ # Reduced search space for faster tuning.
+ # TODO(woosuk): Increase the search space and use a performance model to
+ # prune the search space.
+ block_m_range = [16, 32, 64, 128, 256]
+ block_n_range = [32, 64, 128, 256]
+ block_k_range = [64, 128, 256]
+ num_warps_range = [4, 8]
+ group_m_range = [1, 16, 32, 64]
+ num_stage_range = [2, 3, 4, 5]
+
+ param_ranges = {
+ "BLOCK_SIZE_M": block_m_range,
+ "BLOCK_SIZE_N": block_n_range,
+ "BLOCK_SIZE_K": block_k_range,
+ "GROUP_SIZE_M": group_m_range,
+ "num_warps": num_warps_range,
+ "num_stages": num_stage_range,
+ }
+
+ keys, values = zip(*param_ranges.items())
+ for config_values in product(*values):
+ config = dict(zip(keys, config_values))
+ configs.append(config)
return configs
+def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size,
+ search_space, is_fp16):
+ N1, K1 = shard_intermediate_size, hidden_size
+ N2, K2 = hidden_size, shard_intermediate_size // 2
+ pruned_space_1 = prune_rocm_configs(num_tokens * 2, N1, K1, search_space,
+ is_fp16)
+ pruned_space_2 = prune_rocm_configs(num_tokens * 2, N2, K2, search_space,
+ is_fp16)
+ search_space = merge_unique_dicts(pruned_space_1, pruned_space_2)
+ return search_space
+
+
+# The following code is inspired by ROCm/Triton GEMM tuning script:
+# https://github.com/ROCm/triton/blob/triton-mlir/scripts/amd/gemm/tune_gemm.py#L89
+def prune_rocm_configs(M, N, K, configs, is_fp16=True):
+ pruned_configs = []
+ elemBytes_a = 2 if is_fp16 else 1
+ elemBytes_b = 2 if is_fp16 else 1
+
+ mfma = 16 if M < 32 or N < 32 else 32
+
+ # TODO (zhanglx): figure out the boundary between large and small gemms
+ large_gemm = False
+ if M >= 2048 and N >= 2048:
+ large_gemm = True
+
+ for config in configs:
+ BLOCK_SIZE_M = config.get("BLOCK_SIZE_M")
+ BLOCK_SIZE_N = config.get("BLOCK_SIZE_N")
+ BLOCK_SIZE_K = config.get("BLOCK_SIZE_K")
+ num_warps = config.get("num_warps")
+
+ if is_fp16:
+ matrix_instr_nonkdim = config.get("matrix_instr_nonkdim")
+ if matrix_instr_nonkdim > mfma:
+ continue
+ if mfma == 4 and BLOCK_SIZE_K < 64:
+ continue
+ # some layouts could not work properly when the
+ # number of elements per thread is less than 1
+ if BLOCK_SIZE_M * BLOCK_SIZE_N < 64:
+ continue
+ SPLIT_K = config.get("SPLIT_K", 1)
+ GROUP_M = config.get("GROUP_SIZE_M")
+ if is_fp16:
+ if (matrix_instr_nonkdim > BLOCK_SIZE_M
+ or matrix_instr_nonkdim > BLOCK_SIZE_N):
+ continue
+ if (matrix_instr_nonkdim >= M
+ and matrix_instr_nonkdim != BLOCK_SIZE_M):
+ continue
+ if (matrix_instr_nonkdim >= N
+ and matrix_instr_nonkdim != BLOCK_SIZE_N):
+ continue
+ # Skip BLOCK_SIZE that is too large compared to M/N
+ # unless BLOCK_SIZE is already small enough
+ if M * 2 < BLOCK_SIZE_M and BLOCK_SIZE_M != 16:
+ continue
+ if N * 2 < BLOCK_SIZE_N and BLOCK_SIZE_N != 16:
+ continue
+ # skip large split_k when not necessary
+ if SPLIT_K != 1 and not need_split_k(M, N, K):
+ continue
+ # skip split_k that leads to EVEN_K = false
+ leap = SPLIT_K * BLOCK_SIZE_K
+ modv = K % leap
+ if modv != 0:
+ continue
+ # skip large GROUP_M
+ if GROUP_M * BLOCK_SIZE_M > M and GROUP_M != 1:
+ continue
+ # out of shared memory resource
+ # TODO (zhanglx): This does not consider the LDS usage in the epilogue
+ LDS = (BLOCK_SIZE_K * BLOCK_SIZE_M * elemBytes_a +
+ BLOCK_SIZE_K * BLOCK_SIZE_N * elemBytes_b)
+ if LDS > 65536:
+ continue
+ # Skip small block sizes and num_warps for large gemm
+ # For fp16 and fp8, we want to only use BLOCK_SIZE >= 64
+ if large_gemm:
+ if BLOCK_SIZE_M < 64 or BLOCK_SIZE_N < 64:
+ continue
+ if BLOCK_SIZE_K < 64:
+ continue
+ if num_warps < 4:
+ continue
+
+ pruned_configs.append(config)
+
+ return pruned_configs
+
+
+def need_split_k(SIZE_M, SIZE_N, SIZE_K):
+ return (SIZE_M < 64 or SIZE_N < 64) and SIZE_K > 1024
+
+
+def merge_unique_dicts(list1, list2):
+ result = []
+ combined_list = list1.copy()
+ combined_list.extend(list2)
+ for dictionary in combined_list:
+ if dictionary not in result:
+ result.append(dictionary)
+ return result
+
+
@ray.remote(num_gpus=1)
class BenchmarkWorker:
@@ -170,6 +318,10 @@ def __init__(self, seed: int) -> None:
torch.set_default_device("cuda")
current_platform.seed_everything(seed)
self.seed = seed
+ # Get the device ID to allocate tensors and kernels
+ # on the respective GPU. This is required for Ray to work
+ # correctly with multi-GPU tuning on the ROCm platform.
+ self.device_id = int(ray.get_gpu_ids()[0])
def benchmark(
self,
@@ -217,25 +369,33 @@ def tune(
) -> Dict[str, int]:
best_config = None
best_time = float("inf")
- for config in tqdm(search_space):
- try:
- kernel_time = benchmark_config(config,
- num_tokens,
- num_experts,
- shard_intermediate_size,
- hidden_size,
- topk,
- dtype,
- use_fp8_w8a8,
- use_int8_w8a16,
- num_iters=10)
- except triton.runtime.autotuner.OutOfResources:
- # Some configurations may be invalid and fail to compile.
- continue
-
- if kernel_time < best_time:
- best_time = kernel_time
- best_config = config
+ if current_platform.is_rocm():
+ is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
+ search_space = prune_rocm_search_space(num_tokens,
+ shard_intermediate_size,
+ hidden_size, search_space,
+ is_fp16)
+
+ with torch.cuda.device(self.device_id):
+ for config in tqdm(search_space):
+ try:
+ kernel_time = benchmark_config(config,
+ num_tokens,
+ num_experts,
+ shard_intermediate_size,
+ hidden_size,
+ topk,
+ dtype,
+ use_fp8_w8a8,
+ use_int8_w8a16,
+ num_iters=20)
+ except triton.runtime.autotuner.OutOfResources:
+ # Some configurations may be invalid and fail to compile.
+ continue
+
+ if kernel_time < best_time:
+ best_time = kernel_time
+ best_config = config
now = datetime.now()
print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
assert best_config is not None
@@ -244,12 +404,27 @@ def tune(
def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
return {
- "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
- "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
- "BLOCK_SIZE_K": config["BLOCK_SIZE_K"],
- "GROUP_SIZE_M": config["GROUP_SIZE_M"],
- "num_warps": config["num_warps"],
- "num_stages": config["num_stages"],
+ "BLOCK_SIZE_M":
+ config["BLOCK_SIZE_M"],
+ "BLOCK_SIZE_N":
+ config["BLOCK_SIZE_N"],
+ "BLOCK_SIZE_K":
+ config["BLOCK_SIZE_K"],
+ "GROUP_SIZE_M":
+ config["GROUP_SIZE_M"],
+ "num_warps":
+ config["num_warps"],
+ "num_stages":
+ config["num_stages"],
+ **({
+ "waves_per_eu": config["waves_per_eu"]
+ } if "waves_per_eu" in config else {}),
+ **({
+ "matrix_instr_nonkdim": config["matrix_instr_nonkdim"]
+ } if "matrix_instr_nonkdim" in config else {}),
+ **({
+ "kpack": config["kpack"]
+ } if "kpack" in config else {}),
}
@@ -294,7 +469,7 @@ def main(args: argparse.Namespace):
shard_intermediate_size = 2 * intermediate_size // args.tp_size
hidden_size = config.hidden_size
- dtype = config.torch_dtype
+ dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
@@ -322,7 +497,8 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]:
return ray.get(outputs)
if args.tune:
- search_space = get_configs_compute_bound()
+ is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
+ search_space = get_configs_compute_bound(is_fp16)
print(f"Start tuning over {len(search_space)} configurations...")
start = time.time()
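Editor's note (not part of the patch): the new tuning flow boils down to two steps: enumerate the Cartesian product of the parameter ranges (itertools.product, as in get_configs_compute_bound) and then discard configurations that cannot work, such as those whose tiles exceed the 64 KiB LDS budget checked in prune_rocm_configs. The standalone sketch below illustrates both steps, with assumed ranges and an assumed 1-byte element size for fp8.

from itertools import product

param_ranges = {
    "BLOCK_SIZE_M": [16, 32, 64, 128, 256],
    "BLOCK_SIZE_N": [16, 32, 64, 128, 256],
    "BLOCK_SIZE_K": [32, 64, 128, 256],  # fp8 path drops BLOCK_K=16
    "GROUP_SIZE_M": [1, 4, 8, 16, 32],
    "num_warps": [1, 2, 4, 8],
    "num_stages": [2],
    "waves_per_eu": [0],
}

# Cartesian product over all parameter ranges.
keys, values = zip(*param_ranges.items())
configs = [dict(zip(keys, vals)) for vals in product(*values)]

# Keep only configs whose A and B tiles fit in 64 KiB of LDS,
# assuming 1 byte per element (fp8); fp16 would use 2 bytes.
elem_bytes = 1
pruned = [
    c for c in configs
    if c["BLOCK_SIZE_K"] * (c["BLOCK_SIZE_M"] + c["BLOCK_SIZE_N"]) * elem_bytes <= 65536
]
print(f"{len(configs)} raw configs, {len(pruned)} fit in LDS")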
From 69d765f5a5bbbe1ea9843be19b9480660fc5bc8b Mon Sep 17 00:00:00 2001
From: Chen Zhang
Date: Fri, 17 Jan 2025 15:39:35 +0800
Subject: [PATCH 54/67] [V1] Move more control of kv cache initialization from
model_executor to EngineCore (#11960)
Signed-off-by: Chen Zhang
Co-authored-by: Cody Yu
---
tests/v1/test_utils.py | 62 +++++++++++++
vllm/attention/layer.py | 2 +
vllm/v1/core/kv_cache_utils.py | 124 +++++++++++++++++++++++++
vllm/v1/engine/core.py | 31 ++++---
vllm/v1/executor/abstract.py | 11 ++-
vllm/v1/executor/multiproc_executor.py | 25 +++--
vllm/v1/executor/ray_executor.py | 40 ++++----
vllm/v1/executor/uniproc_executor.py | 25 ++---
vllm/v1/kv_cache_interface.py | 111 ++++++++++++++++++++++
vllm/v1/utils.py | 56 ++++++++++-
vllm/v1/worker/gpu_model_runner.py | 84 ++++++++++++++---
vllm/v1/worker/gpu_worker.py | 48 +++-------
12 files changed, 515 insertions(+), 104 deletions(-)
create mode 100644 tests/v1/test_utils.py
create mode 100644 vllm/v1/kv_cache_interface.py
diff --git a/tests/v1/test_utils.py b/tests/v1/test_utils.py
new file mode 100644
index 0000000000000..ac773b611f406
--- /dev/null
+++ b/tests/v1/test_utils.py
@@ -0,0 +1,62 @@
+from typing import List
+
+import torch
+
+from vllm.v1.utils import bind_kv_cache
+
+
+def test_bind_kv_cache():
+ from vllm.attention import Attention
+
+ ctx = {
+ 'layers.0.self_attn': Attention(32, 128, 0.1),
+ 'layers.1.self_attn': Attention(32, 128, 0.1),
+ 'layers.2.self_attn': Attention(32, 128, 0.1),
+ 'layers.3.self_attn': Attention(32, 128, 0.1),
+ }
+ kv_cache = {
+ 'layers.0.self_attn': torch.zeros((1, )),
+ 'layers.1.self_attn': torch.zeros((1, )),
+ 'layers.2.self_attn': torch.zeros((1, )),
+ 'layers.3.self_attn': torch.zeros((1, )),
+ }
+ runner_kv_caches: List[torch.Tensor] = []
+ bind_kv_cache(kv_cache, ctx, runner_kv_caches)
+ assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[
+ 'layers.0.self_attn']
+ assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[
+ 'layers.1.self_attn']
+ assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[
+ 'layers.2.self_attn']
+ assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[
+ 'layers.3.self_attn']
+
+ assert runner_kv_caches[0] is kv_cache['layers.0.self_attn']
+ assert runner_kv_caches[1] is kv_cache['layers.1.self_attn']
+ assert runner_kv_caches[2] is kv_cache['layers.2.self_attn']
+ assert runner_kv_caches[3] is kv_cache['layers.3.self_attn']
+
+
+def test_bind_kv_cache_non_attention():
+ from vllm.attention import Attention
+
+ # example from Jamba PP=2
+ ctx = {
+ 'model.layers.20.attn': Attention(32, 128, 0.1),
+ 'model.layers.28.attn': Attention(32, 128, 0.1),
+ }
+ kv_cache = {
+ 'model.layers.20.attn': torch.zeros((1, )),
+ 'model.layers.28.attn': torch.zeros((1, )),
+ }
+
+ runner_kv_caches: List[torch.Tensor] = []
+ bind_kv_cache(kv_cache, ctx, runner_kv_caches)
+
+ assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[
+ 'model.layers.20.attn']
+ assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[
+ 'model.layers.28.attn']
+
+ assert runner_kv_caches[0] is kv_cache['model.layers.20.attn']
+ assert runner_kv_caches[1] is kv_cache['model.layers.28.attn']
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 9b03fd73fe690..e2403306950a3 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -101,7 +101,9 @@ def __init__(
self.num_heads = num_heads
self.head_size = head_size
self.num_kv_heads = num_kv_heads
+ self.sliding_window = sliding_window
self.backend = backend_name_to_enum(attn_backend.get_name())
+ self.dtype = dtype
# For cuda-alike (CUDA and ROCM) and cpu platforms, we control how
# torch.compile works by registering the attention as one giant
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 22a5d2fb08a48..bab99fe37caee 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -3,7 +3,10 @@
from dataclasses import dataclass
from typing import Any, List, NamedTuple, Optional, Tuple
+from vllm.config import VllmConfig
from vllm.logger import init_logger
+from vllm.v1.kv_cache_interface import (KVCacheConfig, KVCacheSpec,
+ KVCacheTensor)
from vllm.v1.request import Request
logger = init_logger(__name__)
@@ -305,3 +308,124 @@ def hash_request_tokens(block_size: int,
ret.append(block_hash)
parent_block_hash_value = block_hash.hash_value
return ret
+
+
+def check_enough_kv_cache_memory(vllm_config: VllmConfig,
+ kv_cache_spec: KVCacheSpec,
+ available_memory: int):
+ """
+ Checks whether `available_memory` is enough for the KV cache to hold at
+ least one request with the model's max_model_len.
+
+ Args:
+ vllm_config: The global VllmConfig
+ kv_cache_spec: The kv cache spec of the model
+ available_memory: Memory available for KV cache in bytes.
+
+ Raises:
+ ValueError: If there is not enough memory available for the KV cache.
+ """
+
+ if available_memory <= 0:
+ raise ValueError("No available memory for the cache blocks. "
+ "Try increasing `gpu_memory_utilization` when "
+ "initializing the engine.")
+
+ max_model_len = vllm_config.model_config.max_model_len
+ needed_memory = 0
+ for layer_spec in kv_cache_spec.values():
+ needed_memory += layer_spec.bytes_for_tokens(max_model_len)
+
+ if needed_memory > available_memory:
+ raise ValueError(
+ f"To serve at least one request with the models's max seq len "
+ f"({max_model_len}), ({needed_memory/1024/1024/1024:.2f} GB KV "
+ f"cache is needed, which is larger than the available KV cache "
+ f"memory ({available_memory/1024/1024/1024:.2f} GB). Try "
+ f"increasing `gpu_memory_utilization` or decreasing "
+ f"`max_model_len` when initializing the engine.")
+
+
+def is_kv_cache_type_uniform(kv_cache_spec: KVCacheSpec) -> bool:
+ """
+ Whether all layers in the given KVCacheSpec have the same type of KV cache.
+
+ Args:
+ kv_cache_spec: The KVCacheSpec of the model
+
+ Returns:
+ True if all layers have the same type, False otherwise.
+ """
+
+ layer_keys = set(layer.type_id for layer in kv_cache_spec.values())
+ return len(layer_keys) == 1
+
+
+def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
+ kv_cache_spec: KVCacheSpec,
+ available_memory: int) -> KVCacheConfig:
+ """
+ Generates the KV cache configuration for a model with one type of KV cache.
+ Divide the available memory equally among all layers.
+
+ Args:
+ vllm_config: The global VllmConfig
+ kv_cache_spec: The kv cache spec of the model
+ available_memory: Memory available for KV cache in bytes.
+
+ Returns:
+ The generated KVCacheConfig
+ """
+
+ page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()}
+ assert len(page_sizes) == 1
+ page_size = page_sizes.pop()
+
+ num_blocks = int(available_memory // page_size // len(kv_cache_spec))
+ num_blocks = max(num_blocks, 0)
+
+ if vllm_config.cache_config.num_gpu_blocks_override is not None:
+ num_gpu_blocks_override = \
+ vllm_config.cache_config.num_gpu_blocks_override
+ logger.info(
+ "Overriding num_gpu_blocks=%d with "
+ "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override)
+ num_blocks = num_gpu_blocks_override
+
+ logger.info("# GPU blocks: %d", num_blocks)
+
+ per_layer_size = page_size * num_blocks
+
+ kv_cache_config = KVCacheConfig(
+ num_blocks=num_blocks,
+ tensors={
+ layer_name: KVCacheTensor(size=per_layer_size)
+ for layer_name in kv_cache_spec
+ },
+ groups=[[layer_name for layer_name in kv_cache_spec]],
+ kv_cache_spec=kv_cache_spec)
+ return kv_cache_config
+
+
+def get_kv_cache_config(vllm_config: VllmConfig, kv_cache_spec: KVCacheSpec,
+ available_memory: int) -> KVCacheConfig:
+ """
+ Generates the KV cache configuration for a model
+ TODO: support hybrid models with more than one type of KV cache.
+
+ Args:
+ vllm_config: The global VllmConfig
+ kv_cache_spec: The kv cache spec of the model
+ available_memory: Memory available for KV cache in bytes.
+
+ Returns:
+ The generated KVCacheConfig
+ """
+ check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory)
+ if is_kv_cache_type_uniform(kv_cache_spec):
+ # All layers have the same type of KV cache, which is true for most models.
+ # Allocate the same amount of memory for each layer.
+ return _get_kv_cache_config_uniform_type(vllm_config, kv_cache_spec,
+ available_memory)
+ else:
+ raise NotImplementedError
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index ef616229aa57b..26ebc7edcf03e 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -11,11 +11,12 @@
import zmq.asyncio
from msgspec import msgpack
-from vllm.config import CacheConfig, VllmConfig
+from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.transformers_utils.config import (
maybe_register_config_serialize_by_value)
from vllm.utils import get_exception_traceback, zmq_socket_ctx
+from vllm.v1.core.kv_cache_utils import get_kv_cache_config
from vllm.v1.core.scheduler import Scheduler
from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile,
EngineCoreRequest, EngineCoreRequestType,
@@ -49,7 +50,7 @@ def __init__(
# Setup KV Caches and update CacheConfig after profiling.
num_gpu_blocks, num_cpu_blocks = self._initialize_kv_caches(
- vllm_config.cache_config)
+ vllm_config)
vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks
vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks
@@ -65,21 +66,25 @@ def __init__(
vllm_config.model_config)
def _initialize_kv_caches(self,
- cache_config: CacheConfig) -> Tuple[int, int]:
+ vllm_config: VllmConfig) -> Tuple[int, int]:
start = time.time()
- num_gpu_blocks, _ = self.model_executor.determine_num_available_blocks(
- )
- if cache_config.num_gpu_blocks_override is not None:
- num_gpu_blocks_override = cache_config.num_gpu_blocks_override
- logger.info(
- "Overriding num_gpu_blocks=%d with "
- "num_gpu_blocks_override=%d", num_gpu_blocks,
- num_gpu_blocks_override)
- num_gpu_blocks = num_gpu_blocks_override
+ # Get all kv cache needed by the model
+ kv_cache_spec = self.model_executor.get_kv_cache_spec()
+
+ # Profiles the peak memory usage of the model to determine how much
+ # memory can be allocated for kv cache.
+ available_gpu_memory = self.model_executor.determine_available_memory()
+ # Get the kv cache tensor size
+ kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
+ available_gpu_memory)
+ num_gpu_blocks = kv_cache_config.num_blocks
num_cpu_blocks = 0
- self.model_executor.initialize(num_gpu_blocks)
+
+ # Initialize kv cache and warmup the execution
+ self.model_executor.initialize(kv_cache_config)
+
elapsed = time.time() - start
logger.info(("init engine (profile, create kv cache, "
"warmup model) took %.2f seconds"), elapsed)
diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py
index 7c17f60510ae1..5240778ebf330 100644
--- a/vllm/v1/executor/abstract.py
+++ b/vllm/v1/executor/abstract.py
@@ -1,7 +1,8 @@
from abc import ABC, abstractmethod
-from typing import Tuple, Type
+from typing import Type
from vllm.config import VllmConfig
+from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
from vllm.v1.outputs import ModelRunnerOutput
@@ -31,11 +32,15 @@ def __init__(self, vllm_config: VllmConfig) -> None:
raise NotImplementedError
@abstractmethod
- def initialize(self, num_gpu_blocks: int) -> None:
+ def initialize(self, kv_cache_config: KVCacheConfig) -> None:
raise NotImplementedError
@abstractmethod
- def determine_num_available_blocks(self) -> Tuple[int, int]:
+ def determine_available_memory(self) -> int: # in bytes
+ raise NotImplementedError
+
+ @abstractmethod
+ def get_kv_cache_spec(self) -> KVCacheSpec:
raise NotImplementedError
@abstractmethod
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index e111ac7ee8183..e92acc7cb5e41 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -23,6 +23,7 @@
from vllm.utils import (get_distributed_init_method, get_mp_context,
get_open_port, get_open_zmq_ipc_path, zmq_socket_ctx)
from vllm.v1.executor.abstract import Executor
+from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
from vllm.v1.outputs import ModelRunnerOutput
from vllm.worker.worker_base import WorkerWrapperBase
@@ -90,29 +91,33 @@ def sigusr1_handler(signum, frame):
for w in self.workers:
w.worker_response_mq.wait_until_ready()
- def initialize(self, num_gpu_blocks: int) -> None:
+ def initialize(self, kv_cache_config: KVCacheConfig) -> None:
"""
Initialize the KV caches and begin the model execution loop of the
underlying workers.
"""
- logger.info("# GPU blocks: %d", num_gpu_blocks)
- self.collective_rpc("initialize_cache", args=(num_gpu_blocks, ))
+ self.collective_rpc("initialize_cache", args=(kv_cache_config, ))
self.collective_rpc("compile_or_warm_up_model")
- def determine_num_available_blocks(self) -> Tuple[int, int]:
+ def determine_available_memory(self) -> int:
"""
- Determine the number of available KV blocks by invoking the
+ Determine the available memory (in bytes) for KV cache by invoking the
underlying worker.
"""
- num_blocks = self.collective_rpc("determine_num_available_blocks")
+ memory_sizes = self.collective_rpc("determine_available_memory")
# Since we use a shared centralized controller, we take the minimum
- # number of blocks across all workers to make sure all the memory
+ # memory size across all workers to make sure all the memory
# operators can be applied to all workers.
- num_gpu_blocks = min(b[0] for b in num_blocks)
- num_cpu_blocks = min(b[1] for b in num_blocks)
+ return min(memory_sizes)
- return num_gpu_blocks, num_cpu_blocks
+ def get_kv_cache_spec(self) -> KVCacheSpec:
+ """
+ Get all kv cache needed by the model by invoking the underlying worker.
+ """
+ kv_cache_specs = self.collective_rpc("get_kv_cache_spec")
+ assert all(s == kv_cache_specs[0] for s in kv_cache_specs)
+ return kv_cache_specs[0]
def collective_rpc(self,
method: str,
diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py
index 79acc60001c99..fd67fa2235770 100644
--- a/vllm/v1/executor/ray_executor.py
+++ b/vllm/v1/executor/ray_executor.py
@@ -10,6 +10,7 @@
from vllm.v1.executor.abstract import Executor
from vllm.v1.executor.ray_utils import (RayWorkerWrapper,
initialize_ray_cluster, ray)
+from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
from vllm.v1.outputs import ModelRunnerOutput
if ray is not None:
@@ -211,39 +212,40 @@ def _get_worker_kwargs(
distributed_init_method=distributed_init_method,
)
- def determine_num_available_blocks(self) -> Tuple[int, int]:
+ def determine_available_memory(self) -> int:
"""
- Determine the number of available KV blocks.
+ Determine the available GPU memory in bytes.
- This invokes `determine_num_available_blocks` on each worker and takes
+ This invokes `determine_available_memory` on each worker and takes
the min of the results, guaranteeing that the selected cache sizes are
compatible with all workers.
-
- Returns:
- - tuple[num_gpu_blocks, num_cpu_blocks]
"""
- # Get the maximum number of blocks that can be allocated on GPU and CPU.
- num_blocks = self._run_workers("determine_num_available_blocks")
+
+ memory_sizes = self._run_workers("determine_available_memory")
# Since we use a shared centralized controller, we take the minimum
- # number of blocks across all workers to make sure all the memory
+ # memory size across all workers to make sure all the memory
# operators can be applied to all workers.
- num_gpu_blocks = min(b[0] for b in num_blocks)
- num_cpu_blocks = min(b[1] for b in num_blocks)
+ return min(memory_sizes)
- return num_gpu_blocks, num_cpu_blocks
-
- def initialize(self, num_gpu_blocks: int) -> None:
+ def initialize(self, kv_cache_config: KVCacheConfig) -> None:
"""
Initialize the KV cache in all workers.
"""
- # NOTE: This is logged in the executor because there can be >1 worker
- # with other executors. We could log in the engine level, but work
- # remains to abstract away the device for non-GPU configurations.
- logger.info("# GPU blocks: %d", num_gpu_blocks)
- self._run_workers("initialize_cache", num_gpu_blocks)
+ self._run_workers("initialize_cache", kv_cache_config)
self._run_workers("compile_or_warm_up_model")
+ def get_kv_cache_spec(self) -> KVCacheSpec:
+ """
+ Get all kv cache needed by the model
+
+ This invokes `get_kv_cache_spec` on each worker and asserts that
+ they are identical. The KVCacheSpec is then returned.
+ """
+ kv_cache_specs = self._run_workers("get_kv_cache_spec")
+ assert all(s == kv_cache_specs[0] for s in kv_cache_specs)
+ return kv_cache_specs[0]
+
def _run_workers(
self,
method: str,
diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py
index c63d7a4c47c15..b3997caac726b 100644
--- a/vllm/v1/executor/uniproc_executor.py
+++ b/vllm/v1/executor/uniproc_executor.py
@@ -1,10 +1,11 @@
import os
-from typing import Optional, Tuple
+from typing import Optional
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.utils import get_distributed_init_method, get_ip, get_open_port
from vllm.v1.executor.abstract import Executor
+from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.worker.gpu_worker import Worker
@@ -49,20 +50,22 @@ def _create_worker(
distributed_init_method=distributed_init_method,
)
- def determine_num_available_blocks(self) -> Tuple[int, int]:
- """Determine the number of available KV blocks by invoking the
- underlying worker.
+ def determine_available_memory(self) -> int:
+ """Determine the available memory (in bytes) for KV cache by invoking
+ the underlying worker.
"""
- return self.worker.determine_num_available_blocks()
+ return self.worker.determine_available_memory()
- def initialize(self, num_gpu_blocks: int) -> None:
+ def get_kv_cache_spec(self) -> KVCacheSpec:
+ """Get all kv cache needed by the model by invoking the underlying
+ worker.
+ """
+ return self.worker.get_kv_cache_spec()
+
+ def initialize(self, kv_cache_config: KVCacheConfig) -> None:
"""Initialize the KV cache by invoking the underlying worker.
"""
- # NOTE: This is logged in the executor because there can be >1 worker
- # with other executors. We could log in the engine level, but work
- # remains to abstract away the device for non-GPU configurations.
- logger.info("# GPU blocks: %d", num_gpu_blocks)
- self.worker.initialize_cache(num_gpu_blocks)
+ self.worker.initialize_cache(kv_cache_config)
self.worker.compile_or_warm_up_model()
def execute_model(
diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py
new file mode 100644
index 0000000000000..6d5cc32ffc5b8
--- /dev/null
+++ b/vllm/v1/kv_cache_interface.py
@@ -0,0 +1,111 @@
+from dataclasses import dataclass
+from typing import Dict, List
+
+import torch
+
+from vllm.logger import init_logger
+from vllm.utils import cdiv, get_dtype_size
+
+logger = init_logger(__name__)
+
+
+@dataclass
+class KVCacheSpecBase:
+ """
+ A base class for specifying the KV cache format of one layer.
+ """
+
+ # number of tokens in a block
+ block_size: int
+
+ @property
+ def type_id(self) -> str:
+ """
+ The type identifier of this KV cache.
+ Return different strings for layers with different KV cache types (e.g.,
+ layers that cache a different number of tokens, such as full attention vs
+ sliding window attention, or layers with a different KV cache size per
+ token, such as layers with a different number of heads).
+
+ Returns:
+ The type identifier of this KV cache.
+ """
+ raise NotImplementedError
+
+ @property
+ def page_size_bytes(self) -> int:
+ """
+ The size of a page with `block_size` tokens in bytes.
+
+ Returns:
+ The page size
+ """
+ raise NotImplementedError
+
+ def bytes_for_tokens(self, num_tokens: int) -> int:
+ """
+ The KV cache size for `num_tokens` tokens in bytes. Returns the real
+ memory size after padding `num_tokens` to full blocks.
+
+ Returns:
+ The KV cache size
+ """
+ raise NotImplementedError
+
+
+@dataclass
+class FullAttentionSpec(KVCacheSpecBase):
+ num_kv_heads: int
+ head_size: int
+ dtype: torch.dtype
+
+ @property
+ def type_id(self) -> str:
+ return f"full_attention_{self.block_size}_{self.page_size_bytes}"
+
+ @property
+ def page_size_bytes(self) -> int:
+ return 2 * self.block_size * self.num_kv_heads * self.head_size \
+ * get_dtype_size(self.dtype)
+
+ def bytes_for_tokens(self, num_tokens: int) -> int:
+ return cdiv(num_tokens, self.block_size) * self.page_size_bytes
+
+
+KVCacheSpec = Dict[str, KVCacheSpecBase]
+
+
+@dataclass
+class KVCacheTensor:
+ """
+ A dataclass for specifying how the workers should initialize the KV cache
+ for a layer. For now it only contains the size of the KV cache for that
+ layer; it will be extended to support multiple layers sharing the same
+ memory pool.
+ """
+ size: int # The size of KV cache Tensor in bytes
+
+
+@dataclass
+class KVCacheConfig:
+ """
+ The KV cache configuration of a model.
+ """
+ """The number of KV cache blocks"""
+ num_blocks: int
+ """layer_name -> how to initialize KV cache for that layer"""
+ tensors: Dict[str, KVCacheTensor]
+ """
+ A list of kv-cache groups. Each group includes a set of layers with
+ the same kv-cache spec, and the total page_size of layers inside a group
+ is the same across all groups (as the KVCacheManager only supports allocating
+ pages of the same size). For example:
+ 1. A model only uses full attention: one group with all layers in the model.
+ 2. (not implemented yet) A model with the same number of full attention
+ layers and sliding window attention layers: two groups, one for full
+ attention layers and one for sliding window attention layers.
+ 3. (not implemented yet) A model with 2 full attention layers and 4 sliding
+ window attention layers: three groups, (full * 2), (sw * 2), (sw * 2).
+ """
+ groups: List[List[str]]
+ """the KVCacheSpec of the model"""
+ kv_cache_spec: KVCacheSpec
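Editor's note (not part of the patch): to show how the pieces of this new interface fit together, here is a hedged sketch that builds a KVCacheSpec for an assumed two-layer, full-attention toy model and assembles the corresponding KVCacheConfig by hand, mirroring what _get_kv_cache_config_uniform_type does. All concrete sizes and layer names are made up for illustration.

import torch
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                        KVCacheTensor)

# Assumed toy model: two identical full-attention decoder layers.
spec = {
    "model.layers.0.self_attn.attn": FullAttentionSpec(
        block_size=16, num_kv_heads=8, head_size=128, dtype=torch.float16),
    "model.layers.1.self_attn.attn": FullAttentionSpec(
        block_size=16, num_kv_heads=8, head_size=128, dtype=torch.float16),
}

page_size = next(iter(spec.values())).page_size_bytes
num_blocks = 1024  # assumed; normally derived from the profiled free memory

kv_cache_config = KVCacheConfig(
    num_blocks=num_blocks,
    tensors={name: KVCacheTensor(size=page_size * num_blocks) for name in spec},
    groups=[list(spec)],  # one group: all layers share the same spec
    kv_cache_spec=spec,
)
print(page_size, kv_cache_config.num_blocks)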
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index b0a7affbebb7e..8dfcf2dd78606 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -1,13 +1,20 @@
import multiprocessing
import os
import weakref
+from collections import defaultdict
from collections.abc import Sequence
-from typing import (Any, Callable, Dict, Generic, List, Optional, TypeVar,
- Union, overload)
+from typing import (TYPE_CHECKING, Any, Callable, Dict, Generic, List,
+ Optional, TypeVar, Union, overload)
+
+import torch
from vllm.logger import init_logger
+from vllm.model_executor.models.utils import extract_layer_index
from vllm.utils import get_mp_context, kill_process_tree
+if TYPE_CHECKING:
+ from vllm.attention.layer import Attention
+
logger = init_logger(__name__)
T = TypeVar("T")
@@ -134,3 +141,48 @@ def shutdown(proc: multiprocessing.Process, input_path: str, output_path: str):
socket_file = ipc_socket.replace("ipc://", "")
if os and os.path.exists(socket_file):
os.remove(socket_file)
+
+
+def bind_kv_cache(
+ kv_caches: Dict[str, torch.Tensor],
+ forward_context: Dict[str, "Attention"],
+ runner_kv_caches: List[torch.Tensor],
+) -> None:
+ """
+ Bind the allocated KV cache to both ModelRunner and forward context so
+ that the KV cache can be used in the forward pass.
+
+ This function:
+ 1) Fills the ModelRunner's kv cache list (`runner_kv_caches`) with
+ kv_caches.
+ 2) Associates each attention layer in the `forward_context` with its
+ corresponding KV cache in kv_caches.
+
+ Args:
+ kv_caches: The allocated kv_caches with layer names as keys.
+ forward_context: The global forward context containing all Attention
+ layers with layer names as keys.
+ runner_kv_caches: The kv_cache declared by ModelRunner.
+ """
+ # Bind kv_caches to ModelRunner
+ assert len(runner_kv_caches) == 0
+
+ # Convert kv_caches dict to a list of tensors in the order of layer_index.
+ index2name = defaultdict(list)
+ for layer_name in kv_caches:
+ index2name[extract_layer_index(layer_name)].append(layer_name)
+
+ for layer_index in sorted(index2name.keys()):
+ layer_names = index2name[layer_index]
+ if len(layer_names) > 1:
+ # One typical case is an encoder-decoder model, e.g., BART.
+ # The cross attention and self attention in the same decoder layer
+ # have different layer_names but the same layer_index.
+ raise NotImplementedError
+ layer_name = layer_names[0]
+ runner_kv_caches.append(kv_caches[layer_name])
+
+ # Bind kv_caches to forward context
+ for layer_name, kv_cache in kv_caches.items():
+ # NOTE: Use list because of v0 PP virtual engine.
+ forward_context[layer_name].kv_cache = [kv_cache]
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index de83640b27cd6..aa63d9414c296 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -7,6 +7,8 @@
import torch.distributed
import torch.nn as nn
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
from vllm.config import CompilationLevel, VllmConfig
from vllm.distributed.parallel_state import graph_capture
from vllm.forward_context import set_forward_context
@@ -16,14 +18,16 @@
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.sampling_params import SamplingType
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
- LayerBlockType, bind_kv_cache, cdiv,
- is_pin_memory_available)
+ LayerBlockType, cdiv, is_pin_memory_available)
from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend,
FlashAttentionMetadata)
from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
from vllm.v1.engine.mm_input_mapper import MMInputMapperClient
+from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
+ KVCacheSpec)
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.utils import bind_kv_cache
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
if TYPE_CHECKING:
@@ -856,15 +860,71 @@ def capture_model(self) -> None:
logger.info("Graph capturing finished in %.0f secs, took %.2f GiB",
elapsed_time, cuda_graph_size / (1 << 30))
- def initialize_kv_cache(self, num_blocks: int) -> None:
- assert len(self.kv_caches) == 0
- kv_cache_shape = FlashAttentionBackend.get_kv_cache_shape(
- num_blocks, self.block_size, self.num_kv_heads, self.head_size)
- for _ in range(self.num_attn_layers):
- self.kv_caches.append(
- torch.zeros(kv_cache_shape,
- dtype=self.kv_cache_dtype,
- device=self.device))
+ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
+ """
+ Initialize KV cache based on `kv_cache_config`.
+ Args:
+ kv_cache_config: Configuration for the KV cache, including the KV
+ cache size of each layer
+ """
+ if len(kv_cache_config.groups) > 1:
+ raise NotImplementedError(
+ "Hybrid models with more than one KV cache type are not "
+ "supported yet.")
+
+ kv_caches: Dict[str, torch.Tensor] = {}
+
+ for layer_name, layer_spec in kv_cache_config.kv_cache_spec.items():
+ tensor_config = kv_cache_config.tensors[layer_name]
+ assert tensor_config.size % layer_spec.page_size_bytes == 0
+ num_blocks = tensor_config.size // layer_spec.page_size_bytes
+ if isinstance(layer_spec, FullAttentionSpec):
+ kv_cache_shape = FlashAttentionBackend.get_kv_cache_shape(
+ num_blocks, layer_spec.block_size, layer_spec.num_kv_heads,
+ layer_spec.head_size)
+ dtype = layer_spec.dtype
+ kv_caches[layer_name] = torch.zeros(kv_cache_shape,
+ dtype=dtype,
+ device=self.device)
+ else:
+ raise NotImplementedError
+
bind_kv_cache(
+ kv_caches,
self.vllm_config.compilation_config.static_forward_context,
- [self.kv_caches])
+ self.kv_caches)
+
+ def get_kv_cache_spec(self) -> KVCacheSpec:
+ """
+ Generates the KVCacheSpec by parsing the kv cache format from each
+ Attention module in the static forward context.
+ Returns:
+ KVCacheSpec: A dictionary mapping layer names to their KV cache
+ format. Layers that do not need KV cache are not included.
+ """
+
+ forward_ctx = self.vllm_config.compilation_config.static_forward_context
+ block_size = self.vllm_config.cache_config.block_size
+ kv_cache_spec: KVCacheSpec = {}
+ for layer_name, attn_module in forward_ctx.items():
+ # TODO: Support other attention modules, e.g., sliding window,
+ # cross-attention, MLA.
+ assert isinstance(attn_module, Attention)
+ if attn_module.attn_type == AttentionType.DECODER:
+ kv_cache_spec[layer_name] = FullAttentionSpec(
+ block_size=block_size,
+ num_kv_heads=attn_module.num_kv_heads,
+ head_size=attn_module.head_size,
+ dtype=attn_module.dtype,
+ )
+ elif attn_module.attn_type in (AttentionType.ENCODER,
+ AttentionType.ENCODER_ONLY):
+ # encoder-only attention does not need KV cache.
+ continue
+ elif attn_module.attn_type == AttentionType.ENCODER_DECODER:
+ raise NotImplementedError
+ else:
+ raise ValueError(
+ f"Unknown attention type: {attn_module.attn_type}")
+
+ return kv_cache_spec
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 81b247e07ef4a..4fb4197f1822f 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -1,7 +1,7 @@
"""A GPU worker class."""
import gc
import os
-from typing import TYPE_CHECKING, Optional, Tuple
+from typing import TYPE_CHECKING, Optional
import torch
import torch.distributed
@@ -16,6 +16,7 @@
from vllm.platforms import current_platform
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, get_dtype_size
from vllm.v1.core.scheduler import SchedulerOutput
+from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
@@ -112,20 +113,18 @@ def load_model(self) -> None:
self.model_runner.load_model()
@torch.inference_mode()
- def determine_num_available_blocks(self) -> Tuple[int, int]:
- """Profiles the peak memory usage of the model to determine how many
- KV blocks may be allocated without OOMs.
+ def determine_available_memory(self) -> int:
+ """Profiles the peak memory usage of the model to determine how much
+ memory can be used for KV cache without OOMs.
The engine will first conduct a profiling of the existing memory usage.
- Then, it calculate the maximum possible number of GPU and CPU blocks
- that can be allocated with the remaining free memory.
+ Then, it calculates the free memory that can be used for KV cache in
+ bytes.
.. tip::
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
"""
- # Profile the memory usage of the model and get the maximum number of
- # cache blocks that can be allocated with the remaining free memory.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
@@ -161,33 +160,14 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
total_gpu_memory * self.cache_config.gpu_memory_utilization -
peak_memory)
- # Calculate the number of blocks that can be allocated with the
- # profiled peak memory.
- cache_block_size = _get_cache_block_size(self.cache_config,
- self.model_config,
- self.parallel_config)
- num_gpu_blocks = int(available_kv_cache_memory // cache_block_size)
- num_gpu_blocks = max(num_gpu_blocks, 0)
- return num_gpu_blocks, 0
-
- def initialize_cache(self, num_gpu_blocks: int) -> None:
- """Allocate GPU and CPU KV cache with the specified number of blocks."""
- if num_gpu_blocks <= 0:
- raise ValueError("No available memory for the cache blocks. "
- "Try increasing `gpu_memory_utilization` when "
- "initializing the engine.")
-
- max_seq_len = self.cache_config.block_size * num_gpu_blocks
- max_model_len = self.model_config.max_model_len
- if max_model_len > max_seq_len:
- raise ValueError(
- f"The model's max seq len ({max_model_len}) "
- "is larger than the maximum number of tokens that can be "
- f"stored in KV cache ({max_seq_len}). Try increasing "
- "`gpu_memory_utilization` or decreasing `max_model_len` when "
- "initializing the engine.")
+ return int(available_kv_cache_memory)
+
+ def get_kv_cache_spec(self) -> KVCacheSpec:
+ return self.model_runner.get_kv_cache_spec()
- self.model_runner.initialize_kv_cache(num_gpu_blocks)
+ def initialize_cache(self, kv_cache_config: KVCacheConfig) -> None:
+ """Allocate GPU KV cache with the specified kv_cache_config."""
+ self.model_runner.initialize_kv_cache(kv_cache_config)
def compile_or_warm_up_model(self) -> None:
if not self.model_config.enforce_eager:
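Editor's note (not part of the patch): the end-to-end sizing logic that EngineCore now drives reduces to a few lines of arithmetic. The sketch below reproduces it with plain numbers (all assumed) to show how the per-layer block count and the max_model_len feasibility check interact.

# Assumed figures for illustration only.
available_memory = 20 * (1 << 30)                # bytes left after profiling
num_layers = 32
block_size = 16
page_size_bytes = 2 * block_size * 8 * 128 * 2   # K+V, 8 KV heads, head_size 128, fp16
max_model_len = 8192

# check_enough_kv_cache_memory: one request at max_model_len must fit.
needed = num_layers * (-(-max_model_len // block_size)) * page_size_bytes  # cdiv
assert needed <= available_memory, "not enough memory for a single max-length request"

# _get_kv_cache_config_uniform_type: split memory evenly across layers.
num_blocks = available_memory // page_size_bytes // num_layers
print(f"needed={needed / 2**30:.2f} GiB, blocks per layer={num_blocks}")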
From a685225df4c4e8843f416452c497b7c0e9283f28 Mon Sep 17 00:00:00 2001
From: Karol Damaszke
Date: Fri, 17 Jan 2025 10:08:21 +0100
Subject: [PATCH 55/67] Check if kv_cache is a tuple before calling
split_kv_cache (#697)
---
vllm/attention/backends/hpu_attn.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py
index 98f7cc689278b..cfb6ecd57181c 100644
--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
@@ -215,7 +215,7 @@ def forward(
if attn_metadata.is_prompt:
key = key.unflatten(0, (block_indices.size(0), -1))
value = value.unflatten(0, (block_indices.size(0), -1))
- if kv_cache is not None:
+ if kv_cache is not None and isinstance(kv_cache, tuple):
key_cache, value_cache = HPUPagedAttention.split_kv_cache(
kv_cache, self.num_kv_heads, self.head_size)
@@ -342,7 +342,7 @@ def forward_encoder_decoder(
block_indices = attn_metadata.cross_block_indices
block_offsets = attn_metadata.cross_block_offsets
- if kv_cache is not None:
+ if kv_cache is not None and isinstance(kv_cache, tuple):
key_cache, value_cache = HPUPagedAttention.split_kv_cache(
kv_cache, self.num_kv_heads, self.head_size)
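Editor's note (not part of the patch): the intent of this one-line guard is easier to see in isolation: during warmup/profiling the kv_cache argument may be a dummy placeholder rather than the usual (key_cache, value_cache) tuple, so the cache should only be split and written when a real tuple arrives. The sketch below is a hedged illustration of that control flow, not the HPU backend code itself.

from typing import Any, Optional, Tuple

def maybe_split_kv_cache(kv_cache: Any) -> Tuple[Optional[Any], Optional[Any]]:
    # Only unpack when we actually received the (key_cache, value_cache) tuple;
    # dummy placeholders passed during profiling are ignored.
    if kv_cache is not None and isinstance(kv_cache, tuple):
        key_cache, value_cache = kv_cache
        return key_cache, value_cache
    return None, None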
From 07934cc237d16427d705e5abc3c83e4eb0f9b7f4 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Fri, 17 Jan 2025 19:32:28 +0800
Subject: [PATCH 56/67] [Misc][LoRA] Improve the readability of LoRA error
messages (#12102)
Signed-off-by: Jee Jee Li
---
.../entrypoints/openai/test_lora_adapters.py | 69 ++++++++---
tests/lora/test_lora_checkpoints.py | 16 +++
tests/lora/test_lora_huggingface.py | 3 +
tests/lora/test_lora_manager.py | 59 +---------
tests/lora/test_peft_helper.py | 109 ++++++++++++++++++
vllm/engine/multiprocessing/engine.py | 1 +
vllm/entrypoints/openai/serving_models.py | 24 ++--
vllm/lora/models.py | 12 +-
vllm/lora/peft_helper.py | 49 ++++++--
vllm/lora/worker_manager.py | 19 ++-
10 files changed, 245 insertions(+), 116 deletions(-)
create mode 100644 tests/lora/test_peft_helper.py
diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py
index 46a064f6d9e68..6ff99f6faa143 100644
--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@@ -17,6 +17,33 @@
# generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora"
+BADREQUEST_CASES = [
+ (
+ "test_rank",
+ {
+ "r": 1024
+ },
+ "is greater than max_lora_rank",
+ ),
+ (
+ "test_bias",
+ {
+ "bias": "all"
+ },
+ "Adapter bias cannot be used without bias_enabled",
+ ),
+ ("test_dora", {
+ "use_dora": True
+ }, "does not yet support DoRA"),
+ (
+ "test_modules_to_save",
+ {
+ "modules_to_save": ["lm_head"]
+ },
+ "only supports modules_to_save being None",
+ ),
+]
+
@pytest.fixture(scope="module")
def zephyr_lora_files():
@@ -138,32 +165,36 @@ async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
-async def test_dynamic_lora_invalid_lora_rank(client: openai.AsyncOpenAI,
- tmp_path, zephyr_lora_files):
- invalid_rank = tmp_path / "invalid_rank"
-
- # Copy adapter from zephyr_lora_files to invalid_rank
- shutil.copytree(zephyr_lora_files, invalid_rank)
-
- with open(invalid_rank / "adapter_config.json") as f:
+@pytest.mark.parametrize("test_name,config_change,expected_error",
+ BADREQUEST_CASES)
+async def test_dynamic_lora_badrequests(client: openai.AsyncOpenAI, tmp_path,
+ zephyr_lora_files, test_name: str,
+ config_change: dict,
+ expected_error: str):
+ # Create test directory
+ test_dir = tmp_path / test_name
+
+ # Copy adapter files
+ shutil.copytree(zephyr_lora_files, test_dir)
+
+ # Load and modify configuration
+ config_path = test_dir / "adapter_config.json"
+ with open(config_path) as f:
adapter_config = json.load(f)
+ # Apply configuration changes
+ adapter_config.update(config_change)
- print(adapter_config)
-
- # assert False
-
- # Change rank to invalid value
- adapter_config["r"] = 1024
- with open(invalid_rank / "adapter_config.json", "w") as f:
+ # Save modified configuration
+ with open(config_path, "w") as f:
json.dump(adapter_config, f)
- with pytest.raises(openai.BadRequestError,
- match="is greater than max_lora_rank"):
+ # Test loading the adapter
+ with pytest.raises(openai.BadRequestError, match=expected_error):
await client.post("load_lora_adapter",
cast_to=str,
body={
- "lora_name": "invalid-json",
- "lora_path": str(invalid_rank)
+ "lora_name": test_name,
+ "lora_path": str(test_dir)
})
diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py
index 537d95b025a9d..b907af47d08d7 100644
--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
@@ -3,6 +3,7 @@
import pytest
from vllm.lora.models import LoRAModel
+from vllm.lora.peft_helper import PEFTHelper
from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
from vllm.model_executor.models.utils import WeightsMapper
@@ -30,11 +31,14 @@ def test_load_checkpoints(
else:
expected_lora_modules.append(module)
if lora_name == "baichuan7B":
+ peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files,
+ max_position_embeddings=4096)
# For the baichuan7B model, load it's LoRA,
# and the test should pass.
LoRAModel.from_local_checkpoint(
baichuan_lora_files,
expected_lora_modules,
+ peft_helper=peft_helper,
lora_model_id=1,
device="cpu",
embedding_modules=embedding_modules,
@@ -43,9 +47,12 @@ def test_load_checkpoints(
# Test that the target_modules contain prefix
# such as "model.layers.0.self_atten.W_pack", and
# the test should pass.
+ peft_helper = PEFTHelper.from_local_dir(baichuan_zero_lora_files,
+ max_position_embeddings=4096)
LoRAModel.from_local_checkpoint(
baichuan_zero_lora_files,
expected_lora_modules,
+ peft_helper=peft_helper,
lora_model_id=1,
device="cpu",
embedding_modules=embedding_modules,
@@ -53,9 +60,12 @@ def test_load_checkpoints(
elif lora_name == "baichuan7B-zero-regex":
# Test that the `target_modules` in the form of regular expressions,
# such as `model\\..*(W_pack|o_proj)`, and the test should pass.
+ peft_helper = PEFTHelper.from_local_dir(baichuan_regex_lora_files,
+ max_position_embeddings=4096)
LoRAModel.from_local_checkpoint(
baichuan_regex_lora_files,
expected_lora_modules,
+ peft_helper=peft_helper,
lora_model_id=1,
device="cpu",
embedding_modules=embedding_modules,
@@ -64,10 +74,13 @@ def test_load_checkpoints(
# For the baichuan7B model, load chatglm3-6b's LoRA,
# and the test should raise the following error.
expected_error = "Please verify that the loaded LoRA module is correct" # noqa: E501
+ peft_helper = PEFTHelper.from_local_dir(chatglm3_lora_files,
+ max_position_embeddings=4096)
with pytest.raises(ValueError, match=expected_error):
LoRAModel.from_local_checkpoint(
chatglm3_lora_files,
expected_lora_modules,
+ peft_helper=peft_helper,
lora_model_id=1,
device="cpu",
embedding_modules=embedding_modules,
@@ -94,9 +107,12 @@ def test_lora_weights_mapping(baichuan_lora_files):
".layers.": ".baichuan_layers.",
},
)
+ peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files,
+ max_position_embeddings=4096)
lora_model = LoRAModel.from_local_checkpoint(
baichuan_lora_files,
expected_lora_modules,
+ peft_helper=peft_helper,
lora_model_id=1,
device="cpu",
embedding_modules=embedding_modules,
diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py
index e2daf9d135113..1c0ee01c038d0 100644
--- a/tests/lora/test_lora_huggingface.py
+++ b/tests/lora/test_lora_huggingface.py
@@ -3,6 +3,7 @@
import pytest
from vllm.lora.models import LoRAModel
+from vllm.lora.peft_helper import PEFTHelper
from vllm.lora.utils import get_adapter_absolute_path
from vllm.model_executor.models.llama import LlamaForCausalLM
@@ -27,9 +28,11 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
lora_path = get_adapter_absolute_path(lora_name)
# lora loading should work for either absolute path and hugggingface id.
+ peft_helper = PEFTHelper.from_local_dir(lora_path, 4096)
lora_model = LoRAModel.from_local_checkpoint(
lora_path,
expected_lora_modules,
+ peft_helper=peft_helper,
lora_model_id=1,
device="cpu",
embedding_modules=embedding_modules,
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index ca523c66abe42..9a5b9aabf5078 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -1,5 +1,3 @@
-import json
-import math
import os
from typing import Dict, List
@@ -34,56 +32,6 @@
] if current_platform.is_cuda_alike() else ["cpu"])
-def test_peft_helper(sql_lora_files):
- lora_config_path = os.path.join(sql_lora_files, "adapter_config.json")
- with open(lora_config_path) as f:
- config = json.load(f)
- peft_helper = PEFTHelper.from_dict(config)
- assert peft_helper.r == 8
- assert peft_helper.lora_alpha == 16
- assert peft_helper.target_modules == [
- "q_proj",
- "v_proj",
- "k_proj",
- "o_proj",
- "gate_proj",
- "up_proj",
- "down_proj",
- "embed_tokens",
- "lm_head",
- ]
- scaling = peft_helper.lora_alpha / peft_helper.r
- assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3
-
- # test RSLoRA
- config = dict(r=8,
- lora_alpha=16,
- target_modules=["gate_proj"],
- use_rslora=True)
- peft_helper = PEFTHelper.from_dict(config)
-
- scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r)
- assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3
-
- expected_error = "vLLM only supports modules_to_save being None."
- with pytest.raises(ValueError, match=expected_error):
- config = dict(
- r=8,
- lora_alpha=16,
- target_modules=["gate_proj"],
- modules_to_save=["lm_head"],
- )
- PEFTHelper.from_dict(config)
-
- expected_error = "vLLM does not yet support DoRA."
- with pytest.raises(ValueError, match=expected_error):
- config = dict(r=8,
- lora_alpha=16,
- target_modules=["gate_proj"],
- use_dora=True)
- PEFTHelper.from_dict(config)
-
-
@pytest.mark.parametrize("device", DEVICES)
def test_from_lora_tensors(sql_lora_files, device):
tensors = load_file(
@@ -91,11 +39,8 @@ def test_from_lora_tensors(sql_lora_files, device):
new_embeddings = load_file(
os.path.join(sql_lora_files, "new_embeddings.safetensors"))
- lora_config_path = os.path.join(sql_lora_files, "adapter_config.json")
- with open(lora_config_path) as f:
- config = json.load(f)
-
- peft_helper = PEFTHelper.from_dict(config)
+ peft_helper = PEFTHelper.from_local_dir(sql_lora_files,
+ max_position_embeddings=4096)
lora_model = LoRAModel.from_lora_tensors(
1,
tensors,
diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py
new file mode 100644
index 0000000000000..a524d5ce5f34a
--- /dev/null
+++ b/tests/lora/test_peft_helper.py
@@ -0,0 +1,109 @@
+import json
+import math
+import shutil
+
+import pytest
+
+from vllm.config import LoRAConfig
+from vllm.lora.peft_helper import PEFTHelper
+
+ERROR_CASES = [
+ (
+ "test_rank",
+ {
+ "r": 1024
+ },
+ "is greater than max_lora_rank",
+ ),
+ (
+ "test_bias",
+ {
+ "bias": "all"
+ },
+ "Adapter bias cannot be used without bias_enabled",
+ ),
+ ("test_dora", {
+ "use_dora": True
+ }, "does not yet support DoRA"),
+ (
+ "test_modules_to_save",
+ {
+ "modules_to_save": ["lm_head"]
+ },
+ "only supports modules_to_save being None",
+ ),
+]
+
+
+def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path):
+ peft_helper = PEFTHelper.from_local_dir(long_context_lora_files_16k_1,
+ max_position_embeddings=4096)
+ lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)
+ peft_helper.validate_legal(lora_config)
+ assert peft_helper.r == 8
+ assert peft_helper.lora_alpha == 16
+ assert peft_helper.target_modules == [
+ "q_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "gate_proj",
+ "up_proj",
+ "down_proj",
+ "embed_tokens",
+ "lm_head",
+ ]
+ assert peft_helper.context_length == 16384
+ assert peft_helper.vllm_max_position_embeddings == 4096
+ assert peft_helper.vllm_long_context_scaling_factor == float(
+ math.ceil(peft_helper.context_length /
+ peft_helper.vllm_max_position_embeddings))
+ # test RSLoRA
+ rslora_config = dict(use_rslora=True)
+ test_dir = tmp_path / "test_rslora"
+ shutil.copytree(long_context_lora_files_16k_1, test_dir)
+
+ # Load and modify configuration
+ config_path = test_dir / "adapter_config.json"
+ with open(config_path) as f:
+ adapter_config = json.load(f)
+ # Apply configuration changes
+ adapter_config.update(rslora_config)
+
+ # Save modified configuration
+ with open(config_path, "w") as f:
+ json.dump(adapter_config, f)
+
+ peft_helper = PEFTHelper.from_local_dir(test_dir,
+ max_position_embeddings=4096)
+ peft_helper.validate_legal(lora_config)
+ scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r)
+ assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3
+
+
+@pytest.mark.parametrize("test_name,config_change,expected_error", ERROR_CASES)
+def test_peft_helper_error(
+ sql_lora_files,
+ tmp_path,
+ test_name: str,
+ config_change: dict,
+ expected_error: str,
+):
+ test_dir = tmp_path / test_name
+ shutil.copytree(sql_lora_files, test_dir)
+
+ # Load and modify configuration
+ config_path = test_dir / "adapter_config.json"
+ with open(config_path) as f:
+ adapter_config = json.load(f)
+ # Apply configuration changes
+ adapter_config.update(config_change)
+
+ # Save modified configuration
+ with open(config_path, "w") as f:
+ json.dump(adapter_config, f)
+ lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)
+ # Test loading the adapter
+ with pytest.raises(ValueError, match=expected_error):
+ PEFTHelper.from_local_dir(
+ test_dir, max_position_embeddings=4096).validate_legal(lora_config)
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index 8f231de912c95..3aa9d30549f36 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -296,6 +296,7 @@ def _handle_load_adapter_request(self, request: RPCLoadAdapterRequest):
is_engine_errored=False,
exception=e)
self._send_outputs(rpc_err)
+ return
# Otherwise, send back the successful load message
self._send_outputs(
RPCAdapterLoadedResponse(request_id=request.request_id))
diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
index a222eafadcb68..fc422f0917bd5 100644
--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -157,24 +157,16 @@ async def load_lora_adapter(
# This will also pre-load it for incoming requests
try:
await self.engine_client.add_lora(lora_request)
- except ValueError as e:
- # Adapter not found or lora configuration errors
- if "No adapter found" in str(e):
- return create_error_response(message=str(e),
- err_type="NotFoundError",
- status_code=HTTPStatus.NOT_FOUND)
- else:
- return create_error_response(
- message=str(e),
- err_type="BadRequestError",
- status_code=HTTPStatus.BAD_REQUEST)
except BaseException as e:
- # Some other unexpected problem loading the adapter, e.g. malformed
- # input files.
- # More detailed error messages for the user would be nicer here
+ error_type = "BadRequestError"
+ status_code = HTTPStatus.BAD_REQUEST
+ if isinstance(e, ValueError) and "No adapter found" in str(e):
+ error_type = "NotFoundError"
+ status_code = HTTPStatus.NOT_FOUND
+
return create_error_response(message=str(e),
- err_type="BadRequestError",
- status_code=HTTPStatus.BAD_REQUEST)
+ err_type=error_type,
+ status_code=status_code)
self.lora_requests.append(lora_request)
logger.info("Loaded new LoRA adapter: name '%s', path '%s'", lora_name,
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 5b7225bdc8f37..9809405ca9a61 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -1,5 +1,4 @@
import copy
-import json
import math
import os
import re
@@ -180,8 +179,8 @@ def from_local_checkpoint(
cls,
lora_dir: str,
expected_lora_modules: List[str],
+ peft_helper: PEFTHelper,
*,
- max_position_embeddings: Optional[int] = None,
lora_model_id: Optional[int] = None,
device: str = "cuda",
dtype: Optional[torch.dtype] = None,
@@ -196,9 +195,7 @@ def from_local_checkpoint(
lora_dir: The local path that has lora data.
expected_lora_modules: Name of modules that are expected to be
replaced by lora.
- max_position_embeddings: Max position embedding length. Used to
- scaling the largest context length. If None, the lora model's
- context length is not scaled.
+ peft_helper: Loaded lora configuration information.
lora_model_id: Lora model id. If not given, automatically set by
a global counter.
device: Device where the lora model is loaded.
@@ -207,18 +204,13 @@ def from_local_checkpoint(
Returns:
Loaded LoRA Model.
"""
- lora_config_path = os.path.join(lora_dir, "adapter_config.json")
lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors")
lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin")
new_embeddings_tensor_path = os.path.join(
lora_dir, "new_embeddings.safetensors")
new_embeddings_bin_file_path = os.path.join(lora_dir,
"new_embeddings.bin")
- with open(lora_config_path) as f:
- config = json.load(f)
- config["vllm_max_position_embeddings"] = max_position_embeddings
- peft_helper = PEFTHelper.from_dict(config)
unexpected_modules: List[Union[list[str], str]]
if os.path.isfile(lora_tensor_path):
tensors: Dict[str, torch.Tensor] = {}
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index dacfb9ebd1480..b9c506f6e0bfd 100644
--- a/vllm/lora/peft_helper.py
+++ b/vllm/lora/peft_helper.py
@@ -1,9 +1,12 @@
# Adapted from: https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/config.py
+import json
import math
+import os
from dataclasses import MISSING, dataclass, field, fields
-from typing import Literal, Optional, Union
+from typing import List, Literal, Optional, Union
+from vllm.config import LoRAConfig
from vllm.logger import init_logger
logger = init_logger(__name__)
@@ -11,6 +14,12 @@
@dataclass
class PEFTHelper:
+ """
+ A helper class for PEFT configurations, specifically designed for LoRA.
+ This class handles configuration validation and compatibility checks for
+ various LoRA implementations.
+ """
+
# Required fields
r: int
lora_alpha: int
@@ -29,20 +38,18 @@ class PEFTHelper:
vllm_max_position_embeddings: Optional[int] = field(default=False)
vllm_long_context_scaling_factor: Optional[float] = field(default=None)
- def _validate_features(self):
+ def _validate_features(self) -> List[str]:
+ """
+ Check if there are any unsupported LoRA features.
+ """
error_msg = []
-
if self.modules_to_save:
error_msg.append("vLLM only supports modules_to_save being None.")
-
if self.use_dora:
error_msg.append("vLLM does not yet support DoRA.")
-
- if error_msg:
- raise ValueError(f"{', '.join(error_msg)}")
+ return error_msg
def __post_init__(self):
- self._validate_features()
if self.use_rslora:
logger.info_once("Loading LoRA weights trained with rsLoRA.")
self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r)
@@ -78,3 +85,29 @@ def from_dict(cls, config_dict: dict) -> "PEFTHelper":
for k, v in config_dict.items() if k in class_fields
}
return cls(**filtered_dict)
+
+ @classmethod
+ def from_local_dir(cls, lora_path: str,
+ max_position_embeddings: Optional[int]) -> "PEFTHelper":
+ lora_config_path = os.path.join(lora_path, "adapter_config.json")
+
+ with open(lora_config_path) as f:
+ config = json.load(f)
+ config["vllm_max_position_embeddings"] = max_position_embeddings
+ return cls.from_dict(config)
+
+ def validate_legal(self, lora_config: LoRAConfig) -> None:
+ """
+ Validates the LoRA configuration settings against application
+ constraints and requirements.
+ """
+ error_msg = self._validate_features()
+ if self.r > lora_config.max_lora_rank:
+ error_msg.append(
+ f"LoRA rank {self.r} is greater than max_lora_rank"
+ f" {lora_config.max_lora_rank}.")
+ if self.bias != "none" and not lora_config.bias_enabled:
+ error_msg.append(
+ "Adapter bias cannot be used without bias_enabled.")
+ if error_msg:
+ raise ValueError(f"{' '.join(error_msg)}")
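A minimal usage sketch of the new PEFTHelper flow introduced above (parse the adapter config once, then fail fast before loading weights), assuming an adapter directory containing `adapter_config.json`; the path is illustrative.

```python
from vllm.config import LoRAConfig
from vllm.lora.peft_helper import PEFTHelper

lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)

# Parse adapter_config.json up front...
peft_helper = PEFTHelper.from_local_dir("/path/to/adapter",
                                        max_position_embeddings=4096)
# ...and raise a ValueError before any weights are loaded if the adapter
# violates the server's LoRA constraints (rank, bias, unsupported features).
peft_helper.validate_legal(lora_config)
print(peft_helper.vllm_lora_scaling_factor)
```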
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index eec462743fe9d..a64296f7fd902 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -12,6 +12,7 @@
from vllm.logger import init_logger
from vllm.lora.models import (LoRAModel, LoRAModelManager,
LRUCacheLoRAModelManager, create_lora_manager)
+from vllm.lora.peft_helper import PEFTHelper
from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path
@@ -95,6 +96,13 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
expected_lora_modules = list(set(expected_lora_modules))
lora_path = get_adapter_absolute_path(lora_request.lora_path)
+ peft_helper = PEFTHelper.from_local_dir(
+ lora_path, self.max_position_embeddings)
+
+ # Validates the LoRA configuration against requirements before
+ # loading weights, throwing an exception if validation fails.
+ peft_helper.validate_legal(self.lora_config)
+
# For some models like Qwen2VL, we need to use hf_to_vllm_mapper
# to ensure correct loading of lora weights.
hf_to_vllm_mapper = None
@@ -105,7 +113,7 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
lora = self._lora_model_cls.from_local_checkpoint(
lora_path,
expected_lora_modules,
- max_position_embeddings=self.max_position_embeddings,
+ peft_helper=peft_helper,
lora_model_id=lora_request.lora_int_id,
device="cpu",
dtype=self.lora_config.lora_dtype,
@@ -120,15 +128,14 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
# - No adapter found to download from huggingface (or in
# offline mode)
# - No local adapter files found at `lora_request.lora_path`
+ # Surfaced to the API layer as NotFoundError
raise ValueError(
f"Loading lora {lora_request.lora_name} failed: No adapter "
f"found for {lora_path}") from e
except Exception as e:
- raise RuntimeError(f"Loading lora {lora_path} failed") from e
- if lora.rank > self.lora_config.max_lora_rank:
- raise ValueError(
- f"LoRA rank {lora.rank} is greater than max_lora_rank "
- f"{self.lora_config.max_lora_rank}.")
+ # Surfaced to the API layer as BadRequestError
+ raise e
+
if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size:
raise ValueError(f"LoRA added vocab size {lora.extra_vocab_size} "
f"is greater than lora_extra_vocab_size "
From d4e619457075c0dd917b84644f467f7f8aae10f0 Mon Sep 17 00:00:00 2001
From: "Li, Jiang"
Date: Fri, 17 Jan 2025 19:39:52 +0800
Subject: [PATCH 57/67] [CI/Build][CPU][Bugfix] Fix CPU CI (#12150)
Signed-off-by: jiang1.li
---
.buildkite/run-cpu-test.sh | 4 ++--
vllm/model_executor/layers/activation.py | 8 ++++++--
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 9925db7bea593..e19ace782feb5 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -83,6 +83,6 @@ function cpu_tests() {
tests/lora/test_qwen2vl.py"
}
-# All of CPU tests are expected to be finished less than 25 mins.
+# All of the CPU tests are expected to finish in less than 40 mins.
export -f cpu_tests
-timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index af7894b42c560..fb9684ac1c184 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -30,8 +30,10 @@ class FatreluAndMul(CustomOp):
def __init__(self, threshold: float = 0.):
super().__init__()
self.threshold = threshold
- if current_platform.is_cuda_alike() or current_platform.is_cpu():
+ if current_platform.is_cuda_alike():
self.op = torch.ops._C.fatrelu_and_mul
+ elif current_platform.is_cpu():
+ self._forward_method = self.forward_native
def forward_native(self, x: torch.Tensor) -> torch.Tensor:
d = x.shape[-1] // 2
@@ -100,11 +102,13 @@ class MulAndSilu(CustomOp):
def __init__(self):
super().__init__()
- if current_platform.is_cuda_alike() or current_platform.is_cpu():
+ if current_platform.is_cuda_alike():
self.op = torch.ops._C.mul_and_silu
elif current_platform.is_xpu():
from vllm._ipex_ops import ipex_ops
self.op = ipex_ops.silu_and_mul
+ elif current_platform.is_cpu():
+ self._forward_method = self.forward_native
def forward_native(self, x: torch.Tensor) -> torch.Tensor:
"""PyTorch-native implementation equivalent to forward()."""
From 87a0c076afafb93dd082ff3876bea08adca56c56 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Fri, 17 Jan 2025 20:47:01 +0800
Subject: [PATCH 58/67] [core] allow callable in collective_rpc (#12151)
Signed-off-by: youkaichao
---
.buildkite/test-pipeline.yaml | 4 ++-
tests/engine/test_custom_executor.py | 4 +--
tests/entrypoints/llm/test_collective_rpc.py | 36 ++++++++++++++++++++
vllm/engine/llm_engine.py | 17 +++++++--
vllm/entrypoints/llm.py | 14 +++++---
vllm/executor/executor_base.py | 9 ++---
vllm/executor/mp_distributed_executor.py | 21 ++++++++----
vllm/executor/multiproc_worker_utils.py | 12 +++----
vllm/executor/ray_distributed_executor.py | 14 +++++---
vllm/executor/uniproc_executor.py | 14 +++-----
vllm/utils.py | 23 +++++++++++++
vllm/v1/executor/multiproc_executor.py | 19 ++++++++---
vllm/worker/worker_base.py | 10 +++---
13 files changed, 147 insertions(+), 50 deletions(-)
create mode 100644 tests/entrypoints/llm/test_collective_rpc.py
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 7442de245bd80..bff557d7fc92f 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -107,7 +107,7 @@ steps:
source_file_dependencies:
- vllm/
commands:
- - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
+ - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
@@ -466,7 +466,9 @@ steps:
- vllm/worker/worker_base.py
- vllm/worker/worker.py
- vllm/worker/model_runner.py
+ - entrypoints/llm/test_collective_rpc.py
commands:
+ - pytest -v -s entrypoints/llm/test_collective_rpc.py
- torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
diff --git a/tests/engine/test_custom_executor.py b/tests/engine/test_custom_executor.py
index 2a057ca488a50..fdfcd4f4c9d50 100644
--- a/tests/engine/test_custom_executor.py
+++ b/tests/engine/test_custom_executor.py
@@ -1,6 +1,6 @@
import asyncio
import os
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import pytest
@@ -18,7 +18,7 @@ class Mock:
class CustomUniExecutor(UniProcExecutor):
def collective_rpc(self,
- method: str,
+ method: Union[str, Callable],
timeout: Optional[float] = None,
args: Tuple = (),
kwargs: Optional[Dict] = None) -> List[Any]:
diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py
new file mode 100644
index 0000000000000..22473ce275295
--- /dev/null
+++ b/tests/entrypoints/llm/test_collective_rpc.py
@@ -0,0 +1,36 @@
+import pytest
+
+from vllm import LLM
+
+from ...utils import fork_new_process_for_each_test
+
+
+@pytest.mark.parametrize("tp_size", [1, 2])
+@pytest.mark.parametrize("backend", ["mp", "ray"])
+@fork_new_process_for_each_test
+def test_collective_rpc(tp_size, backend):
+ if tp_size == 1 and backend == "ray":
+ pytest.skip("Skip duplicate test case")
+ if tp_size == 1:
+ backend = None
+
+ # intentionally define the method and class in the test function,
+ # to test if they can be serialized and sent to the workers
+ def echo_rank(self):
+ return self.rank
+
+ from vllm.worker.worker import Worker
+
+ class MyWorker(Worker):
+
+ def echo_rank(self):
+ return self.rank
+
+ llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
+ enforce_eager=True,
+ load_format="dummy",
+ tensor_parallel_size=tp_size,
+ distributed_executor_backend=backend,
+ worker_cls=MyWorker)
+ for method in ["echo_rank", echo_rank]:
+ assert llm.collective_rpc(method) == list(range(tp_size))
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 5d19ce03d5b58..88c21f9a6d31b 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -5,10 +5,10 @@
from contextlib import contextmanager
from dataclasses import dataclass
from functools import partial
-from typing import (TYPE_CHECKING, Callable, ClassVar, Deque, Dict, Iterable,
- List, Mapping, NamedTuple, Optional)
+from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
+ Iterable, List, Mapping, NamedTuple, Optional)
from typing import Sequence as GenericSequence
-from typing import Set, Type, Union, cast, overload
+from typing import Set, Tuple, Type, Union, cast, overload
import torch
from typing_extensions import TypeVar, deprecated
@@ -1816,6 +1816,17 @@ def start_profile(self) -> None:
def stop_profile(self) -> None:
self.model_executor.stop_profile()
+ def collective_rpc(self,
+ method: Union[str, Callable],
+ timeout: Optional[float] = None,
+ args: Tuple = (),
+ kwargs: Optional[Dict] = None) -> List[Any]:
+ """
+ See LLM.collective_rpc for more details.
+ """
+ return self.model_executor.collective_rpc(method, timeout, args,
+ kwargs)
+
def check_health(self) -> None:
if self.tokenizer:
self.tokenizer.check_health()
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index b78d5c65a40f8..0cfe6be9ac767 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1,8 +1,8 @@
import itertools
import warnings
from contextlib import contextmanager
-from typing import (Any, ClassVar, Dict, List, Optional, Sequence, Tuple, Type,
- Union, cast, overload)
+from typing import (Any, Callable, ClassVar, Dict, List, Optional, Sequence,
+ Tuple, Type, Union, cast, overload)
import cloudpickle
from tqdm import tqdm
@@ -464,7 +464,7 @@ def generate(
return self.engine_class.validate_outputs(outputs, RequestOutput)
def collective_rpc(self,
- method: str,
+ method: Union[str, Callable],
timeout: Optional[float] = None,
args: Tuple = (),
kwargs: Optional[Dict] = None) -> List[Any]:
@@ -476,9 +476,13 @@ def collective_rpc(self,
Then, users can call the new methods through this API.
It is recommended to use this API to only pass control messages,
and set up data-plane communication to pass data.
+ The method can also be a callable, which will be serialized
+ and sent to all workers to execute.
+ If the method is a callable, it should accept an additional
+ `self` argument, in addition to the arguments passed in `args`
+ and `kwargs`. The `self` argument will be the worker object.
"""
- return self.llm_engine.model_executor.collective_rpc(
- method, timeout, args, kwargs)
+ return self.llm_engine.collective_rpc(method, timeout, args, kwargs)
def beam_search(
self,
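As a usage illustration of the callable form documented above (not part of the patch): a minimal sketch assuming a small model and a single worker; the model name and the `rank` attribute access are assumptions based on the test added in this commit.

```python
from vllm import LLM

# The callable is cloudpickled, sent to every worker, and invoked with the
# worker object bound as `self`.
def echo_rank(self):
    return self.rank

llm = LLM(model="facebook/opt-125m", enforce_eager=True, load_format="dummy")
print(llm.collective_rpc(echo_rank))  # e.g. [0] with a single worker
```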
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 00ecadcf92667..d8457cb693cdb 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -1,6 +1,7 @@
import asyncio
from abc import ABC, abstractmethod
-from typing import Any, Awaitable, Dict, List, Optional, Set, Tuple, Union
+from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
+ Union)
from vllm.config import VllmConfig
from vllm.logger import init_logger
@@ -47,7 +48,7 @@ def _init_executor(self) -> None:
@abstractmethod
def collective_rpc(self,
- method: str,
+ method: Union[str, Callable],
timeout: Optional[float] = None,
args: Tuple = (),
kwargs: Optional[Dict] = None) -> List[Any]:
@@ -260,7 +261,7 @@ def _driver_execute_model(
raise NotImplementedError
def collective_rpc(self,
- method: str,
+ method: Union[str, Callable],
timeout: Optional[float] = None,
args: Tuple = (),
kwargs: Optional[Dict] = None) -> List[Any]:
@@ -269,7 +270,7 @@ def collective_rpc(self,
@abstractmethod
def _run_workers(
self,
- method: str,
+ method: Union[str, Callable],
*args,
async_run_tensor_parallel_workers_only: bool = False,
max_concurrent_workers: Optional[int] = None,
diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py
index d9dde949b844a..8ae88e646aad6 100644
--- a/vllm/executor/mp_distributed_executor.py
+++ b/vllm/executor/mp_distributed_executor.py
@@ -1,5 +1,7 @@
import asyncio
-from typing import Any, List, Optional
+from typing import Any, Callable, List, Optional, Union
+
+import cloudpickle
from vllm.executor.executor_base import DistributedExecutorBase
from vllm.executor.multiproc_worker_utils import (
@@ -9,7 +11,7 @@
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import ExecuteModelRequest
from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
- get_ip, get_open_port, make_async)
+ get_ip, get_open_port, make_async, run_method)
from vllm.worker.worker_base import WorkerWrapperBase
logger = init_logger(__name__)
@@ -107,7 +109,7 @@ def _driver_execute_model(
def _run_workers(
self,
- method: str,
+ method: Union[str, Callable],
*args,
async_run_tensor_parallel_workers_only: bool = False,
max_concurrent_workers: Optional[int] = None,
@@ -121,6 +123,11 @@ def _run_workers(
It will also be run asynchronously and return a list of futures
rather than blocking on the results.
"""
+ if isinstance(method, str):
+ sent_method = method
+ else:
+ sent_method = cloudpickle.dumps(method)
+ del method
if max_concurrent_workers:
raise NotImplementedError(
@@ -129,18 +136,18 @@ def _run_workers(
if async_run_tensor_parallel_workers_only:
# Run only non-driver workers and just return futures.
return [
- worker.execute_method(method, *args, **kwargs)
+ worker.execute_method(sent_method, *args, **kwargs)
for worker in self.non_driver_workers
]
# Start all remote workers first.
worker_outputs = [
- worker.execute_method(method, *args, **kwargs)
+ worker.execute_method(sent_method, *args, **kwargs)
for worker in self.workers
]
- driver_worker_method = getattr(self.driver_worker, method)
- driver_worker_output = driver_worker_method(*args, **kwargs)
+ driver_worker_output = run_method(self.driver_worker, sent_method,
+ args, kwargs)
# Get the results of the workers.
return [driver_worker_output
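A minimal sketch of the serialize-once pattern applied above, assuming only that callables are cloudpickled before broadcast while plain method names pass through unchanged; `prepare_for_broadcast` is an illustrative helper.

```python
import cloudpickle

def prepare_for_broadcast(method):
    # Strings pass through unchanged; callables are pickled once here and
    # unpickled on each worker before execution.
    return method if isinstance(method, str) else cloudpickle.dumps(method)

payload = prepare_for_broadcast(lambda self: self.rank)
assert isinstance(payload, bytes)
assert prepare_for_broadcast("load_model") == "load_model"
```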
diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py
index c9fb3c664c575..539b6ae2d3572 100644
--- a/vllm/executor/multiproc_worker_utils.py
+++ b/vllm/executor/multiproc_worker_utils.py
@@ -15,7 +15,7 @@
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.triton_utils.importing import HAS_TRITON
-from vllm.utils import _check_multiproc_method, get_mp_context
+from vllm.utils import _check_multiproc_method, get_mp_context, run_method
if HAS_TRITON:
from vllm.triton_utils import maybe_set_triton_cache_manager
@@ -169,7 +169,7 @@ def __init__(self, result_handler: ResultHandler,
self.process.start()
def _enqueue_task(self, future: Union[ResultFuture, asyncio.Future],
- method: str, args, kwargs):
+ method: Union[str, bytes], args, kwargs):
task_id = uuid.uuid4()
self.tasks[task_id] = future
try:
@@ -180,12 +180,13 @@ def _enqueue_task(self, future: Union[ResultFuture, asyncio.Future],
del self.tasks[task_id]
raise ChildProcessError("worker died") from e
- def execute_method(self, method: str, *args, **kwargs):
+ def execute_method(self, method: Union[str, bytes], *args, **kwargs):
future: ResultFuture = ResultFuture()
self._enqueue_task(future, method, args, kwargs)
return future
- async def execute_method_async(self, method: str, *args, **kwargs):
+ async def execute_method_async(self, method: Union[str, bytes], *args,
+ **kwargs):
future = asyncio.get_running_loop().create_future()
self._enqueue_task(future, method, args, kwargs)
return await future
@@ -230,8 +231,7 @@ def _run_worker_process(
exception = None
task_id, method, args, kwargs = items
try:
- executor = getattr(worker, method)
- output = executor(*args, **kwargs)
+ output = run_method(worker, method, args, kwargs)
except SystemExit:
raise
except KeyboardInterrupt:
diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py
index 3baeb63918a62..2afd99f99b353 100644
--- a/vllm/executor/ray_distributed_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -2,8 +2,9 @@
import os
from collections import defaultdict
from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+import cloudpickle
import msgspec
import vllm.envs as envs
@@ -410,7 +411,7 @@ def execute_model(
def _run_workers(
self,
- method: str,
+ method: Union[str, Callable],
*args,
async_run_tensor_parallel_workers_only: bool = False,
max_concurrent_workers: Optional[int] = None,
@@ -426,6 +427,11 @@ def _run_workers(
rather than blocking on the results.
- args/kwargs: All workers share the same args/kwargs
"""
+ if isinstance(method, str):
+ sent_method = method
+ else:
+ sent_method = cloudpickle.dumps(method)
+ del method
if self.use_ray_spmd_worker:
assert not async_run_tensor_parallel_workers_only, (
"async_run_tensor_parallel_workers_only is not supported for "
@@ -440,7 +446,7 @@ def _run_workers(
if async_run_tensor_parallel_workers_only:
ray_workers = self.non_driver_workers
ray_worker_outputs = [
- worker.execute_method.remote(method, *args, **kwargs)
+ worker.execute_method.remote(sent_method, *args, **kwargs)
for worker in ray_workers
]
@@ -455,7 +461,7 @@ def _run_workers(
if not self.use_ray_spmd_worker:
# Start the driver worker after all the ray workers.
driver_worker_output = [
- self.driver_worker.execute_method(method, *args, **kwargs)
+ self.driver_worker.execute_method(sent_method, *args, **kwargs)
]
# Get the results of the ray workers.
diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py
index 27b83e95ba95b..a5c4dcf0ec7f9 100644
--- a/vllm/executor/uniproc_executor.py
+++ b/vllm/executor/uniproc_executor.py
@@ -1,5 +1,5 @@
import os
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import torch
import torch.distributed as dist
@@ -7,7 +7,8 @@
import vllm.envs as envs
from vllm.executor.executor_base import ExecutorBase
from vllm.logger import init_logger
-from vllm.utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+ run_method)
from vllm.worker.worker_base import WorkerWrapperBase
logger = init_logger(__name__)
@@ -39,18 +40,13 @@ def _init_executor(self) -> None:
self.collective_rpc("load_model")
def collective_rpc(self,
- method: str,
+ method: Union[str, Callable],
timeout: Optional[float] = None,
args: Tuple = (),
kwargs: Optional[Dict] = None) -> List[Any]:
if kwargs is None:
kwargs = {}
- try:
- func = getattr(self.driver_worker, method)
- except AttributeError:
- raise NotImplementedError(f"Method {method} is not implemented.") \
- from None
- answer = func(*args, **kwargs)
+ answer = run_method(self.driver_worker, method, args, kwargs)
return [answer]
def check_health(self) -> None:
diff --git a/vllm/utils.py b/vllm/utils.py
index 7477e7028f5ef..89ba119bb5e55 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -36,6 +36,7 @@
overload)
from uuid import uuid4
+import cloudpickle
import numpy as np
import numpy.typing as npt
import psutil
@@ -2166,3 +2167,25 @@ def bind_kv_cache(
assert len(forward_ctx.kv_cache) == len(kv_cache)
for ve, ve_kv_cache in enumerate(kv_cache):
forward_ctx.kv_cache[ve] = ve_kv_cache[kv_cache_idx]
+
+
+def run_method(obj: Any, method: Union[str, bytes, Callable], args: Tuple[Any],
+ kwargs: Dict[str, Any]) -> Any:
+ """
+ Run a method of an object with the given arguments and keyword arguments.
+ If the method is a string, it will be resolved to a bound method using getattr.
+ If the method is serialized bytes, it will be deserialized using
+ cloudpickle and called with the object as its first argument.
+ If the method is a callable, it will be called directly.
+ """
+ if isinstance(method, bytes):
+ func = partial(cloudpickle.loads(method), obj)
+ elif isinstance(method, str):
+ try:
+ func = getattr(obj, method)
+ except AttributeError:
+ raise NotImplementedError(f"Method {method!r} is not"
+ " implemented.") from None
+ else:
+ func = partial(method, obj) # type: ignore
+ return func(*args, **kwargs)
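For illustration, a small sketch exercising the three dispatch paths of `run_method` added above; the fake worker class and lambdas are test doubles, not vLLM APIs.

```python
import cloudpickle

from vllm.utils import run_method  # added in the diff above

class FakeWorker:
    rank = 3

    def ping(self):
        return "pong"

worker = FakeWorker()
print(run_method(worker, "ping", (), {}))                  # str -> getattr
print(run_method(worker, lambda self: self.rank, (), {}))  # callable -> partial
blob = cloudpickle.dumps(lambda self: self.rank * 2)
print(run_method(worker, blob, (), {}))                    # bytes -> cloudpickle.loads
```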
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index e92acc7cb5e41..fd977d07e8d81 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -6,9 +6,11 @@
import weakref
from dataclasses import dataclass
from enum import Enum, auto
+from functools import partial
from multiprocessing.process import BaseProcess
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import cloudpickle
import psutil
import zmq
@@ -120,7 +122,7 @@ def get_kv_cache_spec(self) -> KVCacheSpec:
return kv_cache_specs[0]
def collective_rpc(self,
- method: str,
+ method: Union[str, Callable],
timeout: Optional[float] = None,
args: Tuple = (),
kwargs: Optional[Dict] = None) -> List[Any]:
@@ -141,7 +143,12 @@ def collective_rpc(self,
kwargs = kwargs or {}
try:
- self.rpc_broadcast_mq.enqueue((method, args, kwargs))
+ if isinstance(method, str):
+ send_method = method
+ else:
+ send_method = cloudpickle.dumps(
+ method, protocol=pickle.HIGHEST_PROTOCOL)
+ self.rpc_broadcast_mq.enqueue((send_method, args, kwargs))
responses = [None] * self.world_size
for w in self.workers:
@@ -408,7 +415,11 @@ def worker_busy_loop(self):
method, args, kwargs = self.rpc_broadcast_mq.dequeue()
try:
- output = getattr(self.worker, method)(*args, **kwargs)
+ if isinstance(method, str):
+ func = getattr(self.worker, method)
+ elif isinstance(method, bytes):
+ func = partial(cloudpickle.loads(method), self.worker)
+ output = func(*args, **kwargs)
except Exception as e:
self.worker_response_mq.enqueue(
(WorkerProc.ResponseStatus.FAILURE, e))
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index bced5b9f44228..fb9919f7a7b6a 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -14,7 +14,8 @@
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import ExecuteModelRequest, IntermediateTensors
from vllm.utils import (enable_trace_function_call_for_thread,
- resolve_obj_by_qualname, update_environment_variables)
+ resolve_obj_by_qualname, run_method,
+ update_environment_variables)
from vllm.worker.model_runner_base import (BroadcastableModelInput,
ModelRunnerBase,
ModelRunnerInputBase)
@@ -539,17 +540,16 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None:
self.worker = worker_class(**kwargs)
assert self.worker is not None
- def execute_method(self, method: str, *args, **kwargs):
+ def execute_method(self, method: Union[str, bytes], *args, **kwargs):
try:
target = self if self.worker is None else self.worker
- executor = getattr(target, method)
- return executor(*args, **kwargs)
+ return run_method(target, method, args, kwargs)
except Exception as e:
# if the driver worker also execute methods,
# exceptions in the rest worker may cause deadlock in rpc like ray
# see https://github.com/vllm-project/vllm/issues/3455
# print the error and inform the user to solve the error
- msg = (f"Error executing method {method}. "
+ msg = (f"Error executing method {method!r}. "
"This might cause deadlock in distributed execution.")
logger.exception(msg)
raise e
From 7eea2df029ce283799151d5f4fce8c1e7a939c2a Mon Sep 17 00:00:00 2001
From: Konrad Zawora
Date: Fri, 17 Jan 2025 14:00:22 +0100
Subject: [PATCH 59/67] [CI] Cleanup run_tests.sh logs (#700)
---
.jenkins/lm-eval-harness/run-tests.sh | 1 +
.jenkins/lm-eval-harness/test_lm_eval_correctness.py | 10 +++++++---
2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/.jenkins/lm-eval-harness/run-tests.sh b/.jenkins/lm-eval-harness/run-tests.sh
index 2370388ebc66d..179b75e1137f7 100644
--- a/.jenkins/lm-eval-harness/run-tests.sh
+++ b/.jenkins/lm-eval-harness/run-tests.sh
@@ -42,6 +42,7 @@ do
export LM_EVAL_TP_SIZE=$TP_SIZE
export PT_HPU_ENABLE_LAZY_COLLECTIVES=true
export VLLM_SKIP_WARMUP=true
+ export TQDM_BAR_FORMAT="{desc}: {percentage:3.0f}% {bar:10} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}]"
RANDOM_SUFFIX=$(tr -dc A-Za-z0-9
Date: Fri, 17 Jan 2025 15:51:17 +0200
Subject: [PATCH 60/67] fix TP crashes
---
vllm/worker/hpu_worker.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index 2af3349641776..20fe0017a1010 100644
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -222,7 +222,6 @@ def execute_model(
self,
execute_model_req: Optional[ExecuteModelRequest] = None,
) -> Optional[List[SamplerOutput]]:
- assert execute_model_req is not None
# VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501
# VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501
# VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501
From 2e53e75fed27f105facacd09a948d8f2fe6f257f Mon Sep 17 00:00:00 2001
From: Konrad Zawora
Date: Fri, 17 Jan 2025 16:41:26 +0200
Subject: [PATCH 61/67] make mypy happy
---
vllm/worker/hpu_worker.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index 20fe0017a1010..14403c3d59f77 100644
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -235,7 +235,8 @@ def execute_model(
'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0'
log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS',
'0') != '0' or log_cpu_fallbacks_all
- if log_graph_compilation or log_cpu_fallbacks:
+ if log_graph_compilation or log_cpu_fallbacks \
+ and execute_model_req is not None:
from habana_frameworks.torch.hpu.metrics import metric_localcontext
seq_group_metadata_list = execute_model_req.seq_group_metadata_list
is_prompt = any([
From 21f5fb2b0d72ec558746745b9a39b6cd31389593 Mon Sep 17 00:00:00 2001
From: Konrad Zawora
Date: Fri, 17 Jan 2025 16:42:13 +0200
Subject: [PATCH 62/67] =?UTF-8?q?=C2=BFwhat=20the=20heck=20is=20incquark?=
=?UTF-8?q?=3F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
vllm/model_executor/layers/quantization/__init__.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index 66bf8d232d10f..c2387638e360d 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -26,7 +26,7 @@
"experts_int8",
"neuron_quant",
"ipex",
- "inc"
+ "inc",
"quark"
]
From f1e911d8cce3603e3fa2a45eba7b3e4a847c5da8 Mon Sep 17 00:00:00 2001
From: Konrad Zawora
Date: Fri, 17 Jan 2025 16:45:04 +0200
Subject: [PATCH 63/67] i forgot brackets again
---
vllm/worker/hpu_worker.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index 14403c3d59f77..a83039054fc78 100644
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -235,7 +235,7 @@ def execute_model(
'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0'
log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS',
'0') != '0' or log_cpu_fallbacks_all
- if log_graph_compilation or log_cpu_fallbacks \
+ if (log_graph_compilation or log_cpu_fallbacks) \
and execute_model_req is not None:
from habana_frameworks.torch.hpu.metrics import metric_localcontext
seq_group_metadata_list = execute_model_req.seq_group_metadata_list
From b10992b8c0b5e4b0d1a6b91a31a47de42cd5ef14 Mon Sep 17 00:00:00 2001
From: Sanju C Sudhakaran
Date: Mon, 20 Jan 2025 12:46:51 +0530
Subject: [PATCH 64/67] Fix LoRA tests (#696)
This PR updates `tests/lora/utils.py` based on the latest rebase.
---
tests/lora/test_lora_hpu.py | 6 +++---
tests/lora/utils.py | 9 ++++-----
2 files changed, 7 insertions(+), 8 deletions(-)
diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py
index bfdf1228b1b55..f3a3d42dffb7c 100644
--- a/tests/lora/test_lora_hpu.py
+++ b/tests/lora/test_lora_hpu.py
@@ -41,7 +41,7 @@ def createLoraMask(indices, batch_size, seq_len, max_loras, max_lora_rank,
@pytest.mark.parametrize("rank", RANKS)
@pytest.mark.parametrize("dtype", DTYPES)
def test_apply_lora(m, n, k, rank, dtype) -> None:
- manager = DummyLoRAManager()
+ manager = DummyLoRAManager(device="hpu")
module_name = "module"
weight = torch.rand([m, n], device="hpu", dtype=dtype)
@@ -111,7 +111,7 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None:
if m // 2 not in TENSOR_SIZES:
pytest.skip("m//2 must be in TENSOR_SIZES")
- manager = DummyLoRAManager()
+ manager = DummyLoRAManager(device="hpu")
module_name = "module"
weight = torch.rand([m // 2, n], device="hpu", dtype=dtype)
@@ -183,7 +183,7 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None:
@pytest.mark.parametrize("rank", RANKS)
@pytest.mark.parametrize("dtype", DTYPES)
def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None:
- manager = DummyLoRAManager()
+ manager = DummyLoRAManager(device="hpu")
module_name = "module"
weight_q = torch.empty(qkv[0], n, device="hpu", dtype=dtype)
diff --git a/tests/lora/utils.py b/tests/lora/utils.py
index 8096288777271..a35aa47f1b4b8 100644
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
@@ -3,7 +3,6 @@
import torch
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
-from vllm.utils import get_device
class DummyLoRAManager:
@@ -32,10 +31,10 @@ def init_random_lora(
lora_alpha=1,
lora_a=torch.rand([weight.shape[1], rank],
dtype=weight.dtype,
- device=get_device()),
+ device=self._device),
lora_b=torch.rand([rank, weight.shape[0]],
dtype=weight.dtype,
- device=get_device()),
+ device=self._device),
)
if generate_embeddings_tensor:
lora.embeddings_tensor = torch.rand(
@@ -61,8 +60,8 @@ def init_lora(
module_name,
rank=rank,
lora_alpha=1,
- lora_a=torch.rand([input_dim, rank], device=get_device()),
- lora_b=torch.rand([rank, output_dim], device=get_device()),
+ lora_a=torch.rand([input_dim, rank], device=self._device),
+ lora_b=torch.rand([rank, output_dim], device=self._device),
embeddings_tensor=embeddings_tensor,
)
self.set_module_lora(module_name, lora)
From 12526462c66ad7b8571fa01b6c3f91d2b057a7bf Mon Sep 17 00:00:00 2001
From: Mohit Deopujari
Date: Mon, 20 Jan 2025 01:20:43 -0800
Subject: [PATCH 65/67] Updating README_GAUDI in habana_main (#690)
1. This PR updates the habana_main README_GAUDI to the Technical
Writer-reviewed version as seen in v1.19.0.
(habana_main README_GAUDI and v1.19.0 README_GAUDI had diverged.)
2. It also fixes broken URLs caused by the recent restructuring of the
upstream vllm examples folder.
3. Adds notes in the examples folder for new users and redirects them to
the Gaudi-specific examples in README_GAUDI.md.
---
README.md | 2 +
README_GAUDI.md | 262 +++++++++++++++++++++++------------
examples/README.md | 4 +
examples/other/fp8/README.md | 3 +
4 files changed, 183 insertions(+), 88 deletions(-)
create mode 100644 examples/README.md
diff --git a/README.md b/README.md
index 2c274467e134a..80d36e4cccc0c 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,8 @@ Easy, fast, and cheap LLM serving for everyone
---
+> [!NOTE]
+> For Intel Gaudi-specific setup instructions and examples, please refer to the [Intel® Gaudi® README](https://github.com/HabanaAI/vllm-fork/blob/habana_main/README_GAUDI.md). For Jupyter notebook based quickstart tutorials, refer to [Getting Started with vLLM](https://github.com/HabanaAI/Gaudi-tutorials/blob/main/PyTorch/Getting_Started_with_vLLM/Getting_Started_with_vLLM.ipynb) and [Understanding vLLM on Gaudi](https://github.com/HabanaAI/Gaudi-tutorials/blob/main/PyTorch/Understanding_vLLM_on_Gaudi/Understanding_vLLM_on_Gaudi.ipynb).
The first vLLM meetup in 2025 is happening on January 22nd, Wednesday, with Google Cloud in San Francisco! We will talk about vLLM's performant V1 architecture, Q1 roadmap, Google Cloud's innovation around vLLM: networking, Cloud Run, Vertex, and TPU! [Register Now](https://lu.ma/zep56hui)
diff --git a/README_GAUDI.md b/README_GAUDI.md
index 49d032469de6b..74d742e815df5 100644
--- a/README_GAUDI.md
+++ b/README_GAUDI.md
@@ -1,32 +1,39 @@
# vLLM with Intel® Gaudi® AI Accelerators
-This README provides instructions on running vLLM with Intel Gaudi devices.
+This README provides instructions on how to run vLLM with Intel Gaudi devices.
# Requirements and Installation
-Please follow the instructions provided in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) to set up the execution environment. To achieve the best performance, please follow the methods outlined in the [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html).
+Please follow the instructions provided in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) to set up the execution environment.
+To achieve the best performance, please follow the methods outlined in the
+[Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html).
## Requirements
-- OS: Ubuntu 22.04 LTS
-- Python: 3.10
+- Ubuntu 22.04 LTS OS
+- Python 3.10
- Intel Gaudi accelerator
-- Intel Gaudi software version 1.18.0
+- Intel Gaudi software version 1.19.0 and above
+
+## Quick Start Using Dockerfile
+Set up the container with the latest release of the Gaudi Software Suite using the Dockerfile:
-## Quick start using Dockerfile
```
$ docker build -f Dockerfile.hpu -t vllm-hpu-env .
$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
```
> [!TIP]
-> If you're facing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install optional packages" section of [Install Driver and Software](https://docs.habana.ai/en/latest/Installation_Guide/Driver_Installation.html#install-driver-and-software) and "Configure Container Runtime" section of [Docker Installation] (https://docs.habana.ai/en/latest/Installation_Guide/Installation_Methods/Docker_Installation.html#configure-container-runtime). Make sure you have ``habanalabs-container-runtime`` package installed and that ``habana`` container runtime is registered.
+> If you are facing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Optional Packages" section
+ of [Install Driver and Software](https://docs.habana.ai/en/latest/Installation_Guide/Driver_Installation.html#install-driver-and-software) and "Configure Container
+ Runtime" section of [Docker Installation](https://docs.habana.ai/en/latest/Installation_Guide/Installation_Methods/Docker_Installation.html#configure-container-runtime).
+ Make sure you have ``habanalabs-container-runtime`` package installed and that ``habana`` container runtime is registered.
-## Build from source
+## Build from Source
-### Environment verification
-To verify that the Intel Gaudi software was correctly installed, run:
+### Environment Verification
+To verify that the Intel Gaudi software was correctly installed, run the following:
```{.console}
$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
@@ -39,18 +46,36 @@ Refer to [System Verification and Final Tests](https://docs.habana.ai/en/latest/
### Run Docker Image
-It is highly recommended to use the latest Docker image from Intel Gaudi vault. Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) for more details.
+It is highly recommended to use the latest Docker image from Intel Gaudi vault.
+Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) for more details.
-Use the following commands to run a Docker image:
+Use the following commands to run a Docker image. Make sure to update the versions below as listed in the [Support Matrix](https://docs.habana.ai/en/latest/Support_Matrix/Support_Matrix.html):
```{.console}
-$ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
-$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+$ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
+$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
```
-### Build and Install vLLM-fork
+### Build and Install vLLM
+
+Currently, there are multiple ways to install vLLM with Intel® Gaudi®. Pick **one** of the following options:
-Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following:
+#### 1. Build and Install the stable version
+
+vLLM releases are performed periodically to align with Intel® Gaudi® software releases. The stable version is released with a tag and supports fully validated features and performance optimizations from Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork). To install the stable release from [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following:
+
+```{.console}
+$ git clone https://github.com/HabanaAI/vllm-fork.git
+$ cd vllm-fork
+$ git checkout v0.6.4.post2+Gaudi-1.19.0
+$ pip install -r requirements-hpu.txt
+$ python setup.py develop
+```
+
+#### 2. Build and Install the latest from vLLM-fork
+
+Currently, the latest features and performance optimizations are being developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and periodically upstreamed to the vLLM main repository.
+To install the latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following:
```{.console}
$ git clone https://github.com/HabanaAI/vllm-fork.git
@@ -60,22 +85,38 @@ $ pip install -r requirements-hpu.txt
$ python setup.py develop
```
-# Supported Features
+#### 3. Build and Install from vLLM main source
-- [Offline batched inference](https://github.com/HabanaAI/vllm-fork/blob/habana_main/docs/source/getting_started/quickstart.rst#offline-batched-inference)
-- Online inference via [OpenAI-Compatible Server](https://github.com/HabanaAI/vllm-fork/blob/habana_main/docs/source/getting_started/quickstart.rst#openai-compatible-server)
-- HPU autodetection - no need to manually select device within vLLM
-- Paged KV cache with algorithms enabled for Intel Gaudi accelerators
-- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, prefill attention, Root Mean Square Layer Normalization, Rotary Positional Encoding
-- Tensor parallelism support for multi-card inference
-- Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) for accelerating low-batch latency and throughput
-- Attention with Linear Biases (ALiBi)
-- INC quantization
+If you prefer to build and install directly from the main vLLM source, to which we periodically upstream new features, run the following:
+
+```{.console}
+$ git clone https://github.com/vllm-project/vllm.git
+$ cd vllm
+$ pip install -r requirements-hpu.txt
+$ python setup.py develop
+```
+
+# Supported Features
+| **Feature** | **Description** | **References** |
+|--- |--- |--- |
+| Offline batched inference | Offline inference using LLM class from vLLM Python API | [Quickstart](https://docs.vllm.ai/en/stable/getting_started/quickstart.html#offline-batched-inference)
[Example](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference.html) |
+| Online inference via OpenAI-Compatible Server | Online inference using HTTP server that implements OpenAI Chat and Completions API | [Documentation](https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html)
[Example](https://docs.vllm.ai/en/stable/getting_started/examples/openai_chat_completion_client.html) |
+| HPU autodetection | HPU users do not need to specify the target platform; it is detected automatically upon vLLM startup. | N/A |
+| Paged KV cache with algorithms enabled for Intel Gaudi accelerators | vLLM HPU backend contains custom Paged Attention and cache operator implementations optimized for Gaudi devices. | N/A |
+| Custom Intel Gaudi operator implementations | vLLM HPU backend provides optimized implementations of operators such as prefill attention, Root Mean Square Layer Normalization, Rotary Positional Encoding. | N/A |
+| Tensor parallel inference (single-node multi-HPU) | vLLM HPU backend supports multi-HPU inference across a single node using tensor parallelism with Ray and HCCL. | [Documentation](https://docs.vllm.ai/en/stable/serving/distributed_serving.html)
[Example](https://docs.ray.io/en/latest/serve/tutorials/vllm-example.html)
[HCCL reference](https://docs.habana.ai/en/latest/API_Reference_Guides/HCCL_APIs/index.html) |
+| Inference with HPU Graphs | vLLM HPU backend uses HPU Graphs by default for optimal performance. When HPU Graphs are enabled, execution graphs will be recorded ahead of time, to be later replayed during inference, significantly reducing host overheads. | [Documentation](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html)
[vLLM HPU backend execution modes](https://docs.vllm.ai/en/stable/getting_started/gaudi-installation.html#execution-modes)
[Optimization guide](https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html#hpu-graph-capture) |
+| Inference with torch.compile (experimental) | vLLM HPU backend experimentally supports inference with torch.compile. | [vLLM HPU backend execution modes](https://docs.vllm.ai/en/stable/getting_started/gaudi-installation.html#execution-modes) |
+| Attention with Linear Biases (ALiBi) | vLLM HPU backend supports models utilizing Attention with Linear Biases (ALiBi) such as mpt-7b. | [vLLM supported models](https://docs.vllm.ai/en/latest/models/supported_models.html) |
+| INC quantization | vLLM HPU backend supports FP8 model and KV cache quantization and calibration with Intel Neural Compressor (INC). | [Documentation](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html) |
+| LoRA/MultiLoRA support | vLLM HPU backend includes support for LoRA and MultiLoRA on supported models. | [Documentation](https://docs.vllm.ai/en/stable/models/lora.html)
[Example](https://docs.vllm.ai/en/stable/getting_started/examples/multilora_inference.html)
[vLLM supported models](https://docs.vllm.ai/en/latest/models/supported_models.html) |
+| Multi-step scheduling support | vLLM HPU backend includes multi-step scheduling support for host overhead reduction, configurable via the standard `--num-scheduler-steps` parameter. | [Feature RFC](https://github.com/vllm-project/vllm/issues/6854) |
+| Automatic prefix caching (experimental) | vLLM HPU backend includes automatic prefix caching (APC) support for more efficient prefills, configurable by standard `--enable-prefix-caching` parameter. | [Documentation](https://docs.vllm.ai/en/stable/automatic_prefix_caching/apc.html)
[Details](https://docs.vllm.ai/en/stable/automatic_prefix_caching/details.html) |
+| Speculative decoding (experimental) | vLLM HPU backend includes experimental speculative decoding support for improving inter-token latency in some scenarios, configurable via the standard `--speculative_model` and `--num_speculative_tokens` parameters. | [Documentation](https://docs.vllm.ai/en/stable/models/spec_decode.html)
[Example](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference_mlpspeculator.html) |
# Unsupported Features
- Beam search
-- LoRA adapters
- AWQ quantization
- Prefill chunking (mixed-batch inferencing)
@@ -100,28 +141,33 @@ The following configurations have been validated to be function with Gaudi2 devi
# Performance Tuning
-## Execution modes
+## Execution Modes
Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag.
-| `PT_HPU_LAZY_MODE` | `enforce_eager` | execution mode |
+| `PT_HPU_LAZY_MODE` | `enforce_eager` | Execution Mode |
| ------------------ | --------------- | ------------------ |
| 0 | 0 | torch.compile |
| 0 | 1 | PyTorch eager mode |
| 1 | 0 | HPU Graphs |
| 1 | 1 | PyTorch lazy mode |
-> [!WARNING]
-> In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode.
+> [!WARNING]
+> All modes using PT_HPU_LAZY_MODE=0 are experimental and should only be used for validating functional correctness. To achieve the best performance, use HPU Graphs or PyTorch Lazy Mode. Performance improvements are planned for future releases.
-## Bucketing mechanism
+## Bucketing Mechanism
-Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - `batch_size` and `sequence_length`.
+Intel Gaudi accelerators perform best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime)
+generates optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be highly dependent on input and output tensor shapes, requiring graph recompilation
+when encountering tensors with different shapes within the same topology. While these binaries efficiently utilize Gaudi, the compilation process itself can introduce noticeable overhead in end-to-end execution.
+In dynamic inference serving scenarios, it is important to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently, this is achieved by
+"bucketing" the model's forward pass across two dimensions: `batch_size` and `sequence_length`.
> [!NOTE]
-> Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase.
+> Bucketing helps significantly reduce the number of required graphs, but it does not handle graph compilation or device code generation. These tasks are performed during the warmup and HPUGraph capture phase.
-Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup:
+Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters
+can be observed in logs during vLLM startup:
```{.}
INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
@@ -130,9 +176,11 @@ INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, ma
INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
```
-`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling - `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
+`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, the interval between `min` and `step`
+has special handling: `min` is multiplied by consecutive powers of two until `step` is reached. We call this the ramp-up phase, and it is used to handle lower batch sizes with minimal wastage,
+while allowing larger padding on larger batch sizes.
-Example (with ramp-up)
+**Example with ramp-up**
```{.}
min = 2, step = 32, max = 64
@@ -141,7 +189,7 @@ min = 2, step = 32, max = 64
=> buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64)
```
-Example (without ramp-up)
+**Example without ramp-up**
```{.}
min = 128, step = 128, max = 512
@@ -150,19 +198,28 @@ min = 128, step = 128, max = 512
=> buckets = ramp_up + stable => (128, 256, 384, 512)
```
-In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket.
+In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor
+shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket.
-> [!WARNING]
-> If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario.
+> [!WARNING]
+> If a request exceeds the maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency.
+ The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such a scenario.
-As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket.
+For example, if a request with 3 sequences, each having a maximum sequence length of 412, is sent to an idle vLLM server, it will be padded and executed as a `(4, 512)` prefill bucket. This is because the `batch_size`
+(number of sequences) will be padded to 4 (the nearest batch size dimension higher than 3), and the maximum sequence length will be padded to 512 (the nearest sequence length dimension higher than 412). After the
+prefill stage, it will be executed as a `(4, 512)` decode bucket and will remain in this bucket until either the batch dimension changes (e.g., due to a request being completed), in which case it will become
+a `(2, 512)` bucket, or the context length increases beyond 512 tokens, at which point it will become a `(4, 640)` bucket.
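A minimal sketch of the padding rule, assuming precomputed bucket lists that match the logged configuration above; `next_bucket` is a hypothetical helper, not part of the vLLM API.

```python
def next_bucket(value: int, buckets: list) -> int:
    """Return the smallest bucket that is >= value (illustrative only).
    If the value exceeds the largest bucket, it is processed without padding."""
    candidates = [b for b in sorted(buckets) if b >= value]
    return candidates[0] if candidates else value

batch_buckets = [1, 2, 4]                     # bs: min=1, step=32, max_warmup=4
seq_buckets = [128 * i for i in range(1, 9)]  # seq: 128, 256, ..., 1024

# A request with 3 sequences and a maximum sequence length of 412:
padded_bs = next_bucket(3, batch_buckets)     # -> 4
padded_seq = next_bucket(412, seq_buckets)    # -> 512
print(f"prefill bucket: ({padded_bs}, {padded_seq})")  # (4, 512)
```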
> [!NOTE]
-> Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
+> Bucketing is transparent to the user – padding in the sequence length dimension is never returned, and padding in the batch dimension does not create new requests.
## Warmup
-Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup:
+Warmup is an optional but highly recommended step that occurs before the vLLM server starts listening. It executes a forward pass for each bucket using dummy data. The goal is to pre-compile all graphs
+and avoid any graph compilation overhead within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup.
+
+This example uses the same buckets as those in the Bucketing Mechanism section. Each output line corresponds to the execution of a single bucket. When a bucket is executed for the first time, its graph
+is compiled and can be reused later, avoiding further graph compilations.
```{.}
INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB
@@ -178,28 +235,48 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size
INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
```
-This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
-> [!TIP]
-> Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment.
+> [!TIP]
+> Compiling all the buckets may take some time and can be disabled by setting the `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do this, you may encounter graph compilations
> when executing a given bucket for the first time. Disabling warmup is fine for development, but it is highly recommended to enable it in deployment.
+
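For development runs, the variable mentioned in the tip above can also be set from Python before the engine is created. This is only a sketch; the model identifier is a placeholder.

```python
import os

# Development only: skip warmup to shorten startup, at the cost of graph
# compilations the first time each bucket is executed.
os.environ["VLLM_SKIP_WARMUP"] = "true"

from vllm import LLM

llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")  # placeholder model
print(llm.generate(["Hello, Gaudi!"])[0].outputs[0].text)
```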
+## HPU Graph Capture
-## HPU Graph capture
+[HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled,
+execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which
+needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management.
-[HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management.
+When HPU Graphs are used, they share the common memory pool ("usable memory") with the KV cache, as determined by the `gpu_memory_utilization` flag (default value is `0.9`). Before the KV cache is allocated,
+the model weights are loaded onto the device, and a forward pass of the model is executed on dummy data to estimate memory usage. Only after that, the `gpu_memory_utilization` flag is applied. At its default value,
+it marks 90% of the free device memory at that point as usable. Next, the KV cache is allocated, the model is warmed up, and HPU Graphs are captured. The `VLLM_GRAPH_RESERVED_MEM` environment variable defines
+the ratio of memory reserved for HPU Graph capture. With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of the usable memory will be reserved for graph capture (referred to as "usable graph memory"),
+and the remaining 90% will be used for the KV cache. The environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default
+(`VLLM_GRAPH_PROMPT_RATIO=0.3`), 30% of the usable graph memory is reserved for prefill graphs and 70% for decode graphs. A lower value corresponds to less usable graph memory reserved for the prefill stage. For example, setting `VLLM_GRAPH_PROMPT_RATIO=0.2`
+reserves 20% of usable graph memory for prefill graphs, while 80% is allocated for decode graphs.
-When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by `gpu_memory_utilization` flag (`0.9` by default). Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage. Only after that, `gpu_memory_utilization` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable. Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of memory reserved for HPU Graphs capture. With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache. Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages have equal memory constraints. Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs.
+> [!NOTE]
+> `gpu_memory_utilization` does not represent the absolute memory usage across the HPU. Instead, it specifies the memory margin after loading the model and running a profile. For example, if a device has 100 GiB of
> total memory and 50 GiB of free memory after loading the model weights and executing the profiling run, the default value of `gpu_memory_utilization` will mark 90% of the 50 GiB as usable, leaving 5 GiB as a margin,
> regardless of the total device memory.
-> [!NOTE]
-> `gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory.
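The memory budgeting described above can be followed step by step with a short sketch. The numbers below are illustrative, not measured values.

```python
# Illustrative walk-through of the memory budgeting described above.
free_after_profiling_gib = 50.0   # free device memory after weights load + profiling run

gpu_memory_utilization = 0.9      # vLLM default
graph_reserved_mem = 0.1          # VLLM_GRAPH_RESERVED_MEM default
graph_prompt_ratio = 0.3          # VLLM_GRAPH_PROMPT_RATIO default

usable = free_after_profiling_gib * gpu_memory_utilization   # 45.0 GiB usable, 5.0 GiB margin
graph_mem = usable * graph_reserved_mem                       # 4.5 GiB usable graph memory
kv_cache_mem = usable - graph_mem                             # 40.5 GiB for KV cache
prompt_graph_mem = graph_mem * graph_prompt_ratio             # 1.35 GiB for prefill graphs
decode_graph_mem = graph_mem - prompt_graph_mem               # 3.15 GiB for decode graphs

print(f"usable={usable:.2f} GiB, kv_cache={kv_cache_mem:.2f} GiB, "
      f"prompt_graphs={prompt_graph_mem:.2f} GiB, decode_graphs={decode_graph_mem:.2f} GiB")
```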
+You can also configure the strategy for capturing HPU Graphs separately for the prompt and decode stages. The strategy affects the order in which graphs are captured (see the sorting sketch below). Two strategies are implemented:
-User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented: - `max_bs` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode - `min_tokens` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), default strategy for prompt
 - `max_bs` - The graph capture queue is sorted in descending order by batch size. Buckets with equal batch sizes are sorted by sequence length in ascending order
   (e.g., `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1, 256)`), which is the default strategy for decode.
 - `min_tokens` - The graph capture queue is sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), which is the default strategy for prompt.
-When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in `min_tokens` strategy.
+When a large number of requests are pending, the vLLM scheduler attempts to fill the maximum batch size for decoding as quickly as possible. Once a request is finished, the decode batch size decreases.
+When this happens, vLLM attempts to schedule a prefill iteration for requests in the waiting queue to restore the decode batch size to its previous state. In a fully loaded scenario, the decode
+batch size is often at its maximum, making large-batch HPU graphs critical to capture, as indicated by the `max_bs` strategy. Conversely, prefill iterations will typically be executed with very low
+batch sizes (1-4), as reflected in the `min_tokens` strategy.
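The two orderings can be sketched as plain Python sorts over `(batch_size, seq_len)` buckets. This is illustrative only, not the internal implementation.

```python
# Buckets are (batch_size, seq_len) pairs.
buckets = [(1, 128), (1, 256), (32, 128), (32, 256), (64, 128), (64, 256)]

# max_bs: descending batch size, ties broken by ascending sequence length (decode default).
max_bs_order = sorted(buckets, key=lambda b: (-b[0], b[1]))
# -> [(64, 128), (64, 256), (32, 128), (32, 256), (1, 128), (1, 256)]

# min_tokens: ascending number of processed tokens, batch_size * seq_len (prompt default).
min_tokens_order = sorted(buckets, key=lambda b: b[0] * b[1])
# -> [(1, 128), (1, 256), (32, 128), (32, 256), (64, 128), (64, 256)]  (ties keep input order)

print(max_bs_order)
print(min_tokens_order)
```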
-> [!NOTE]
-> `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below.
+> [!NOTE]
+> `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on the memory allocated for graphs in each stage (prefill and decode). vLLM first attempts to use the entire usable prefill graph memory
> (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs. It will then attempt to do the same for decode graphs and the usable decode graph memory pool. If one stage is fully
> captured and there is unused memory remaining in the usable graph memory pool, vLLM will attempt to capture more graphs for the other stage, until no more HPU Graphs can be captured without exceeding
> the reserved memory pool. The behavior of this mechanism is illustrated in the example below.
-Each described step is logged by vLLM server, as follows (negative values correspond to memory being released):
+Each step outlined is logged by the vLLM server, with negative values indicating memory release:
```{.}
INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
@@ -235,28 +312,30 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi
## Recommended vLLM Parameters
-- We recommend running inference on Gaudi 2 with `block_size` of 128 for BF16 data type. Using default values (16, 32) might lead to sub-optimal performance due to Matrix Multiplication Engine under-utilization (see [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)).
-- For max throughput on Llama 7B, we recommend running with batch size of 128 or 256 and max context length of 2048 with HPU Graphs enabled. If you encounter out-of-memory issues, see troubleshooting section.
+- It is recommended to run inference on Gaudi 2 with `block_size` of 128 for BF16 data type. Using the default values (16, 32) may result in suboptimal performance due to underutilization of the Matrix
+ Multiplication Engine (see [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)).
+- To achieve maximum throughput on Llama 7B, it is recommended to use a batch size of 128 or 256 and a maximum context length of 2048 with HPU Graphs enabled. If you experience out-of-memory issues,
+ please refer to the Troubleshooting section below.
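A minimal offline sketch applying these recommendations. The model identifier is a placeholder, and the keyword arguments map to standard vLLM engine arguments.

```python
from vllm import LLM, SamplingParams

# Sketch only: recommended Gaudi 2 settings for a Llama 7B class model in BF16.
llm = LLM(
    model="meta-llama/Llama-2-7b-hf",  # placeholder model id
    dtype="bfloat16",
    block_size=128,       # recommended over the 16/32 defaults on Gaudi 2
    max_num_seqs=256,     # batch size of 128 or 256 for maximum throughput
    max_model_len=2048,   # maximum context length
)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```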
-## Environment variables
+## Environment Variables
-**Diagnostic and profiling knobs:**
+**Diagnostic and Profiling Knobs:**
-- `VLLM_PROFILER_ENABLED`: if `true`, high level profiler will be enabled. Resulting JSON traces can be viewed in [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). Disabled by default.
-- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: if `true`, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside `PT_HPU_METRICS_GC_DETAILS=1`. Disabled by default.
-- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true`, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default.
-- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true`, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default.
-- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default.
-- `VLLM_REGIONAL_COMPILATION`: if `false`, turn off regional compilation (when using torch.compile execution mode).
+- `VLLM_PROFILER_ENABLED`: if `true` - enables high level profiler. Resulting JSON traces can be viewed at [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). Disabled by default.
+- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: if `true` - logs graph compilations for each vLLM engine step, but only if any compilation occurs. It is highly recommended to use this in conjunction with `PT_HPU_METRICS_GC_DETAILS=1`.
  Disabled by default.
+- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true` - logs graph compilations for every vLLM engine step, even if no compilation occurs. Disabled by default.
+- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true` - logs CPU fallbacks for each vLLM engine step, but only if any fallback occurs. Disabled by default.
+- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true` - logs CPU fallbacks for each vLLM engine step, even if no fallbacks occur. Disabled by default.
-**Performance tuning knobs:**
+**Performance Tuning Knobs:**
-- `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by default
-- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for HPUGraph capture, `0.1` by default
-- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory dedicated for prompt graphs, `0.3` by default
-- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt graph capture, `min_tokens` or `max_bs`, `min_tokens` by default
-- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode graph capture, `min_tokens` or `max_bs`, `max_bs` by default
-- `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism
+- `VLLM_SKIP_WARMUP`: if `true` - warmup is skipped. `false` by default.
+- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for HPUGraph capture, `0.1` by default.
+- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory dedicated for prompt graphs, `0.3` by default.
+- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt graph capture, `min_tokens` or `max_bs`, `min_tokens` by default.
+- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode graph capture, `min_tokens` or `max_bs`, `max_bs` by default.
+- `VLLM_{phase}_{dim}_BUCKET_{param}` - a collection of 12 environment variables that configure the ranges of the bucketing mechanism (an example of setting these variables is sketched at the end of this section).
- `{phase}` is either `PROMPT` or `DECODE`
- `{dim}` is either `BS`, `SEQ` or `BLOCK`
- `{param}` is either `MIN`, `STEP` or `MAX`
@@ -278,38 +357,45 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi
- block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size`
- block size step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size`
- block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)`
-- `VLLM_HANDLE_TOPK_DUPLICATES`: if ``true``, will handle duplicates that are outside of top-k, ``false`` by default
-- `VLLM_CONFIG_HIDDEN_LAYERS`: configure how many hidden layers to run in a HPUGraph for model splitting among hidden layers when TP is 1. The default is 1. It helps with throughput improvement under inter-token latency limitation for some models.
+- `VLLM_HANDLE_TOPK_DUPLICATES`: if `true` - handles duplicates that are outside of top-k. `false` by default.
+- `VLLM_CONFIG_HIDDEN_LAYERS` - configures how many hidden layers to run in an HPUGraph for model splitting among hidden layers when TP is 1. The default is 1.
  It helps improve throughput by reducing inter-token latency limitations in some models.
Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution:
-- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is default
-- `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs
+- `PT_HPU_LAZY_MODE`: if `0`, the PyTorch Eager backend for Gaudi is used; if `1`, the PyTorch Lazy backend for Gaudi is used. `1` is the default.
+- `PT_HPU_ENABLE_LAZY_COLLECTIVES` must be set to `true` for tensor parallel inference with HPU Graphs.
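As an example of the bucketing variables above, the decode bucket ranges can be overridden from Python before the engine is constructed. The values below are illustrative, not tuned recommendations.

```python
import os

# Widen the decode bucket ranges before creating the engine (illustrative values).
os.environ["VLLM_DECODE_BS_BUCKET_MIN"] = "1"
os.environ["VLLM_DECODE_BS_BUCKET_STEP"] = "32"
os.environ["VLLM_DECODE_BS_BUCKET_MAX"] = "128"
os.environ["VLLM_DECODE_BLOCK_BUCKET_MIN"] = "128"
os.environ["VLLM_DECODE_BLOCK_BUCKET_STEP"] = "128"
os.environ["VLLM_DECODE_BLOCK_BUCKET_MAX"] = "4096"

from vllm import LLM

llm = LLM(model="meta-llama/Llama-2-7b-hf")  # placeholder model id
```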
-# Quantization, FP8 inference and model calibration process
+# Quantization, FP8 Inference and Model Calibration Process
> [!NOTE]
-> Measurement files are required to run quantized models with vLLM on Gaudi accelerators. The FP8 model calibration procedure is described in the [vllm-hpu-extention](https://github.com/HabanaAI/vllm-hpu-extension/tree/main/calibration/README.md) package.
+> Measurement files are required to run quantized models with vLLM on Gaudi accelerators. The FP8 model calibration procedure is described
> in the [vllm-hpu-extension](https://github.com/HabanaAI/vllm-hpu-extension/tree/main/calibration/README.md) package.
-Once you've completed the model calibration process and collected the measurements, you can run FP8 inference with vLLM using the following command:
+Once you have completed the model calibration process and collected the measurements, you can run FP8 inference with vLLM using the following command:
```bash
export QUANT_CONFIG=/path/to/quant/config/inc/meta-llama-3.1-405b-instruct/maxabs_measure_g3.json
vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --weights-load-device cpu --tensor_parallel_size 8
```
-`QUANT_CONFIG` is an environment variable that points to the measurement or quantization configuration file. The measurement configuration file is used during the calibration procedure to collect measurements for a given model. The quantization configuration is used during inference.
+`QUANT_CONFIG` is an environment variable that points to the measurement or quantization configuration file. The measurement configuration file is used during the calibration procedure to collect
+measurements for a given model. The quantization configuration is used during inference.
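The offline equivalent of the serving command above can be sketched as follows, assuming the fork accepts the same options as keyword arguments to the `LLM` constructor. The path and model identifier are taken from the example above.

```python
import os

# Points to the measurement or quantization configuration file, as in the serve command.
os.environ["QUANT_CONFIG"] = (
    "/path/to/quant/config/inc/meta-llama-3.1-405b-instruct/maxabs_measure_g3.json")

from vllm import LLM

llm = LLM(
    model="meta-llama/Llama-3.1-405B-Instruct",
    quantization="inc",
    kv_cache_dtype="fp8_inc",
    weights_load_device="cpu",
    tensor_parallel_size=8,
)
```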
> [!TIP]
-> If you are just prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which can take a long time. However, we do not recommend disabling this feature in production environments, as it causes a dramatic performance drop.
+> If you are prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which is time-consuming.
> However, disabling this feature in production environments is not recommended, as it can lead to a significant performance decrease.
> [!TIP]
-> When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this problem, you can use these two environment variables:
+> When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this, set the following environment variables:
> - `VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes.
> - `VLLM_RPC_TIMEOUT` - to adjust the RPC protocol timeout used by the OpenAI-compatible API. This value is in milliseconds, e.g., 600000 equals 10 minutes.
-# Troubleshooting: Tweaking HPU Graphs
+# Troubleshooting
-If you experience device out-of-memory issues or want to attempt inference at higher batch sizes, try tweaking HPU Graphs by following the below:
+If you encounter device out-of-memory issues or want to attempt inference with higher batch sizes, try tweaking HPU Graphs as follows:
-- Tweak `gpu_memory_utilization` knob. It will decrease the allocation of KV cache, leaving some headroom for capturing graphs with larger batch size. By default `gpu_memory_utilization` is set to 0.9. It attempts to allocate ~90% of HBM left for KV cache after short profiling run. Note that decreasing reduces the number of KV cache blocks you have available, and therefore reduces the effective maximum number of tokens you can handle at a given time.
-- If this method is not efficient, you can disable `HPUGraph` completely. With HPU Graphs disabled, you are trading latency and throughput at lower batches for potentially higher throughput on higher batches. You can do that by adding `--enforce-eager` flag to server (for online inference), or by passing `enforce_eager=True` argument to LLM constructor (for offline inference).
+- Tweak the `gpu_memory_utilization` knob. This decreases the allocation of the KV cache, leaving some headroom for capturing graphs with a larger batch size. By default, `gpu_memory_utilization` is set to 0.9.
  It attempts to allocate ~90% of the HBM left for KV cache after a short profiling run. Note that this reduces the number of KV cache blocks available, and therefore reduces the effective maximum
  number of tokens handled at a given time.
+- If this method is not efficient, you can disable `HPUGraph` completely. With HPU Graphs disabled, you are trading latency and throughput at lower batches for potentially higher throughput on higher batches.
  You can do that by adding the `--enforce-eager` flag to the server (for online inference), or by passing the `enforce_eager=True` argument to the `LLM` constructor (for offline inference).
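Both options can be sketched for offline inference as follows; the model identifier is a placeholder and the value of `gpu_memory_utilization` is illustrative.

```python
from vllm import LLM

# Option 1: keep HPU Graphs but leave more headroom than the 0.9 default.
llm = LLM(model="meta-llama/Llama-2-7b-hf", gpu_memory_utilization=0.7)

# Option 2: disable HPU Graphs entirely (trading low-batch latency and throughput
# for potentially higher throughput at larger batches).
# llm = LLM(model="meta-llama/Llama-2-7b-hf", enforce_eager=True)
```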
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000000000..5dfb4bb1c32f9
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,4 @@
+# Please refer to the Intel® Gaudi® README for Gaudi examples
+
+> [!NOTE]
+> Not all examples in this folder are Intel Gaudi specific; some come from the original vllm-project repository from which this fork was created. For examples such as offline inference with OpenAI on Intel Gaudi, please refer to the [Intel® Gaudi® README supported features table](https://github.com/HabanaAI/vllm-fork/blob/v0.6.4.post2%2BGaudi-1.19.0/README_GAUDI.md#supported-features) and the [quantization section](https://github.com/HabanaAI/vllm-fork/blob/v0.6.4.post2%2BGaudi-1.19.0/README_GAUDI.md#quantization-fp8-inference-and-model-calibration-process) for FP8 examples.
\ No newline at end of file
diff --git a/examples/other/fp8/README.md b/examples/other/fp8/README.md
index 4e8031d954113..ee09f09dfdcd2 100644
--- a/examples/other/fp8/README.md
+++ b/examples/other/fp8/README.md
@@ -1,3 +1,6 @@
+> [!NOTE]
+> The examples in this folder are **NOT** Intel Gaudi specific and come from the original vllm-project repository from which this fork was created. For FP8 examples on Intel Gaudi, please refer to the Intel® Gaudi® README.
+
# FP8 KV Cache
This utility extracts the KV cache scaling factors from a quantized HF (Hugging Face) model. The extracted scaling factors are saved to a JSON file, which can later be used by vLLM (variable-length language model) during runtime. This tool is particularly useful when the KV cache data type is FP8 and is intended for use on ROCm (AMD GPU) platforms.
From 293bd87145b8a32068f8fc3887795438e38ce2f9 Mon Sep 17 00:00:00 2001
From: Iryna Boiko
Date: Mon, 20 Jan 2025 12:10:30 +0200
Subject: [PATCH 66/67] Change vllm-hpu-extension revision to ae726d4
---
requirements-hpu.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/requirements-hpu.txt b/requirements-hpu.txt
index 084d2a736521a..ed6e57548edd5 100644
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
@@ -8,4 +8,4 @@ pandas
tabulate
setuptools>=61
setuptools-scm>=8
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@d05c0a7
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@ae726d4
From fedf7066fead31f81a84c71b4aeda2871df93a0e Mon Sep 17 00:00:00 2001
From: Michal Adamczyk
Date: Mon, 20 Jan 2025 15:47:15 +0100
Subject: [PATCH 67/67] Capabilities overhaul (#692)
Supporting PR for https://github.com/HabanaAI/vllm-hpu-extension/pull/76
---
requirements-hpu.txt | 2 +-
vllm/attention/backends/hpu_attn.py | 17 ++++-------------
vllm/model_executor/layers/layernorm.py | 3 ++-
vllm/worker/hpu_model_runner.py | 6 ++++--
4 files changed, 11 insertions(+), 17 deletions(-)
diff --git a/requirements-hpu.txt b/requirements-hpu.txt
index ed6e57548edd5..ab4b823784bdc 100644
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
@@ -8,4 +8,4 @@ pandas
tabulate
setuptools>=61
setuptools-scm>=8
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@ae726d4
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@01090a8
diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py
index cfb6ecd57181c..1893f98d8af77 100644
--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
@@ -2,12 +2,13 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
###############################################################################
-import os
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Type
import torch
+import vllm_hpu_extension.kernels as kernels
import vllm_hpu_extension.ops as ops
+from vllm_hpu_extension.flags import enabled_flags
from vllm_hpu_extension.utils import (Matmul, ModuleFusedSDPA, Softmax,
VLLMKVCache)
@@ -17,18 +18,9 @@
from vllm.attention.ops.hpu_paged_attn import (HPUPagedAttention,
HPUPagedAttentionMetadata)
from vllm.logger import init_logger
-from vllm.utils import is_fake_hpu
logger = init_logger(__name__)
-HPUFusedSDPA = None
-try:
- from habana_frameworks.torch.hpex.kernels import FusedSDPA
- HPUFusedSDPA = FusedSDPA
-except ImportError:
- logger.warning("Could not import HPU FusedSDPA kernel. "
- "vLLM will use native implementation.")
-
class HPUAttentionBackend(AttentionBackend):
@@ -139,6 +131,7 @@ def __init__(
self.block2batch_matmul = Matmul()
self.k_cache = VLLMKVCache()
self.v_cache = VLLMKVCache()
+ HPUFusedSDPA = kernels.fsdpa()
self.fused_scaled_dot_product_attention = None if HPUFusedSDPA is None \
else ModuleFusedSDPA(HPUFusedSDPA)
self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
@@ -151,9 +144,7 @@ def __init__(
assert self.num_heads % self.num_kv_heads == 0
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
- self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA',
- '1').lower() in ['1', 'true'] \
- and not is_fake_hpu()
+ self.prefill_use_fusedsdpa = "fsdpa" in enabled_flags()
if self.prefill_use_fusedsdpa:
assert alibi_slopes is None, \
'Prefill with FusedSDPA not supported with alibi slopes!'
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index 43ea4eb5a4d1a..58e82884df7a1 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -104,7 +104,8 @@ def forward_hpu(
x: torch.Tensor,
residual: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
- from vllm_hpu_extension.ops import HPUFusedRMSNorm
+ from vllm_hpu_extension.kernels import rms_norm
+ HPUFusedRMSNorm = rms_norm()
if HPUFusedRMSNorm is None:
return self.forward_native(x, residual)
if residual is not None:
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 88699369116a3..fd0c40e803f54 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -19,7 +19,9 @@
import habana_frameworks.torch as htorch
import habana_frameworks.torch.internal.bridge_config as bc
import torch
+import vllm_hpu_extension.environment as environment
from vllm_hpu_extension.bucketing import HPUBucketingContext
+from vllm_hpu_extension.flags import enabled_flags
from vllm_hpu_extension.ops import LoraMask as LoraMask
from vllm_hpu_extension.ops import batch2block, block2batch
from vllm_hpu_extension.profiler import (HabanaHighLevelProfiler,
@@ -214,8 +216,7 @@ class HpuModelAdapter:
def __init__(self, model, vllm_config, layer_names):
self.model = model
- self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA',
- '0').lower() in ['1', 'true']
+ self.prefill_use_fusedsdpa = "fsdpa" in enabled_flags()
self.recompute_cos_sin = os.getenv('VLLM_COS_SIN_RECOMPUTE',
'false').lower() in ['1', 'true']
self.vllm_config = vllm_config
@@ -597,6 +598,7 @@ def __init__(
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
):
ModelRunnerBase.__init__(self, vllm_config=vllm_config)
+ environment.set_model_config(self.model_config)
self.is_driver_worker = is_driver_worker
self.return_hidden_states = return_hidden_states