From 37eb4fc30cb5ab545ac4059183890b56d1b2f639 Mon Sep 17 00:00:00 2001
From: Dudi Lester <160421192+dudilester@users.noreply.github.com>
Date: Tue, 21 Jan 2025 13:48:11 +0200
Subject: [PATCH 1/2] [SW-216156] Fix mixtral Fused MoE issues after rebase (#708)

---
 vllm/model_executor/layers/fused_moe/layer.py | 23 +++++++++++--------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 163ec3d6cff69..634e57dafa4de 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -160,16 +160,19 @@ def forward_cuda(
                              topk_ids=topk_ids,
                              inplace=True)
 
-    def forward_hpu(self,
-                    layer: torch.nn.Module,
-                    x: torch.Tensor,
-                    use_grouped_topk: bool,
-                    top_k: int,
-                    router_logits: torch.Tensor,
-                    renormalize: bool,
-                    topk_group: Optional[int] = None,
-                    num_expert_group: Optional[int] = None,
-                    custom_routing_function: Optional[Callable] = None):
+    def forward_hpu(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        **kwargs,
+    ):
         assert not use_grouped_topk, 'use_grouped_topk must be False on HPU'
         assert num_expert_group is None, ('num_expert_group is '
                                           'not supported on HPU')

From 1df1c2c6242a8f1cd0d63dbd336dcadd668cd101 Mon Sep 17 00:00:00 2001
From: Jan Kaniecki
Date: Tue, 21 Jan 2025 14:28:45 +0100
Subject: [PATCH 2/2] Disable enforcing eager mode for mllama and deepseek_v3 on hpu (#713)

---
 vllm/config.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/config.py b/vllm/config.py
index dc70e6f579830..29cc6887177ed 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -608,8 +608,9 @@ def _verify_cuda_graph(self) -> None:
                                           self.max_model_len)
 
         MODEL_NOT_SUPPORT_CUDA_GRAPH = ['deepseek_v3', 'mllama']
+        from vllm.platforms import current_platform
         if (self.hf_config.model_type in MODEL_NOT_SUPPORT_CUDA_GRAPH
-                and not self.enforce_eager):
+                and not self.enforce_eager and not current_platform.is_hpu()):
             logger.warning(
                 "CUDA graph is not supported for %s yet, fallback to the eager "
                 "mode.", self.hf_config.model_type)
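
Reviewer note on PATCH 1/2 (illustrative sketch, not part of the applied diff): the new **kwargs parameter lets forward_hpu accept and ignore keyword arguments that the shared FusedMoE dispatch path may have gained after the rebase, instead of raising a TypeError on HPU. The minimal sketch below reproduces that failure mode with a hypothetical extra argument named scoring_func; the name is invented for illustration and is not taken from the patch.

    from typing import Any


    def forward_hpu_fixed_signature(x: Any, top_k: int, renormalize: bool):
        # Pre-patch shape of the signature: only the listed keywords are accepted.
        return x


    def forward_hpu_with_kwargs(x: Any, top_k: int, renormalize: bool, **kwargs):
        # Post-patch shape: unexpected keywords (e.g. a hypothetical
        # scoring_func passed by the shared dispatch path) are absorbed and ignored.
        return x


    call_kwargs = dict(x=1, top_k=2, renormalize=True, scoring_func="softmax")

    forward_hpu_with_kwargs(**call_kwargs)        # accepted, extra key is ignored
    try:
        forward_hpu_fixed_signature(**call_kwargs)
    except TypeError as err:
        print(err)  # unexpected keyword argument 'scoring_func'

PATCH 2/2 follows the same intent of keeping HPU behaviour intact: since the CUDA-graph limitation does not apply on HPU, the fallback to eager mode is skipped there.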