diff --git a/vllm/config.py b/vllm/config.py
index dc70e6f579830..29cc6887177ed 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -608,8 +608,9 @@ def _verify_cuda_graph(self) -> None:
                                           self.max_model_len)
 
         MODEL_NOT_SUPPORT_CUDA_GRAPH = ['deepseek_v3', 'mllama']
+        from vllm.platforms import current_platform
         if (self.hf_config.model_type in MODEL_NOT_SUPPORT_CUDA_GRAPH
-                and not self.enforce_eager):
+                and not self.enforce_eager and not current_platform.is_hpu()):
             logger.warning(
                 "CUDA graph is not supported for %s yet, fallback to the eager "
                 "mode.", self.hf_config.model_type)
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 163ec3d6cff69..634e57dafa4de 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -160,16 +160,19 @@ def forward_cuda(
                              topk_ids=topk_ids,
                              inplace=True)
 
-    def forward_hpu(self,
-                    layer: torch.nn.Module,
-                    x: torch.Tensor,
-                    use_grouped_topk: bool,
-                    top_k: int,
-                    router_logits: torch.Tensor,
-                    renormalize: bool,
-                    topk_group: Optional[int] = None,
-                    num_expert_group: Optional[int] = None,
-                    custom_routing_function: Optional[Callable] = None):
+    def forward_hpu(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        **kwargs,
+    ):
         assert not use_grouped_topk, 'use_grouped_topk must be False on HPU'
         assert num_expert_group is None, ('num_expert_group is '
                                           'not supported on HPU')
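
For context on the `forward_hpu` signature change, here is a minimal, self-contained sketch (the `FusedMoEMethodSketch` class and the `scoring_func` keyword are hypothetical illustrations, not vLLM's real API): once a shared call site forwards backend-specific keyword arguments, an HPU implementation without `**kwargs` raises `TypeError`, while the amended signature simply absorbs the extras.

```python
# Hypothetical sketch -- not vLLM's real dispatch code. It only illustrates
# why adding **kwargs to forward_hpu keeps a shared call site working when
# backend-specific keyword arguments are passed through.
from typing import Optional


class FusedMoEMethodSketch:

    def forward_hpu_old(self,
                        x: str,
                        top_k: int,
                        renormalize: bool,
                        topk_group: Optional[int] = None) -> str:
        # Old-style signature: an unexpected keyword raises TypeError.
        return f"HPU MoE over {x!r} with top_k={top_k}"

    def forward_hpu_new(self,
                        x: str,
                        top_k: int,
                        renormalize: bool,
                        topk_group: Optional[int] = None,
                        **kwargs) -> str:
        # Amended signature: extra keywords are absorbed and ignored.
        return f"HPU MoE over {x!r} with top_k={top_k}"


method = FusedMoEMethodSketch()
common_args = dict(x="hidden_states", top_k=2, renormalize=True,
                   scoring_func="softmax")  # hypothetical extra kwarg

print(method.forward_hpu_new(**common_args))  # works: **kwargs absorbs extras
try:
    method.forward_hpu_old(**common_args)
except TypeError as exc:
    print(f"old signature fails: {exc}")
```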