From 75f64d8b94d012ea37dddde1058ce17e55001a4a Mon Sep 17 00:00:00 2001
From: Cody Yu
Date: Fri, 12 Jul 2024 14:33:33 -0700
Subject: [PATCH] [Bugfix] Fix illegal memory access in FP8 MoE kernel (#6382)

---
 vllm/model_executor/layers/fused_moe/fused_moe.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index a29622b7d25c3..3c62008fbfcc1 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -492,12 +492,14 @@ def fused_experts(hidden_states: torch.Tensor,
         if tokens_in_chunk == 0:
             break
 
-        if tokens_in_chunk < CHUNK_SIZE:
-            # will only happen in the last chunk
+        if tokens_in_chunk < CHUNK_SIZE and chunk > 0:
+            # Adjust the intermediate cache size and config for the last
+            # chunk. Note that in most cases we only have one chunk
+            # so the cache size and config are already set correctly and
+            # do not need to be adjusted.
             intermediate_cache1 = intermediate_cache1[:tokens_in_chunk]
             intermediate_cache2 = intermediate_cache2[:tokens_in_chunk]
             intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]
-            # reload config to get better performance on the last chunk
             config = get_config_func(tokens_in_chunk)
 
         curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx]
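
For context beyond the diff hunk, below is a minimal, self-contained sketch of the chunked loop in fused_experts that this patch touches. It is not the real kernel (the actual code launches Triton grouped-GEMM kernels); the helper names pick_config and fused_experts_sketch are illustrative stand-ins, and only CHUNK_SIZE, chunk, tokens_in_chunk, and the intermediate caches mirror the source. It demonstrates the point made in the new code comment: the caches and config are sized up front for min(num_tokens, CHUNK_SIZE), so when the whole batch fits in a single chunk (chunk == 0) they are already correct, and the patched condition skips the redundant re-adjustment.

import torch

# Toy stand-in for vLLM's CHUNK_SIZE (the real constant is far larger);
# shrunk so the chunking behavior is visible on tiny inputs.
CHUNK_SIZE = 4


def pick_config(m: int) -> dict:
    # Hypothetical stand-in for get_config_func(tokens): the real code
    # looks up a tuned Triton launch config keyed on the token count.
    return {"BLOCK_SIZE_M": min(m, CHUNK_SIZE)}


def fused_experts_sketch(num_tokens: int, hidden: int = 8) -> None:
    # The intermediate caches and the config are sized once, up front,
    # for the largest chunk that will occur: min(num_tokens, CHUNK_SIZE).
    m = min(num_tokens, CHUNK_SIZE)
    intermediate_cache1 = torch.empty(m, hidden)
    intermediate_cache2 = torch.empty(m, hidden)
    intermediate_cache3 = torch.empty(m, hidden)
    config = pick_config(m)

    for chunk in range((num_tokens + CHUNK_SIZE - 1) // CHUNK_SIZE):
        begin_chunk_idx = chunk * CHUNK_SIZE
        end_chunk_idx = min(begin_chunk_idx + CHUNK_SIZE, num_tokens)
        tokens_in_chunk = end_chunk_idx - begin_chunk_idx

        if tokens_in_chunk == 0:
            break

        # The patched condition: only shrink the caches and refetch the
        # config for a short *trailing* chunk (chunk > 0). When the whole
        # batch fits in one chunk, everything above is already sized for
        # tokens_in_chunk, so no adjustment is needed.
        if tokens_in_chunk < CHUNK_SIZE and chunk > 0:
            intermediate_cache1 = intermediate_cache1[:tokens_in_chunk]
            intermediate_cache2 = intermediate_cache2[:tokens_in_chunk]
            intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]
            config = pick_config(tokens_in_chunk)

        print(f"chunk={chunk} tokens={tokens_in_chunk} "
              f"cache1={tuple(intermediate_cache1.shape)} config={config}")


fused_experts_sketch(num_tokens=10)  # two full chunks, then a short one
fused_experts_sketch(num_tokens=3)   # single short chunk: branch skipped

With num_tokens=3, the pre-patch condition (lacking chunk > 0) would have taken the branch on the only chunk; the commit ties that redundant adjustment to the illegal memory access observed in the FP8 MoE path.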