diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py
index 98f109accea06..9042924f68b3d 100644
--- a/vllm/hpu/cache_ops.py
+++ b/vllm/hpu/cache_ops.py
@@ -5,6 +5,8 @@
 # LICENSE file in the root directory of this source tree.
 ###############################################################################
 
+import math
+
 import habana_frameworks.torch as htorch
 import torch
 
@@ -30,8 +32,7 @@ def reshape_and_cache(key,
     # lots of padding, or are doing warmup.
     # This loop is a workaround for this issue. Please remove it
     # once key_cache.index_put_(indices, offsets), key) works.
-    num_kv_cache_passes = torch.div(num_slots_requested,
-                                    num_slots_available).ceil().int().item()
+    num_kv_cache_passes = math.ceil(num_slots_requested / num_slots_available)
     for i in range(num_kv_cache_passes):
         start_idx = i * num_slots_available
         end_idx = (i + 1) * num_slots_available
@@ -58,8 +59,7 @@ def prepare_to_cache(cache, slot_mapping):
     # lots of padding, or are doing warmup.
     # This loop is a workaround for this issue. Please remove it
     # once key_cache.index_put_(indices, offsets), key) works.
-    num_kv_cache_passes = torch.div(num_slots_requested,
-                                    num_slots_available).ceil().int().item()
+    num_kv_cache_passes = math.ceil(num_slots_requested / num_slots_available)
 
     return num_kv_cache_passes, num_slots_available, indices, offsets
 
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index 3ae3c8c8f712c..5d4387dbb9f48 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -44,6 +44,8 @@
 
 from .interfaces import SupportsLoRA
 
+is_hpu = current_platform.is_hpu()
+
 
 class GPTBigCodeAttention(nn.Module):
 
@@ -225,13 +227,13 @@ def forward(
         position_embeds = self.wpe(position_ids)
         hidden_states = inputs_embeds + position_embeds
 
-        if current_platform.is_hpu():
+        if is_hpu:
             import habana_frameworks.torch as htorch
             htorch.core.mark_step()
         for i in range(len(self.h)):
             layer = self.h[i]
             hidden_states = layer(hidden_states, kv_caches[i], attn_metadata)
-            if current_platform.is_hpu():
+            if is_hpu:
                 htorch.core.mark_step()
 
         hidden_states = self.ln_f(hidden_states)
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index d659d0a3f1127..51716b12513d8 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -55,6 +55,8 @@
 from .interfaces import SupportsLoRA
 from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers
 
+is_hpu = current_platform.is_hpu()
+
 
 class LlamaMLP(nn.Module):
 
@@ -318,7 +320,7 @@ def forward(
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
 
-        if current_platform.is_hpu():
+        if is_hpu:
            import habana_frameworks.torch as htorch
            htorch.core.mark_step()
         for i in range(self.start_layer, self.end_layer):
@@ -330,7 +332,7 @@
                 attn_metadata,
                 residual,
             )
-        if current_platform.is_hpu():
+        if is_hpu:
             htorch.core.mark_step()
 
         if not get_pp_group().is_last_rank:
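
Note (not part of the patch): a minimal, self-contained sketch of the two host-overhead patterns the diff applies. The helper names `_detect_hpu` and `num_kv_cache_passes` and the sample sizes are illustrative only, not vLLM API, and the sketch assumes a non-HPU host. The pass count is computed with `math.ceil` on plain Python ints, so no intermediate tensor has to be built just to read an integer back out of it, and the platform check is evaluated once at import time instead of on every `forward()` call.

import math

# Hypothetical stand-in for vllm.platforms.current_platform.is_hpu();
# evaluated once at import time so per-call code can reuse the cached result.
def _detect_hpu() -> bool:
    return False  # assume a non-HPU host in this sketch

is_hpu = _detect_hpu()

def num_kv_cache_passes(num_slots_requested: int, num_slots_available: int) -> int:
    # Ceiling division on Python ints; plays the same role as the removed
    # torch.div(...).ceil().int().item() chain for the integer sizes used here,
    # without creating a scalar tensor.
    return math.ceil(num_slots_requested / num_slots_available)

if __name__ == "__main__":
    print(num_kv_cache_passes(10000, 4096))  # -> 3 passes
    print("running on HPU:", is_hpu)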