From 6e2c36d803c3741e7f058d07a6a3de44dcdcfe71 Mon Sep 17 00:00:00 2001
From: yuwenzho
Date: Mon, 26 Aug 2024 08:38:44 +0000
Subject: [PATCH 1/5] get rid of graph breaks for torch.compile mode

Signed-off-by: yuwenzho
---
 vllm/hpu/cache_ops.py                     | 6 ++----
 vllm/model_executor/models/gpt_bigcode.py | 5 +++--
 vllm/model_executor/models/llama.py       | 5 +++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py
index 98f109accea06..5676787b7b2fe 100644
--- a/vllm/hpu/cache_ops.py
+++ b/vllm/hpu/cache_ops.py
@@ -30,8 +30,7 @@ def reshape_and_cache(key,
     # lots of padding, or are doing warmup.
     # This loop is a workaround for this issue. Please remove it
     # once key_cache.index_put_(indices, offsets), key) works.
-    num_kv_cache_passes = torch.div(num_slots_requested,
-                                    num_slots_available).ceil().int().item()
+    num_kv_cache_passes = -(-num_slots_requested // num_slots_available)
     for i in range(num_kv_cache_passes):
         start_idx = i * num_slots_available
         end_idx = (i + 1) * num_slots_available
@@ -58,8 +57,7 @@ def prepare_to_cache(cache, slot_mapping):
     # lots of padding, or are doing warmup.
     # This loop is a workaround for this issue. Please remove it
     # once key_cache.index_put_(indices, offsets), key) works.
-    num_kv_cache_passes = torch.div(num_slots_requested,
-                                    num_slots_available).ceil().int().item()
+    num_kv_cache_passes = -(-num_slots_requested // num_slots_available)
 
     return num_kv_cache_passes, num_slots_available, indices, offsets
 
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index 3ae3c8c8f712c..b06ea89aeded6 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -44,6 +44,7 @@
 
 from .interfaces import SupportsLoRA
 
+is_hpu = current_platform.is_hpu()
 
 class GPTBigCodeAttention(nn.Module):
 
@@ -225,13 +226,13 @@ def forward(
         position_embeds = self.wpe(position_ids)
         hidden_states = inputs_embeds + position_embeds
 
-        if current_platform.is_hpu():
+        if is_hpu:
             import habana_frameworks.torch as htorch
             htorch.core.mark_step()
         for i in range(len(self.h)):
             layer = self.h[i]
             hidden_states = layer(hidden_states, kv_caches[i], attn_metadata)
-            if current_platform.is_hpu():
+            if is_hpu:
                 htorch.core.mark_step()
 
         hidden_states = self.ln_f(hidden_states)
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 676a51ce67f96..9bb3456d3f0fb 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -55,6 +55,7 @@
 from .interfaces import SupportsLoRA
 from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers
 
+is_hpu = current_platform.is_hpu()
 
 class LlamaMLP(nn.Module):
 
@@ -318,7 +319,7 @@ def forward(
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
 
-        if current_platform.is_hpu():
+        if is_hpu:
             import habana_frameworks.torch as htorch
             htorch.core.mark_step()
         for i in range(self.start_layer, self.end_layer):
@@ -330,7 +331,7 @@ def forward(
                 attn_metadata,
                 residual,
             )
-            if current_platform.is_hpu():
+            if is_hpu:
                 htorch.core.mark_step()
 
         if not get_pp_group().is_last_rank:

From fd1e617e0274c88326e7e8cfeba5b2c639f3e25f Mon Sep 17 00:00:00 2001
From: yuwenzho
Date: Tue, 27 Aug 2024 01:57:37 +0000
Subject: [PATCH 2/5] update code

Signed-off-by: yuwenzho
---
 vllm/hpu/cache_ops.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py
index 5676787b7b2fe..8296441eac530 100644
--- a/vllm/hpu/cache_ops.py
+++ b/vllm/hpu/cache_ops.py
@@ -7,7 +7,7 @@
 
 import habana_frameworks.torch as htorch
 import torch
-
+import math
 
 def reshape_and_cache(key,
                       value,
@@ -30,7 +30,7 @@ def reshape_and_cache(key,
     # lots of padding, or are doing warmup.
     # This loop is a workaround for this issue. Please remove it
    # once key_cache.index_put_(indices, offsets), key) works.
-    num_kv_cache_passes = -(-num_slots_requested // num_slots_available)
+    num_kv_cache_passes = int(math.ceil(num_slots_requested / num_slots_available))
     for i in range(num_kv_cache_passes):
         start_idx = i * num_slots_available
         end_idx = (i + 1) * num_slots_available
@@ -57,7 +57,7 @@ def prepare_to_cache(cache, slot_mapping):
     # lots of padding, or are doing warmup.
     # This loop is a workaround for this issue. Please remove it
     # once key_cache.index_put_(indices, offsets), key) works.
-    num_kv_cache_passes = -(-num_slots_requested // num_slots_available)
+    num_kv_cache_passes = int(math.ceil(num_slots_requested / num_slots_available))
 
     return num_kv_cache_passes, num_slots_available, indices, offsets
 

From 45f0cc932df233be509f8e3fcb9ea824b4de330e Mon Sep 17 00:00:00 2001
From: yuwenzho
Date: Thu, 29 Aug 2024 04:12:21 +0300
Subject: [PATCH 3/5] fix line too long

Signed-off-by: yuwenzho
---
 vllm/hpu/cache_ops.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py
index 8296441eac530..2f8f387eb6529 100644
--- a/vllm/hpu/cache_ops.py
+++ b/vllm/hpu/cache_ops.py
@@ -30,7 +30,8 @@ def reshape_and_cache(key,
     # lots of padding, or are doing warmup.
     # This loop is a workaround for this issue. Please remove it
     # once key_cache.index_put_(indices, offsets), key) works.
-    num_kv_cache_passes = int(math.ceil(num_slots_requested / num_slots_available))
+    num_kv_cache_passes = int(
+        math.ceil(num_slots_requested / num_slots_available))
     for i in range(num_kv_cache_passes):
         start_idx = i * num_slots_available
         end_idx = (i + 1) * num_slots_available
@@ -57,7 +58,8 @@ def prepare_to_cache(cache, slot_mapping):
     # lots of padding, or are doing warmup.
     # This loop is a workaround for this issue. Please remove it
     # once key_cache.index_put_(indices, offsets), key) works.
-    num_kv_cache_passes = int(math.ceil(num_slots_requested / num_slots_available))
+    num_kv_cache_passes = int(
+        math.ceil(num_slots_requested / num_slots_available))
 
     return num_kv_cache_passes, num_slots_available, indices, offsets
 

From 941963fddbf25f58f9c0bce08d45f840e7d635cb Mon Sep 17 00:00:00 2001
From: yuwenzho
Date: Thu, 29 Aug 2024 08:32:58 +0300
Subject: [PATCH 4/5] update code

Signed-off-by: yuwenzho
---
 vllm/hpu/cache_ops.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py
index 2f8f387eb6529..facc01751dc5c 100644
--- a/vllm/hpu/cache_ops.py
+++ b/vllm/hpu/cache_ops.py
@@ -30,8 +30,7 @@ def reshape_and_cache(key,
     # lots of padding, or are doing warmup.
     # This loop is a workaround for this issue. Please remove it
     # once key_cache.index_put_(indices, offsets), key) works.
-    num_kv_cache_passes = int(
-        math.ceil(num_slots_requested / num_slots_available))
+    num_kv_cache_passes = math.ceil(num_slots_requested / num_slots_available)
     for i in range(num_kv_cache_passes):
         start_idx = i * num_slots_available
         end_idx = (i + 1) * num_slots_available
@@ -58,8 +57,7 @@ def prepare_to_cache(cache, slot_mapping):
     # lots of padding, or are doing warmup.
     # This loop is a workaround for this issue. Please remove it
     # once key_cache.index_put_(indices, offsets), key) works.
-    num_kv_cache_passes = int(
-        math.ceil(num_slots_requested / num_slots_available))
+    num_kv_cache_passes = math.ceil(num_slots_requested / num_slots_available)
 
     return num_kv_cache_passes, num_slots_available, indices, offsets
 

From 2a72462b1fea62f69721c35bd623870205c094f9 Mon Sep 17 00:00:00 2001
From: yuwenzho
Date: Tue, 3 Sep 2024 02:20:24 +0000
Subject: [PATCH 5/5] fix format

Signed-off-by: yuwenzho
---
 vllm/hpu/cache_ops.py                     | 4 +++-
 vllm/model_executor/models/gpt_bigcode.py | 1 +
 vllm/model_executor/models/llama.py       | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py
index facc01751dc5c..9042924f68b3d 100644
--- a/vllm/hpu/cache_ops.py
+++ b/vllm/hpu/cache_ops.py
@@ -5,9 +5,11 @@
 # LICENSE file in the root directory of this source tree.
 ###############################################################################
 
+import math
+
 import habana_frameworks.torch as htorch
 import torch
-import math
+
 
 def reshape_and_cache(key,
                       value,
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index b06ea89aeded6..5d4387dbb9f48 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -46,6 +46,7 @@
 
 is_hpu = current_platform.is_hpu()
 
+
 class GPTBigCodeAttention(nn.Module):
 
     def __init__(
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 9bb3456d3f0fb..d809da29e5e69 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -57,6 +57,7 @@
 
 is_hpu = current_platform.is_hpu()
 
+
 class LlamaMLP(nn.Module):
 
     def __init__(
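
Note on the change itself (not part of the patches above): the original `torch.div(...).ceil().int().item()` materializes a tensor and then calls `.item()`, which forces a device-to-host sync that torch.compile treats as a graph break by default, while ceiling division over plain Python ints never enters the traced graph at all. The sketch below uses hypothetical numbers standing in for `num_slots_requested` and `num_slots_available`, assuming, as the function bodies suggest, that both are ordinary Python ints derived from tensor shapes.

    import math

    # Hypothetical stand-ins for slot_mapping.size(0) and
    # num_blocks * block_size from cache_ops.py.
    num_slots_requested = 10_000
    num_slots_available = 4_096

    # Final form (PATCH 4/5): pure Python integer math, so there is no
    # tensor, no .item() sync, and no graph break for torch.compile.
    num_kv_cache_passes = math.ceil(num_slots_requested / num_slots_available)

    # PATCH 1/5's negated floor-division trick computes the same ceiling.
    assert -(-num_slots_requested // num_slots_available) == num_kv_cache_passes
    assert num_kv_cache_passes == 3  # ceil(10000 / 4096)

The later patches in the series only reshape this expression (wrapping in `int()`, splitting for line length, then settling on plain `math.ceil`) without changing the value. Hoisting `current_platform.is_hpu()` into a module-level `is_hpu` flag presumably serves the same goal: the branch condition becomes a constant evaluated once at import time rather than a platform call traced inside every `forward`.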