diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index fd0c40e803f54..7c3666eca50f3 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -305,7 +305,10 @@ def _set_block_mapping(self, metadata, batch_size, device, dtype):
         attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_(
             mask, -math.inf))
 
-        if not is_fake_hpu():
+        if (not is_fake_hpu() and htorch.utils.internal.is_lazy()) \
+                or "compile_one_hot" in enabled_flags():
+            # Use one_hot directly on HPU in lazy mode, or in t.compile mode
+            # when the build is >= 20.0.0.370
             block_mapping = torch.nn.functional.one_hot(metadata.block_groups,
                                                         num_classes=batch_size)
         else:
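
For illustration only (not part of the patch): a minimal, runnable sketch of what the one_hot call above computes. Each entry of block_groups names the sequence that owns a KV-cache block, and one_hot expands it into a block-to-sequence mapping with one row per block and one column per sequence. The tensor values below are made up for the example.

import torch

# Hypothetical example values: four cache blocks owned by three sequences.
block_groups = torch.tensor([0, 0, 1, 2], dtype=torch.long)
batch_size = 3

# Same call as in the hunk above.
block_mapping = torch.nn.functional.one_hot(block_groups,
                                            num_classes=batch_size)
print(block_mapping)
# tensor([[1, 0, 0],
#         [1, 0, 0],
#         [0, 1, 0],
#         [0, 0, 1]])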