From cae5a3dc415ea0bad7705eef414897027df8da6b Mon Sep 17 00:00:00 2001
From: Aleksandr Malyshev
Date: Tue, 16 Apr 2024 19:32:22 +0000
Subject: [PATCH] Giving prio to Triton FA over native for Navi3x

---
 vllm/attention/backends/rocm_flash_attn.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 6019d917b4494..9ef1a0c7ce1ca 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -147,10 +147,11 @@ def __init__(
                 f"Head size {head_size} is not supported by PagedAttention. "
                 f"Supported head sizes are: {suppored_head_sizes}.")
 
-        self.use_naive_attn = torch.cuda.get_device_capability()[0] != 9
         # NOTE: Allow for switching between Triton and CK. Defaulting to triton.
         self.use_triton_flash_attn = (os.environ.get(
             "VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1"))
+        self.use_naive_attn = (not self.use_triton_flash_attn) and torch.cuda.get_device_capability()[0] != 9
+
         if self.use_naive_attn:
             # AMD Radeon 7900 series (gfx1100) currently does not support
             # xFormers nor FlashAttention. As a temporary workaround, we use
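
Below is a minimal standalone sketch of how the attention path would be chosen after this patch. The function select_attention_backend and the script structure are illustrative assumptions, not part of the vLLM change; only the environment-variable name, the default, and the device-capability check mirror the diff above.

# Sketch (assumed helper, not vLLM code): replicates the selection logic
# from the patched __init__ for illustration on a ROCm build of PyTorch.
import os

import torch


def select_attention_backend() -> str:
    """Return which attention path the patched code would take."""
    # Triton FA is the default; it can be disabled via the env var.
    use_triton_flash_attn = (os.environ.get(
        "VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1"))

    # gfx9 GPUs (e.g. MI200/MI300) report major capability 9 under ROCm;
    # Navi3x (gfx1100) does not, which previously forced the naive path.
    is_gfx9 = torch.cuda.get_device_capability()[0] == 9

    # After the patch, naive attention is used only when Triton FA is
    # explicitly disabled on a non-gfx9 GPU.
    use_naive_attn = (not use_triton_flash_attn) and not is_gfx9

    if use_naive_attn:
        return "naive"
    return "triton" if use_triton_flash_attn else "ck"


if __name__ == "__main__":
    print(f"Selected attention path: {select_attention_backend()}")

In other words, Navi3x users now get the Triton flash-attention kernel by default and fall back to the naive PyTorch implementation only when VLLM_USE_TRITON_FLASH_ATTN is set to a false value.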