Giving priority to Triton FA over naive attention for Navi3x
Aleksandr Malyshev committed Apr 16, 2024
1 parent 3489ce7 commit cae5a3d
Showing 1 changed file with 2 additions and 1 deletion.
vllm/attention/backends/rocm_flash_attn.py (3 changes: 2 additions & 1 deletion)
@@ -147,10 +147,11 @@ def __init__(
                 f"Head size {head_size} is not supported by PagedAttention. "
                 f"Supported head sizes are: {suppored_head_sizes}.")
 
-        self.use_naive_attn = torch.cuda.get_device_capability()[0] != 9
         # NOTE: Allow for switching between Triton and CK. Defaulting to triton.
         self.use_triton_flash_attn = (os.environ.get(
             "VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1"))
+        self.use_naive_attn = (not self.use_triton_flash_attn) and torch.cuda.get_device_capability()[0] != 9
 
         if self.use_naive_attn:
             # AMD Radeon 7900 series (gfx1100) currently does not support
             # xFormers nor FlashAttention. As a temporary workaround, we use
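For context, a minimal standalone sketch of the backend selection after this commit. It mirrors the committed logic rather than quoting vLLM's class; the helper name select_rocm_attention_backend and the returned strings are illustrative only, not part of vLLM's API.

    import os

    import torch


    def select_rocm_attention_backend() -> str:
        """Sketch of the choice made in the diff above.

        Triton FlashAttention is preferred unless VLLM_USE_TRITON_FLASH_ATTN
        is set to a falsy value; only then does non-gfx9 hardware
        (e.g. Navi3x / gfx1100) fall back to the naive attention path.
        """
        use_triton_flash_attn = (os.environ.get(
            "VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1"))
        use_naive_attn = (not use_triton_flash_attn
                          and torch.cuda.get_device_capability()[0] != 9)

        if use_triton_flash_attn:
            return "triton"  # Triton FA now wins on Navi3x as well
        if use_naive_attn:
            return "naive"   # previous default on Navi3x (gfx1100)
        return "ck"          # CK/flash-attn path on gfx9 hardware

With the default VLLM_USE_TRITON_FLASH_ATTN=True, Navi3x (gfx1100) now takes the Triton FlashAttention path; exporting VLLM_USE_TRITON_FLASH_ATTN=0 restores the previous naive-attention fallback.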

0 comments on commit cae5a3d
