From cae5a3dc415ea0bad7705eef414897027df8da6b Mon Sep 17 00:00:00 2001
From: Aleksandr Malyshev
Date: Tue, 16 Apr 2024 19:32:22 +0000
Subject: [PATCH] Giving prio to Triton FA over native for Navi3x

---
 vllm/attention/backends/rocm_flash_attn.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 6019d917b4494..9ef1a0c7ce1ca 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -147,10 +147,11 @@ def __init__(
                 f"Head size {head_size} is not supported by PagedAttention. "
                 f"Supported head sizes are: {suppored_head_sizes}.")
 
-        self.use_naive_attn = torch.cuda.get_device_capability()[0] != 9
         # NOTE: Allow for switching between Triton and CK. Defaulting to triton.
         self.use_triton_flash_attn = (os.environ.get(
             "VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1"))
+        self.use_naive_attn = (not self.use_triton_flash_attn) and torch.cuda.get_device_capability()[0] != 9
+
         if self.use_naive_attn:
             # AMD Radeon 7900 series (gfx1100) currently does not support
             # xFormers nor FlashAttention. As a temporary workaround, we use
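
Below is a minimal standalone sketch of how the attention path would be chosen after this patch. The function select_attention_backend and the script structure are illustrative assumptions, not part of the vLLM change; only the environment-variable name, the default, and the device-capability check mirror the diff above.

# Sketch (assumed helper, not vLLM code): replicates the selection logic
# from the patched __init__ for illustration on a ROCm build of PyTorch.
import os

import torch


def select_attention_backend() -> str:
    """Return which attention path the patched code would take."""
    # Triton FA is the default; it can be disabled via the env var.
    use_triton_flash_attn = (os.environ.get(
        "VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1"))

    # gfx9 GPUs (e.g. MI200/MI300) report major capability 9 under ROCm;
    # Navi3x (gfx1100) does not, which previously forced the naive path.
    is_gfx9 = torch.cuda.get_device_capability()[0] == 9

    # After the patch, naive attention is used only when Triton FA is
    # explicitly disabled on a non-gfx9 GPU.
    use_naive_attn = (not use_triton_flash_attn) and not is_gfx9

    if use_naive_attn:
        return "naive"
    return "triton" if use_triton_flash_attn else "ck"


if __name__ == "__main__":
    print(f"Selected attention path: {select_attention_backend()}")

In other words, Navi3x users now get the Triton flash-attention kernel by default and fall back to the naive PyTorch implementation only when VLLM_USE_TRITON_FLASH_ATTN is set to a false value.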