From 35585798f61289486ea787d9c6015fbe6ea73881 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson
Date: Mon, 20 Jan 2025 16:08:25 +0000
Subject: [PATCH] specify missing FA versions

Signed-off-by: Lucas Wilkinson
---
 CMakeLists.txt                        | 2 +-
 vllm/attention/backends/flash_attn.py | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 93d1d18a6e360..33c5e707f7947 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -551,7 +551,7 @@ else()
   FetchContent_Declare(
         vllm-flash-attn
         GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-        GIT_TAG 36c76f3e481c6bbdd147e4bbe83e942d71d519e6
+        GIT_TAG bdd49bf2c0bc1b7dffe2893f60c4c2e122474e0c
         GIT_PROGRESS TRUE
         # Don't share the vllm-flash-attn build between build types
         BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 1a0a0b6a0f2d7..8fbe96060f8b5 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -647,7 +647,7 @@ def __init__(
             assert VLLM_FLASH_ATTN_VERSION in [2, 3]
             self.fa_version = VLLM_FLASH_ATTN_VERSION

-        is_fa_version_supported(self.fa_version)
+        assert is_fa_version_supported(self.fa_version)

     def forward(
         self,
@@ -768,6 +768,7 @@ def forward(
                     alibi_slopes=alibi_slopes,
                     softcap=logits_soft_cap,
                     out=prefill_output,
+                    fa_version=self.fa_version,
                 )
             else:
                 # prefix-enabled attention
@@ -790,6 +791,7 @@ def forward(
                     block_table=prefill_meta.block_tables,
                     softcap=logits_soft_cap,
                     out=prefill_output,
+                    fa_version=self.fa_version,
                 )

         if decode_meta := attn_metadata.decode_metadata:
@@ -818,6 +820,7 @@ def forward(
                     softcap=logits_soft_cap,
                     block_table=decode_meta.block_tables,
                     out=decode_output,
+                    fa_version=self.fa_version,
                 )
             else:
                 # Use flash_attn_with_kvcache for normal decoding.
@@ -838,6 +841,7 @@ def forward(
                     alibi_slopes=alibi_slopes,
                     softcap=logits_soft_cap,
                     out=decode_output.unsqueeze(1),
+                    fa_version=self.fa_version,
                 )

         return output
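
Note on the pattern the patch applies: the backend pins one FlashAttention major version at init (via VLLM_FLASH_ATTN_VERSION), hard-asserts that it is supported instead of calling is_fa_version_supported and ignoring the result, and then passes fa_version=self.fa_version explicitly to every flash_attn_varlen_func / flash_attn_with_kvcache call so prefill and decode paths dispatch the same kernel version. The sketch below illustrates that pattern only; resolve_fa_version, run_attention, and supported_fa_versions are hypothetical stand-ins for illustration, not vLLM APIs.

import os
from typing import Optional


def resolve_fa_version(requested: Optional[int],
                       supported_fa_versions=(2, 3)) -> int:
    """Pick a FlashAttention major version and fail fast if unsupported."""
    # Default to the newest supported version when nothing is requested.
    fa_version = requested if requested is not None else max(supported_fa_versions)
    assert fa_version in (2, 3)
    # Mirrors the patch: a hard assert rather than a silent capability check.
    assert fa_version in supported_fa_versions, (
        f"FlashAttention v{fa_version} is not supported on this build/GPU")
    return fa_version


def run_attention(query, key, value, *, fa_version: int):
    """Hypothetical stand-in for the attention kernels: every call site
    receives the pinned version explicitly, as the patch does with
    fa_version=self.fa_version."""
    print(f"dispatching FA{fa_version} kernel")


if __name__ == "__main__":
    # The environment override lets users force FA2 even where FA3 exists.
    requested = os.environ.get("VLLM_FLASH_ATTN_VERSION")
    version = resolve_fa_version(int(requested) if requested else None)
    run_attention(None, None, None, fa_version=version)

Asserting once at init turns an unsupported version into an immediate startup failure instead of an inconsistent or silent fallback inside the forward pass.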