From 35585798f61289486ea787d9c6015fbe6ea73881 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson
Date: Mon, 20 Jan 2025 16:08:25 +0000
Subject: [PATCH] specify missing FA versions

Signed-off-by: Lucas Wilkinson
---
 CMakeLists.txt                        | 2 +-
 vllm/attention/backends/flash_attn.py | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 93d1d18a6e360..33c5e707f7947 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -551,7 +551,7 @@ else()
   FetchContent_Declare(
         vllm-flash-attn
         GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-        GIT_TAG 36c76f3e481c6bbdd147e4bbe83e942d71d519e6
+        GIT_TAG bdd49bf2c0bc1b7dffe2893f60c4c2e122474e0c
         GIT_PROGRESS TRUE
         # Don't share the vllm-flash-attn build between build types
         BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 1a0a0b6a0f2d7..8fbe96060f8b5 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -647,7 +647,7 @@ def __init__(
             assert VLLM_FLASH_ATTN_VERSION in [2, 3]
             self.fa_version = VLLM_FLASH_ATTN_VERSION

-        is_fa_version_supported(self.fa_version)
+        assert is_fa_version_supported(self.fa_version)

     def forward(
         self,
@@ -768,6 +768,7 @@ def forward(
                     alibi_slopes=alibi_slopes,
                     softcap=logits_soft_cap,
                     out=prefill_output,
+                    fa_version=self.fa_version,
                 )
             else:
                 # prefix-enabled attention
@@ -790,6 +791,7 @@ def forward(
                     block_table=prefill_meta.block_tables,
                     softcap=logits_soft_cap,
                     out=prefill_output,
+                    fa_version=self.fa_version,
                 )

         if decode_meta := attn_metadata.decode_metadata:
@@ -818,6 +820,7 @@ def forward(
                     softcap=logits_soft_cap,
                     block_table=decode_meta.block_tables,
                     out=decode_output,
+                    fa_version=self.fa_version,
                 )
             else:
                 # Use flash_attn_with_kvcache for normal decoding.
@@ -838,6 +841,7 @@ def forward(
                     alibi_slopes=alibi_slopes,
                     softcap=logits_soft_cap,
                     out=decode_output.unsqueeze(1),
+                    fa_version=self.fa_version,
                 )

         return output
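
Note on the pattern the patch applies: the backend pins one FlashAttention major version at init (via VLLM_FLASH_ATTN_VERSION), hard-asserts that it is supported instead of calling is_fa_version_supported and ignoring the result, and then passes fa_version=self.fa_version explicitly to every flash_attn_varlen_func / flash_attn_with_kvcache call so prefill and decode paths dispatch the same kernel version. The sketch below illustrates that pattern only; resolve_fa_version, run_attention, and supported_fa_versions are hypothetical stand-ins for illustration, not vLLM APIs.

import os
from typing import Optional


def resolve_fa_version(requested: Optional[int],
                       supported_fa_versions=(2, 3)) -> int:
    """Pick a FlashAttention major version and fail fast if unsupported."""
    # Default to the newest supported version when nothing is requested.
    fa_version = requested if requested is not None else max(supported_fa_versions)
    assert fa_version in (2, 3)
    # Mirrors the patch: a hard assert rather than a silent capability check.
    assert fa_version in supported_fa_versions, (
        f"FlashAttention v{fa_version} is not supported on this build/GPU")
    return fa_version


def run_attention(query, key, value, *, fa_version: int):
    """Hypothetical stand-in for the attention kernels: every call site
    receives the pinned version explicitly, as the patch does with
    fa_version=self.fa_version."""
    print(f"dispatching FA{fa_version} kernel")


if __name__ == "__main__":
    # The environment override lets users force FA2 even where FA3 exists.
    requested = os.environ.get("VLLM_FLASH_ATTN_VERSION")
    version = resolve_fa_version(int(requested) if requested else None)
    run_attention(None, None, None, fa_version=version)

Asserting once at init turns an unsupported version into an immediate startup failure instead of an inconsistent or silent fallback inside the forward pass.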