diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 55835d945b00c..87f45cf695c8d 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -333,7 +333,7 @@ Text Embedding * - :code:`MistralModel` - Mistral-based - :code:`intfloat/e5-mistral-7b-instruct`, etc. - - + - ✅︎ - ✅︎ .. important:: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 38a31f420cec9..69b6595b0f9e0 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -628,7 +628,7 @@ def permute(w: torch.Tensor, n_heads: int): return name, loaded_weight -class LlamaEmbeddingModel(nn.Module, SupportsPP): +class LlamaEmbeddingModel(nn.Module, SupportsLoRA, SupportsPP): """ A model that uses Llama with additional embedding functionalities. @@ -639,6 +639,19 @@ class LlamaEmbeddingModel(nn.Module, SupportsPP): model: An instance of LlamaModel used for forward operations. _pooler: An instance of Pooler used for pooling operations. """ + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens" + ] + embedding_modules = { + "embed_tokens": "input_embeddings", + } + embedding_padding_modules = [] def __init__( self, @@ -680,3 +693,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): def load_kv_cache_scales(self, quantization_param_path: str) -> None: self.model.load_kv_cache_scales(quantization_param_path) + + # LRUCacheWorkerLoRAManager instantiation requires model config. + @property + def config(self): + return self.model.config