diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 87f45cf695c8d..55835d945b00c 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -333,7 +333,7 @@ Text Embedding
   * - :code:`MistralModel`
     - Mistral-based
     - :code:`intfloat/e5-mistral-7b-instruct`, etc.
-    - ✅︎
+    -
     - ✅︎
 
 .. important::
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 69b6595b0f9e0..38a31f420cec9 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -628,7 +628,7 @@ def permute(w: torch.Tensor, n_heads: int):
         return name, loaded_weight
 
 
-class LlamaEmbeddingModel(nn.Module, SupportsLoRA, SupportsPP):
+class LlamaEmbeddingModel(nn.Module, SupportsPP):
     """
     A model that uses Llama with additional embedding functionalities.
 
@@ -639,19 +639,6 @@ class LlamaEmbeddingModel(nn.Module, SupportsLoRA, SupportsPP):
     model: An instance of LlamaModel used for forward operations.
     _pooler: An instance of Pooler used for pooling operations.
     """
-    packed_modules_mapping = {
-        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-        "gate_up_proj": ["gate_proj", "up_proj"]
-    }
-
-    # LoRA specific attributes
-    supported_lora_modules = [
-        "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens"
-    ]
-    embedding_modules = {
-        "embed_tokens": "input_embeddings",
-    }
-    embedding_padding_modules = []
 
     def __init__(
         self,
@@ -693,8 +680,3 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
 
     def load_kv_cache_scales(self, quantization_param_path: str) -> None:
         self.model.load_kv_cache_scales(quantization_param_path)
-
-    # LRUCacheWorkerLoRAManager instantiation requires model config.
-    @property
-    def config(self):
-        return self.model.config