diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index d2e22c273b2be..f8a239048c86b 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -160,7 +160,7 @@ def llama3_1_8b_chess_lora():
 
 
 @pytest.fixture(scope="session")
 def llama3_1_8b_ultravox_chess_lora():
-    # ultravox chess lora is result of transformation of above chess lora for llama
+    # ultravox chess lora is a transformation of the above llama chess lora
     return snapshot_download(repo_id="thedebugger11/ultravox-chess-lora")
diff --git a/tests/lora/test_ultravox.py b/tests/lora/test_ultravox.py
index 1e45564901dbb..f3986c0ba29fc 100644
--- a/tests/lora/test_ultravox.py
+++ b/tests/lora/test_ultravox.py
@@ -1,7 +1,9 @@
 from typing import List, Tuple
 
 from transformers import AutoTokenizer
+
 from vllm.lora.request import LoRARequest
+
 from ..models.utils import check_outputs_equal
 
 ULTRAVOX_MODEL_NAME = "fixie-ai/ultravox-v0_3"
@@ -12,7 +14,7 @@
 
 PROMPT = "Tell me about a silly chess move in 20 words"
 
 
-def _get_prompt(audio_count, question, placeholder, model_name)->str:
+def _get_prompt(audio_count, question, placeholder, model_name) -> str:
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     placeholder = f"{placeholder}\n" * audio_count
 
@@ -36,12 +38,18 @@ def test_ultravox_lora(vllm_runner, llama3_1_8b_chess_lora,
             dtype="bfloat16",
             max_model_len=4096,
     ) as vllm_model:
-        ultravox_outputs: List[Tuple[List[int], str]] = vllm_model.generate_greedy(
-            [_get_prompt(0, PROMPT, VLLM_PLACEHOLDER, ULTRAVOX_MODEL_NAME)],
-            256,
-            lora_request=LoRARequest(str(1), 1,
-                                     llama3_1_8b_ultravox_chess_lora),
-        )
+        ultravox_outputs: List[Tuple[List[int],
+                                     str]] = vllm_model.generate_greedy(
+                                         [
+                                             _get_prompt(
+                                                 0, PROMPT, VLLM_PLACEHOLDER,
+                                                 ULTRAVOX_MODEL_NAME)
+                                         ],
+                                         256,
+                                         lora_request=LoRARequest(
+                                             str(1), 1,
+                                             llama3_1_8b_ultravox_chess_lora),
+                                     )
 
     # run llama with and without lora to compare outputs with above
     with vllm_runner(
@@ -54,15 +62,27 @@ def test_ultravox_lora(vllm_runner, llama3_1_8b_chess_lora,
             dtype="bfloat16",
             max_model_len=4096,
     ) as vllm_model:
-        llama_outputs_no_lora: List[Tuple[List[int], str]] = vllm_model.generate_greedy(
-            [_get_prompt(0, PROMPT, VLLM_PLACEHOLDER, LLMA_MODEL_NAME)],
-            256,
-        )
-        llama_outputs: List[Tuple[List[int], str]] = vllm_model.generate_greedy(
-            [_get_prompt(0, PROMPT, VLLM_PLACEHOLDER, LLMA_MODEL_NAME)],
-            256,
-            lora_request=LoRARequest(str(1), 1, llama3_1_8b_chess_lora),
-        )
+        llama_outputs_no_lora: List[Tuple[List[int],
+                                          str]] = vllm_model.generate_greedy(
+                                              [
+                                                  _get_prompt(
+                                                      0, PROMPT,
+                                                      VLLM_PLACEHOLDER,
+                                                      LLMA_MODEL_NAME)
+                                              ],
+                                              256,
+                                          )
+        llama_outputs: List[Tuple[List[int],
+                                  str]] = vllm_model.generate_greedy(
+                                      [
+                                          _get_prompt(0, PROMPT,
+                                                      VLLM_PLACEHOLDER,
+                                                      LLMA_MODEL_NAME)
+                                      ],
+                                      256,
+                                      lora_request=LoRARequest(
+                                          str(1), 1, llama3_1_8b_chess_lora),
+                                  )
 
     check_outputs_equal(
         outputs_0_lst=ultravox_outputs,
diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py
index a8dc97bb71aa5..ea668d857d8db 100644
--- a/vllm/assets/audio.py
+++ b/vllm/assets/audio.py
@@ -10,19 +10,15 @@
 
 ASSET_DIR = "multimodal_asset"
 
 
-@dataclass
+@dataclass(frozen=True)
 class AudioAsset:
     name: Literal["winning_call", "mary_had_lamb"]
 
-    def __init__(self, audio_path=None):
-        if audio_path is None:
-            audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg",
-                                                s3_prefix=ASSET_DIR)
-        self._audio_path = audio_path
-
     @property
     def audio_and_sample_rate(self) -> Tuple[np.ndarray, int]:
-        y, sr = librosa.load(self._audio_path, sr=None)
+        audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg",
+                                            s3_prefix=ASSET_DIR)
+        y, sr = librosa.load(audio_path, sr=None)
         assert isinstance(sr, int)
         return y, sr
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 53e79bf933be2..64cc213976a31 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -387,8 +387,6 @@ def activate_adapter(
         for module_name, module in self.modules.items():
             module_lora = lora_model.get_lora(module_name)
             if module_lora:
-                logger.debug("Setting LoRA. int id: %d, module: %s",
-                             lora_model.id, module_name)
                 module_lora.optimize()
                 # Bias is not explicitly enabled with the flag enable_lora_bias.
                 bias = module_lora.bias
@@ -409,7 +407,8 @@
 
         if len(missing_modules) > 0:
             logger.warning(
-                "Lora adapter int id %d is activated but is missing base model modules %s",
+                "LoRA adapter int id %d is activated but is missing "
+                "base model modules %s, which could impact output",
                 lora_model.id, missing_modules)
         return True
 
@@ -467,10 +466,6 @@ def _create_lora_modules(self):
         for module_name, module in self.model.named_modules(
                 remove_duplicate=False):
-            logger.debug(
-                "Create lora module if applicable %s",
-                module_name,
-            )
             if isinstance(module, PPMissingLayer):
                 continue
             if not self._match_target_modules(module_name):
@@ -517,15 +512,12 @@ def _create_lora_modules(self):
                 if self.supports_mm and not isinstance(new_module,
                                                        BaseLayerWithLoRA):
                     logger.warning(
-                        "%s module will be ignored because it isn't of type BaseLayerWithLoRA",
+                        "%s module will be ignored because it isn't of type "
+                        "BaseLayerWithLoRA",
                         module_name,
                     )
                     continue
-                logger.debug(
-                    "Going to apply lora on %s module",
-                    module_name,
-                )
                 self.register_module(module_name, new_module)
             self._register_packed_modules(module_name)
         # All lora layers share the same punica_wrapper based on reference.
@@ -541,7 +533,6 @@ def create_dummy_lora(
             rank: int,
             scaling_factor: Optional[float],
             embedding_modules: Optional[Dict[str, str]] = None) -> LoRAModel:
-        logger.debug(f"Creating a dummy lora with id: {lora_id}")
         """Create zero-initialized LoRAModel for warmup."""
         model = LoRAModel(lora_id, rank, {}, scaling_factor)
         for module_name, module in self.model.named_modules():
@@ -654,7 +645,6 @@ def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None:
             if replacement_loras[i]:
                 continue
             replacement_loras[i] = None
-
         lora_model.loras[module_name] = PackedLoRALayerWeights.pack(
             replacement_loras)
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 18734d8fae2b1..2902e6999c2fd 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -467,14 +467,14 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
 
     # LoRA specific attributes
     supported_lora_modules = [
-        "qkv_proj",
-        "o_proj",
-        "gate_up_proj",
-        "down_proj",
-        "embed_tokens",
+        "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens",
+        "lm_head"
     ]
-    embedding_modules = {}
-    embedding_padding_modules = []
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings"
+    }
+    embedding_padding_modules = ["lm_head"]
 
     # BitandBytes specific attributes
     bitsandbytes_stacked_params_mapping = {
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index fa022d7b05849..6b73fcf9c94ae 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -22,6 +22,7 @@
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.loader import DefaultModelLoader
+from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -29,10 +30,9 @@
                                         MultiModalDataItems, ProcessorInputs,
                                         PromptReplacement)
 from vllm.sequence import IntermediateTensors
-from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
 
-from .interfaces import SupportsMultiModal, SupportsPP, SupportsLoRA
+from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
                     init_vllm_registered_model, maybe_prefix,
                     merge_multimodal_embeddings_from_map)
@@ -303,14 +303,16 @@ def forward(
     "audio", get_ultravox_max_audio_tokens)
 @MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor)
 class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
-    # same as llamaforcasuallm (language model) minus embedding and other modules
-    # embedding modules haven't been added as a caution since it could impact text
-    # but not audio
+    # Same as LlamaForCausalLM (the language model) minus the embedding and
+    # other modules. Embedding modules haven't been added as a caution since
+    # they could affect text but not audio.
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"]
     }
 
+    # lm_head is not added for now since it requires a logits_processor,
+    # which Ultravox does not have.
     supported_lora_modules = [
         "qkv_proj", "o_proj", "gate_up_proj", "down_proj"
     ]
@@ -325,9 +327,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.multi_modal_config = multimodal_config
         assert self.multi_modal_config
 
-        #TODO: figure out if these prefixes need tweaking to support LoRA and/or
-        #use LLMWrapper or not like this https://github.com/vllm-project/vllm/pull/7199/files#diff-7b8a4e258637b7c94389c745c449c52137d33cf92957f3e5bcb18a0ee204b21bR807
-
         self.secondary_weights = []
         self.audio_tower = ModifiedWhisperEncoder(config.audio_config)
         if config.audio_model_id is not None:
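For reviewers who want to poke at the new path outside of pytest, below is a minimal offline-inference sketch that mirrors `tests/lora/test_ultravox.py`. It is not part of this diff: the adapter name `"ultravox-chess"`, the greedy `SamplingParams`, and `max_lora_rank=128` are illustrative assumptions; only the model id, the adapter repo, the prompt, `dtype`, and `max_model_len` come from the tests above.

```python
# Sketch only (not part of this PR): exercise the Ultravox LoRA path the same
# way tests/lora/test_ultravox.py does, but through the public offline API.
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

MODEL = "fixie-ai/ultravox-v0_3"

# Same adapter the llama3_1_8b_ultravox_chess_lora fixture downloads.
lora_path = snapshot_download(repo_id="thedebugger11/ultravox-chess-lora")

# Text-only prompt (audio_count=0 in the test), rendered through the model's
# chat template, roughly what _get_prompt does.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
prompt = tokenizer.apply_chat_template(
    [{
        "role": "user",
        "content": "Tell me about a silly chess move in 20 words"
    }],
    tokenize=False,
    add_generation_prompt=True)

llm = LLM(
    model=MODEL,
    enable_lora=True,
    max_lora_rank=128,  # assumption: pick a value >= the adapter's rank
    dtype="bfloat16",
    max_model_len=4096,
)

# Greedy decoding, matching generate_greedy(..., 256) in the test.
outputs = llm.generate(
    prompt,
    SamplingParams(temperature=0.0, max_tokens=256),
    lora_request=LoRARequest("ultravox-chess", 1, lora_path),
)
print(outputs[0].outputs[0].text)
```

The comparison the test performs (the same prompt through the Llama language model with and without the original chess LoRA) can be reproduced the same way by swapping the model id and the adapter path.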
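One behavioural note on the `vllm/assets/audio.py` hunk: with `@dataclass(frozen=True)` and the custom `__init__` gone, constructing an `AudioAsset` no longer downloads anything (and the old `audio_path` override is dropped); the S3 fetch now happens when `audio_and_sample_rate` is accessed. Call sites keep the same shape, as in this sketch:

```python
# Sketch of the call-site behaviour after this diff (not part of the change).
from vllm.assets.audio import AudioAsset

asset = AudioAsset("winning_call")  # frozen dataclass, no network I/O here
audio, sample_rate = asset.audio_and_sample_rate  # fetches the .ogg, then loads it
```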