diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 9444dc43ea97e..1eb749f64d36b 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -321,7 +321,6 @@ steps:
   - tests/models/decoder_only/language
   commands:
     - pytest -v -s models/decoder_only/language/test_models.py
-    - pytest -v -s models/decoder_only/language/test_big_models.py

 - label: Decoder-only Language Models Test (Extended) # 1h20min
   nightly: true
@@ -329,7 +328,7 @@ steps:
   - vllm/
   - tests/models/decoder_only/language
   commands:
-    - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py
+    - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py

 - label: Decoder-only Multi-Modal Models Test (Standard)
   #mirror_hardwares: [amd]
diff --git a/tests/models/decoder_only/language/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py
deleted file mode 100644
index fcfc159e4f5a0..0000000000000
--- a/tests/models/decoder_only/language/test_big_models.py
+++ /dev/null
@@ -1,93 +0,0 @@
-"""Compare the outputs of HF and vLLM when using greedy sampling.
-
-This tests bigger models and use half precision.
-
-Run `pytest tests/models/test_big_models.py`.
-"""
-import pytest
-
-from vllm.platforms import current_platform
-
-from ...utils import check_logprobs_close, check_outputs_equal
-
-MODELS = [
-    "meta-llama/Llama-2-7b-hf",
-    # "mistralai/Mistral-7B-v0.1",  # Tested by test_mistral.py
-    # "Deci/DeciLM-7b",  # Broken
-    # "tiiuae/falcon-7b",  # Broken
-    "EleutherAI/gpt-j-6b",
-    # "mosaicml/mpt-7b",  # Broken
-    # "Qwen/Qwen1.5-0.5B"  # Broken,
-]
-
-if not current_platform.is_cpu():
-    MODELS += [
-        # fused_moe which not supported on CPU
-        "openbmb/MiniCPM3-4B",
-        # Head size isn't supported on CPU
-        "h2oai/h2o-danube3-4b-base",
-    ]
-
-# TODO: remove this after CPU float16 support ready
-target_dtype = "float" if current_platform.is_cpu() else "half"
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [32])
-def test_models(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-) -> None:
-
-    if model == "openbmb/MiniCPM3-4B":
-        # the output becomes slightly different when upgrading to
-        # pytorch 2.5 . Changing to logprobs checks instead of exact
-        # output checks.
-        NUM_LOG_PROBS = 8
-        with hf_runner(model, dtype=dtype) as hf_model:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
-
-        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
-
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-    else:
-        with hf_runner(model, dtype=dtype) as hf_model:
-            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts,
-                                                      max_tokens)
-
-        check_outputs_equal(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", [target_dtype])
-def test_model_print(
-    vllm_runner,
-    model: str,
-    dtype: str,
-) -> None:
-    with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-        # This test is for verifying whether the model's extra_repr
-        # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-              model_runner.model)
diff --git a/tests/models/decoder_only/language/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py
index 5a947ce62c785..f874bf6c73142 100644
--- a/tests/models/decoder_only/language/test_fp8.py
+++ b/tests/models/decoder_only/language/test_fp8.py
@@ -21,11 +21,11 @@
     "kv_cache_dtype,base_model,test_model,scale_path",
     [
         # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
-        ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct",
-         "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None),
+        ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
+         "nm-testing/Llama-3.2-1B-Instruct-FP8-KV", None),
         # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
-        ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct",
-         "meta-llama/Meta-Llama-3-8B-Instruct", None),
+        ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
+         "meta-llama/Llama-3.2-1B-Instruct", None),
         # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
         ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
          "meta-llama/Llama-2-7b-chat-hf",
@@ -33,7 +33,7 @@
     ])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])
-@pytest.mark.parametrize("enforce_eager", [False, True])
+@pytest.mark.parametrize("enforce_eager", [True])
 @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
diff --git a/tests/models/decoder_only/language/test_gptq_marlin.py b/tests/models/decoder_only/language/test_gptq_marlin.py
index 2155e83dbe915..a896f145c11f1 100644
--- a/tests/models/decoder_only/language/test_gptq_marlin.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin.py
@@ -22,24 +22,11 @@
 MAX_MODEL_LEN = 1024

 MODELS = [
-    # act_order==False, group_size=channelwise
-    ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
-    # act_order==False, group_size=128
-    ("TheBloke/Llama-2-7B-GPTQ", "main"),
-
     # act_order==True, group_size=128
     ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
-    # act_order==True, group_size=64
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"),
-    # act_order==True, group_size=32
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"),

     # 8-bit, act_order==True, group_size=channelwise
     ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
-    # 8-bit, act_order==True, group_size=128
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"),
-    # 8-bit, act_order==True, group_size=32
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"),

     # 4-bit, act_order==True, group_size=128
     ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")
diff --git a/tests/models/decoder_only/language/test_gptq_marlin_24.py b/tests/models/decoder_only/language/test_gptq_marlin_24.py
index d65be05f141b4..aa63f9f36a3a8 100644
--- a/tests/models/decoder_only/language/test_gptq_marlin_24.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin_24.py
@@ -25,16 +25,16 @@ class ModelPair:
     # 4-bit, group_size == 128
     ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
               model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
-    # 4-bit, group_size == channelwise
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
-              model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
+    # # 4-bit, group_size == channelwise
+    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
+    #           model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),

     # 8-bit, group_size == 128
     ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
               model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
-    # 8-bit, group_size == channelwise
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
-              model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
+    # # 8-bit, group_size == channelwise
+    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
+    #           model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
 ]
diff --git a/tests/models/decoder_only/language/test_marlin.py b/tests/models/decoder_only/language/test_marlin.py
deleted file mode 100644
index c802346dee8af..0000000000000
--- a/tests/models/decoder_only/language/test_marlin.py
+++ /dev/null
@@ -1,69 +0,0 @@
-"""Compare the outputs of a GPTQ model to a Marlin model.
-
-Note: GPTQ and Marlin do not have bitwise correctness.
-As a result, in this test, we just confirm that the top selected tokens of the
-Marlin/GPTQ models are in the top 3 selections of each other.
-
-Note: Marlin internally uses locks to synchronize the threads. This can
-result in very slight nondeterminism for Marlin. As a result, we re-run the test
-up to 3 times to see if we pass.
-
-Run `pytest tests/models/test_marlin.py`.
-""" -from dataclasses import dataclass - -import pytest - -from tests.quantization.utils import is_quant_method_supported - -from ...utils import check_logprobs_close - - -@dataclass -class ModelPair: - model_marlin: str - model_gptq: str - - -model_pairs = [ - ModelPair(model_marlin="nm-testing/zephyr-beta-7b-marlin-g128", - model_gptq="nm-testing/zephyr-beta-7b-gptq-g128"), - ModelPair(model_marlin="robertgshaw2/zephyr-7b-beta-channelwise-marlin", - model_gptq="robertgshaw2/zephyr-7b-beta-channelwise-gptq"), - ModelPair(model_marlin="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", - model_gptq="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq") -] - - -@pytest.mark.flaky(reruns=2) -@pytest.mark.skipif(not is_quant_method_supported("marlin"), - reason="Marlin is not supported on this GPU type.") -@pytest.mark.parametrize("model_pair", model_pairs) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - vllm_runner, - example_prompts, - model_pair: ModelPair, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: - with vllm_runner(model_pair.model_marlin, - dtype=dtype, - quantization="marlin") as marlin_model: - marlin_outputs = marlin_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - with vllm_runner(model_pair.model_gptq, dtype=dtype, - quantization="gptq") as gptq_model: - gptq_outputs = gptq_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - check_logprobs_close( - outputs_0_lst=gptq_outputs, - outputs_1_lst=marlin_outputs, - name_0="gptq", - name_1="marlin", - ) diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 174b905d9cbb9..5be44c54a717c 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -4,7 +4,7 @@ """ import pytest -from vllm import LLM, SamplingParams +from vllm import SamplingParams from ...utils import check_logprobs_close @@ -15,6 +15,10 @@ # "mistralai/Mistral-Nemo-Instruct-2407" ] +MISTRAL_FORMAT_MODELS = [ + "mistralai/Mistral-7B-Instruct-v0.3", +] + SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5) SYMBOLIC_LANG_PROMPTS = [ "勇敢な船乗りについての詩を書く", # japanese @@ -95,7 +99,7 @@ def test_models( ) -@pytest.mark.parametrize("model", MODELS[1:]) +@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) @@ -135,28 +139,29 @@ def test_mistral_format( ) -@pytest.mark.parametrize("model", MODELS[1:]) +@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("prompt", SYMBOLIC_LANG_PROMPTS) def test_mistral_symbolic_languages( + vllm_runner, model: str, dtype: str, - prompt: str, ) -> None: - prompt = "hi" - msg = {"role": "user", "content": prompt} - llm = LLM(model=model, - dtype=dtype, - max_model_len=8192, - tokenizer_mode="mistral", - config_format="mistral", - load_format="mistral") - outputs = llm.chat([msg], sampling_params=SAMPLING_PARAMS) - assert "�" not in outputs[0].outputs[0].text.strip() + with vllm_runner(model, + dtype=dtype, + max_model_len=8192, + tokenizer_mode="mistral", + config_format="mistral", + load_format="mistral") as vllm_model: + for prompt in SYMBOLIC_LANG_PROMPTS: + msg = {"role": "user", "content": 
+            outputs = vllm_model.model.chat([msg],
+                                            sampling_params=SAMPLING_PARAMS)
+            assert "�" not in outputs[0].outputs[0].text.strip()


 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("model", MODELS[1:])  # v1 can't do func calling
+@pytest.mark.parametrize("model",
+                         MISTRAL_FORMAT_MODELS)  # v1 can't do func calling
 def test_mistral_function_calling(
     vllm_runner,
     model: str,
diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py
index 68055cbe29095..05117666f8c3f 100644
--- a/tests/models/decoder_only/language/test_models.py
+++ b/tests/models/decoder_only/language/test_models.py
@@ -7,25 +7,39 @@
 """
 import pytest

-from ...utils import check_outputs_equal
+from vllm.platforms import current_platform
+
+from ...utils import check_logprobs_close

 MODELS = [
-    "facebook/opt-125m",
-    "gpt2",
-    "bigcode/tiny_starcoder_py",
-    "EleutherAI/pythia-70m",
-    "bigscience/bloom-560m",  # Testing alibi slopes.
-    "microsoft/phi-2",
-    "stabilityai/stablelm-3b-4e1t",
-    # "allenai/OLMo-1B",  # Broken
-    "bigcode/starcoder2-3b",
-    "google/gemma-1.1-2b-it",
+    "facebook/opt-125m",  # opt
+    "openai-community/gpt2",  # gpt2
+    # "Milos/slovak-gpt-j-405M",  # gptj
+    # "bigcode/tiny_starcoder_py",  # gpt_bigcode
+    # "EleutherAI/pythia-70m",  # gpt_neox
+    "bigscience/bloom-560m",  # bloom - testing alibi slopes
+    "microsoft/phi-2",  # phi
+    # "stabilityai/stablelm-3b-4e1t",  # stablelm
+    # "bigcode/starcoder2-3b",  # starcoder2
+    "google/gemma-1.1-2b-it",  # gemma
+    "Qwen/Qwen2.5-0.5B-Instruct",  # qwen2
+    "meta-llama/Llama-3.2-1B-Instruct",  # llama
 ]

+if not current_platform.is_cpu():
+    MODELS += [
+        # fused_moe which not supported on CPU
+        "openbmb/MiniCPM3-4B",
+    ]
+
+# TODO: remove this after CPU float16 support ready
+target_dtype = "float" if current_platform.is_cpu() else "half"
+

 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [96])
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
 def test_models(
     hf_runner,
     vllm_runner,
@@ -33,33 +47,24 @@ def test_models(
     model: str,
     dtype: str,
     max_tokens: int,
+    num_logprobs: int,
 ) -> None:
-    # To pass the small model tests, we need full precision.
-    assert dtype == "float"

     with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)

     with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+        # This test is for verifying whether the model's extra_repr
+        # can be printed correctly.
+        print(vllm_model.model.llm_engine.model_executor.driver_worker.
+              model_runner.model)

-    check_outputs_equal(
+    check_logprobs_close(
         outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_outputs,
         name_0="hf",
         name_1="vllm",
     )
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-def test_model_print(
-    vllm_runner,
-    model: str,
-    dtype: str,
-) -> None:
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        # This test is for verifying whether the model's extra_repr
-        # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-              model_runner.model)
diff --git a/tests/models/decoder_only/language/test_qwen.py b/tests/models/decoder_only/language/test_qwen.py
deleted file mode 100644
index 128fe65afbb84..0000000000000
--- a/tests/models/decoder_only/language/test_qwen.py
+++ /dev/null
@@ -1,34 +0,0 @@
-"""Ensure that a text-only Qwen model can be run without throwing an error.
-We explicitly test this because Qwen is implemented as a multimodal and
-supports a visual encoder for models like Qwen-VL.
-"""
-from typing import List, Type
-
-import pytest
-
-from ....conftest import VllmRunner
-
-models = [
-    "Qwen/Qwen-7B-Chat"  # Has no visual encoder
-]
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_text_only_qwen_model_can_be_loaded_and_run(
-    vllm_runner: Type[VllmRunner],
-    example_prompts: List[str],
-    model: str,
-    *,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-):
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_model.generate_greedy_logprobs(
-            example_prompts,
-            max_tokens,
-            num_logprobs=num_logprobs,
-        )