From b7e60967660b1876b6eeecf5684570ba4a4dd2e2 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Wed, 27 Nov 2024 13:12:48 +0000
Subject: [PATCH] Support and test Mantis model

Signed-off-by: DarkLight1337
---
 .buildkite/test-pipeline.yaml                 |  5 +--
 examples/offline_inference_vision_language.py | 17 +++++++++
 requirements-test.in                          |  3 --
 tests/conftest.py                             |  1 -
 .../vision_language/test_models.py            | 28 +++++++++++----
 .../vision_language/vlm_utils/core.py         | 16 ++++++---
 .../vision_language/vlm_utils/model_utils.py  | 35 ++++++++++++++++++-
 .../vision_language/vlm_utils/types.py        | 16 +++++----
 tests/models/registry.py                      |  1 +
 vllm/model_executor/models/llava.py           | 32 +++++++++++++++--
 vllm/model_executor/models/registry.py        |  1 +
 11 files changed, 128 insertions(+), 27 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index fc23c9cff0d87..23a0451148005 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -334,7 +334,6 @@ steps:
   commands:
   - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
   - pytest -v -s models/embedding/language -m core_model
-  - pytest -v -s models/embedding/vision_language -m core_model

 - label: Language Models Test (Extended) # 50min
   optional: true
@@ -346,7 +345,6 @@ steps:
   commands:
   - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
   - pytest -v -s models/embedding/language -m 'not core_model'
-  - pytest -v -s models/embedding/vision_language -m 'not core_model'

 - label: Multi-Modal Models Test (Standard) # 26min
   #mirror_hardwares: [amd]
@@ -357,8 +355,10 @@ steps:
   - tests/models/embedding/vision_language
   - tests/models/encoder_decoder/vision_language
   commands:
+  - python -m pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
   - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
   - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
+  - pytest -v -s models/embedding/vision_language -m core_model
   - pytest -v -s models/encoder_decoder/language -m core_model
   - pytest -v -s models/encoder_decoder/vision_language -m core_model
@@ -376,6 +376,7 @@ steps:
   # https://github.com/huggingface/transformers/issues/34307
   - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
   - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+  - pytest -v -s models/embedding/vision_language -m 'not core_model'
   - pytest -v -s models/encoder_decoder/language -m 'not core_model'
   - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index f08f22eec164a..e7ca816503c1b 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -419,6 +419,22 @@ def run_aria(question: str, modality: str):
     return llm, prompt, stop_token_ids


+# Mantis
+def run_mantis(question: str, modality: str):
+    assert modality == "image"
+
+    llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'  # noqa: E501
+    prompt = llama3_template.format(f"{question}\n<image>")
+
+    llm = LLM(
+        model="TIGER-Lab/Mantis-8B-siglip-llama3",
+        max_model_len=4096,
+        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
+    )
+    stop_token_ids = [128009]
+    return llm, 
prompt, stop_token_ids + + model_example_map = { "llava": run_llava, "llava-next": run_llava_next, @@ -441,6 +457,7 @@ def run_aria(question: str, modality: str): "glm4v": run_glm4v, "idefics3": run_idefics3, "aria": run_aria, + "mantis": run_mantis, } diff --git a/requirements-test.in b/requirements-test.in index 76f6de2f77c34..1ea0c44ee1035 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -24,9 +24,6 @@ mistral_common[opencv] >= 1.4.4 # required for pixtral test datamodel_code_generator # required for minicpm3 test lm-eval[api]==0.4.4 # required for model evaluation test -# TODO: Add this after fully implementing llava(mantis) -# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test - # quantization bitsandbytes>=0.44.0 buildkite-test-collector==0.1.9 diff --git a/tests/conftest.py b/tests/conftest.py index d56942d8912af..36f1d477fab59 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -263,7 +263,6 @@ def __init__( dtype: str = "half", *, model_kwargs: Optional[Dict[str, Any]] = None, - is_embedding_model: bool = False, is_sentence_transformer: bool = False, is_cross_encoder: bool = False, skip_tokenizer_init: bool = False, diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 3f6d8ef42cd5f..0bd6e97019258 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -34,7 +34,7 @@ "dtype": "half", "max_tokens": 5, "tensor_parallel_size": 2, - "model_kwargs": {"device_map": "auto"}, + "hf_model_kwargs": {"device_map": "auto"}, "image_size_factors": [(.25, 0.5, 1.0)], "distributed_executor_backend": ( "ray", @@ -108,7 +108,7 @@ "cherry_blossom": "What is in the picture?", }), auto_cls=AutoModelForVision2Seq, - postprocess_inputs=model_utils.get_key_type_post_processor( + postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values" ), vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, @@ -148,7 +148,7 @@ prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", max_model_len=4096, auto_cls=AutoModelForVision2Seq, - postprocess_inputs=model_utils.get_key_type_post_processor( + postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values" ), # For chameleon, we only compare the sequences @@ -264,7 +264,7 @@ prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 num_video_frames=16, max_model_len=16384, - postprocess_inputs=model_utils.get_key_type_post_processor( + postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values_videos" ), auto_cls=AutoModelForVision2Seq, @@ -295,6 +295,20 @@ ) ], ), + "mantis": VLMTestInfo( + models=["TIGER-Lab/Mantis-8B-siglip-llama3"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 + max_model_len=4096, + postprocess_inputs=model_utils.cast_dtype_post_processor( + "pixel_values" + ), + vllm_runner_kwargs={"hf_overrides": {"architectures": ["MantisForConditionalGeneration"]}}, # noqa: E501 + get_stop_token_ids=lambda tok: [128009], + auto_cls=AutoModelForVision2Seq, + vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output, + patch_hf_runner=model_utils.mantis_patch_hf_runner, + ), "minicpmv": VLMTestInfo( models=["openbmb/MiniCPM-Llama3-V-2_5"], 
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), @@ -318,7 +332,7 @@ # max_num_seqs=2, # task="generate", # # use eager mode for hf runner since phi3v didn't work with flash_attn - # model_kwargs={"_attn_implementation": "eager"}, + # hf_model_kwargs={"_attn_implementation": "eager"}, # use_tokenizer_eos=True, # vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output, # num_logprobs=10, @@ -349,7 +363,7 @@ prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", max_model_len=4096, auto_cls=AutoModelForVision2Seq, - postprocess_inputs=model_utils.get_key_type_post_processor( + postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values" ), vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2], @@ -418,7 +432,7 @@ test_type=VLMTestType.CUSTOM_INPUTS, max_model_len=16384, max_num_seqs=2, - postprocess_inputs=model_utils.get_key_type_post_processor( + postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values" ), auto_cls=AutoModelForVision2Seq, diff --git a/tests/models/decoder_only/vision_language/vlm_utils/core.py b/tests/models/decoder_only/vision_language/vlm_utils/core.py index 7e8c6dabb15af..c161036312d89 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/core.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py @@ -3,7 +3,7 @@ import torch from PIL.Image import Image -from transformers import AutoTokenizer, BatchEncoding +from transformers import AutoTokenizer, BatchEncoding, PreTrainedTokenizerBase from transformers.models.auto.auto_factory import _BaseAutoModelClass from .....conftest import HfRunner, VllmRunner @@ -28,9 +28,11 @@ def run_test( use_tokenizer_eos: bool, postprocess_inputs: Callable[[BatchEncoding], BatchEncoding], comparator: Callable[..., None], - get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]], + get_stop_token_ids: Optional[Callable[[PreTrainedTokenizerBase], + List[int]]], limit_mm_per_prompt: Dict[str, int], - model_kwargs: Optional[Dict[str, Any]], + vllm_runner_kwargs: Optional[Dict[str, Any]], + hf_model_kwargs: Optional[Dict[str, Any]], patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]], task: str = "auto", runner_mm_key: str = "images", @@ -54,6 +56,9 @@ def run_test( if get_stop_token_ids is not None: vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer) + if vllm_runner_kwargs is None: + vllm_runner_kwargs = {} + with vllm_runner(model, max_model_len=max_model_len, max_num_seqs=max_num_seqs, @@ -62,7 +67,8 @@ def run_test( tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, enforce_eager=enforce_eager, - task=task) as vllm_model: + task=task, + **vllm_runner_kwargs) as vllm_model: for prompts, media in vllm_inputs: vllm_kwargs[runner_mm_key] = media vllm_output = vllm_model.generate_greedy_logprobs( @@ -73,7 +79,7 @@ def run_test( dtype=dtype, auto_cls=auto_cls, postprocess_inputs=postprocess_inputs, - model_kwargs=model_kwargs) + model_kwargs=hf_model_kwargs) # Some models need to patch things like the model processor, e.g., internvl if patch_hf_runner is not None: diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 849857b4232e7..2f238b4e18a74 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -126,6 +126,16 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput, 
return hf_output_ids, hf_output_str, out_logprobs +def mantis_vllm_to_hf_output(vllm_output: RunnerOutput, + model: str) -> RunnerOutput: + """Sanitize vllm output [mantis] to compare with hf output.""" + output_ids, output_str, out_logprobs = vllm_output + + hf_output_str = output_str + "<|eot_id|>" + + return output_ids, hf_output_str, out_logprobs + + def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput: """Sanitize vllm output [phi3v] to be comparable with hf output.""" @@ -184,7 +194,7 @@ def get_llava_embeddings(image_assets: _ImageAssets): ####### postprocessors to run on HF BatchEncoding -def get_key_type_post_processor( +def cast_dtype_post_processor( hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]: """Gets a handle to a post processor which converts a given key into a target data type.""" @@ -407,3 +417,26 @@ def _internvl_generate( ) return outputs + + +def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + from mantis.models.mllava import MLlavaProcessor + + hf_model.processor = MLlavaProcessor.from_pretrained(hf_model.model_name) + + orig_generate = hf_model.model.generate + tokenizer = hf_model.processor.tokenizer + + def _generate(self, *args, **kwargs): + return orig_generate( + *args, + **kwargs, + eos_token_id=[ + tokenizer.eos_token_id, + tokenizer.convert_tokens_to_ids("<|eot_id|>"), + ], + ) + + hf_model.model.generate = types.MethodType(_generate, hf_model.model) + + return hf_model diff --git a/tests/models/decoder_only/vision_language/vlm_utils/types.py b/tests/models/decoder_only/vision_language/vlm_utils/types.py index 8459476dc2d07..ad5095a3eaee1 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/types.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py @@ -7,7 +7,8 @@ import torch from PIL.Image import Image from pytest import MarkDecorator -from transformers import AutoModelForCausalLM, AutoTokenizer, BatchEncoding +from transformers import (AutoModelForCausalLM, BatchEncoding, + PreTrainedTokenizerBase) from transformers.models.auto.auto_factory import _BaseAutoModelClass from vllm.sequence import SampleLogprobs @@ -66,7 +67,7 @@ class ImageSizeWrapper(NamedTuple): class VLMTestInfo(NamedTuple): """Holds the configuration for 1+ tests for one model architecture.""" - models: Union[List[str]] + models: List[str] test_type: Union[VLMTestType, Iterable[VLMTestType]] # Should be None only if this is a CUSTOM_INPUTS test @@ -94,13 +95,15 @@ class VLMTestInfo(NamedTuple): max_num_seqs: int = 256 task: str = "auto" tensor_parallel_size: int = 1 + vllm_runner_kwargs: Optional[Dict[str, Any]] = None # Optional callable which gets a list of token IDs from the model tokenizer - get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]] = None + get_stop_token_ids: Optional[Callable[[PreTrainedTokenizerBase], + List[int]]] = None # Exposed options for HF runner - model_kwargs: Optional[Dict[str, Any]] = None - # Indicates we should explicitly pass the EOS from the tokeniezr + hf_model_kwargs: Optional[Dict[str, Any]] = None + # Indicates we should explicitly pass the EOS from the tokenizer use_tokenizer_eos: bool = False auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM # Callable to pass to the HF runner to run on inputs; for now, we also pass @@ -159,6 +162,7 @@ def get_non_parametrized_runner_kwargs(self): "max_num_seqs": self.max_num_seqs, "task": self.task, "tensor_parallel_size": self.tensor_parallel_size, + "vllm_runner_kwargs": self.vllm_runner_kwargs, 
"hf_output_post_proc": self.hf_output_post_proc, "vllm_output_post_proc": self.vllm_output_post_proc, "auto_cls": self.auto_cls, @@ -166,7 +170,7 @@ def get_non_parametrized_runner_kwargs(self): "postprocess_inputs": self.postprocess_inputs, "comparator": self.comparator, "get_stop_token_ids": self.get_stop_token_ids, - "model_kwargs": self.model_kwargs, + "hf_model_kwargs": self.hf_model_kwargs, "patch_hf_runner": self.patch_hf_runner, } diff --git a/tests/models/registry.py b/tests/models/registry.py index 865e90b3f8b0e..2fa61e5dd0095 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -173,6 +173,7 @@ class _HfExamplesInfo: "LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"), # noqa: E501 "LlavaNextVideoForConditionalGeneration": _HfExamplesInfo("llava-hf/LLaVA-NeXT-Video-7B-hf"), # noqa: E501 "LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 + "MantisForConditionalGeneration": _HfExamplesInfo("TIGER-Lab/Mantis-8B-siglip-llama3"), # noqa: E501 "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", trust_remote_code=True), "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924", diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index c1159ab8e4589..86918aec4c43d 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -7,7 +7,7 @@ from PIL.Image import Image from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig, PixtralVisionConfig, PretrainedConfig, - SiglipVisionConfig) + ProcessorMixin, SiglipVisionConfig) from vllm.attention import AttentionMetadata from vllm.config import VllmConfig @@ -21,7 +21,7 @@ from vllm.multimodal.processing import (InputProcessingContext, ModalityProcessingMetadata, MultiModalProcessingMetadata, - PromptReplacement) + MultiModalProcessor, PromptReplacement) from vllm.sequence import IntermediateTensors from .clip import CLIPVisionModel, get_max_clip_image_tokens @@ -499,3 +499,31 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) + + +class MantisProcessor(MultiModalProcessor): + + def _get_hf_processor(self) -> ProcessorMixin: + try: + from mantis.models.mllava import MLlavaProcessor + except ModuleNotFoundError as exc: + raise ModuleNotFoundError( + "You need to `pip install " + "git+https://github.com/TIGER-AI-Lab/Mantis.git` " + "to use this model") from exc + + processor = MLlavaProcessor.from_pretrained( + self.ctx.model_config.tokenizer) + assert isinstance(processor, ProcessorMixin) + return processor + + +# To use this model, please use +# `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens) +@MULTIMODAL_REGISTRY.register_processor(lambda ctx: MantisProcessor( + ctx=ctx, + metadata=create_metadata_for_llava(ctx), +)) +class MantisForConditionalGeneration(LlavaForConditionalGeneration): + pass diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index f5a02a5b25ca2..bae4acb03b711 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -147,6 +147,7 @@ "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "LlavaNextVideoForConditionalGeneration": ("llava_next_video", 
"LlavaNextVideoForConditionalGeneration"), # noqa: E501 "LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), # noqa: E501 + "MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"), # noqa: E501 "MiniCPMV": ("minicpmv", "MiniCPMV"), "MolmoForCausalLM": ("molmo", "MolmoForCausalLM"), "NVLM_D": ("nvlm_d", "NVLM_D_Model"),