From b7e60967660b1876b6eeecf5684570ba4a4dd2e2 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Wed, 27 Nov 2024 13:12:48 +0000
Subject: [PATCH] Support and test Mantis model

Signed-off-by: DarkLight1337
---
 .buildkite/test-pipeline.yaml                 |  5 +--
 examples/offline_inference_vision_language.py | 17 +++++++++
 requirements-test.in                          |  3 --
 tests/conftest.py                             |  1 -
 .../vision_language/test_models.py            | 28 +++++++++++----
 .../vision_language/vlm_utils/core.py         | 16 ++++++---
 .../vision_language/vlm_utils/model_utils.py  | 35 ++++++++++++++++++-
 .../vision_language/vlm_utils/types.py        | 16 +++++----
 tests/models/registry.py                      |  1 +
 vllm/model_executor/models/llava.py           | 32 +++++++++++++++--
 vllm/model_executor/models/registry.py        |  1 +
 11 files changed, 128 insertions(+), 27 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index fc23c9cff0d87..23a0451148005 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -334,7 +334,6 @@ steps:
   commands:
   - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
   - pytest -v -s models/embedding/language -m core_model
-  - pytest -v -s models/embedding/vision_language -m core_model

 - label: Language Models Test (Extended) # 50min
   optional: true
@@ -346,7 +345,6 @@ steps:
   commands:
   - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
   - pytest -v -s models/embedding/language -m 'not core_model'
-  - pytest -v -s models/embedding/vision_language -m 'not core_model'

 - label: Multi-Modal Models Test (Standard) # 26min
   #mirror_hardwares: [amd]
@@ -357,8 +355,10 @@ steps:
   - tests/models/embedding/vision_language
   - tests/models/encoder_decoder/vision_language
   commands:
+  - python -m pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
   - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
   - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
+  - pytest -v -s models/embedding/vision_language -m core_model
   - pytest -v -s models/encoder_decoder/language -m core_model
   - pytest -v -s models/encoder_decoder/vision_language -m core_model
@@ -376,6 +376,7 @@ steps:
   # https://github.com/huggingface/transformers/issues/34307
   - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
   - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+  - pytest -v -s models/embedding/vision_language -m 'not core_model'
   - pytest -v -s models/encoder_decoder/language -m 'not core_model'
   - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index f08f22eec164a..e7ca816503c1b 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -419,6 +419,22 @@ def run_aria(question: str, modality: str):
     return llm, prompt, stop_token_ids


+# Mantis
+def run_mantis(question: str, modality: str):
+    assert modality == "image"
+
+    llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'  # noqa: E501
+    prompt = llama3_template.format(f"{question}\n<image>")
+
+    llm = LLM(
+        model="TIGER-Lab/Mantis-8B-siglip-llama3",
+        max_model_len=4096,
+        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
+    )
+    stop_token_ids = [128009]
+    return llm, 
prompt, stop_token_ids + + model_example_map = { "llava": run_llava, "llava-next": run_llava_next, @@ -441,6 +457,7 @@ def run_aria(question: str, modality: str): "glm4v": run_glm4v, "idefics3": run_idefics3, "aria": run_aria, + "mantis": run_mantis, } diff --git a/requirements-test.in b/requirements-test.in index 76f6de2f77c34..1ea0c44ee1035 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -24,9 +24,6 @@ mistral_common[opencv] >= 1.4.4 # required for pixtral test datamodel_code_generator # required for minicpm3 test lm-eval[api]==0.4.4 # required for model evaluation test -# TODO: Add this after fully implementing llava(mantis) -# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test - # quantization bitsandbytes>=0.44.0 buildkite-test-collector==0.1.9 diff --git a/tests/conftest.py b/tests/conftest.py index d56942d8912af..36f1d477fab59 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -263,7 +263,6 @@ def __init__( dtype: str = "half", *, model_kwargs: Optional[Dict[str, Any]] = None, - is_embedding_model: bool = False, is_sentence_transformer: bool = False, is_cross_encoder: bool = False, skip_tokenizer_init: bool = False, diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 3f6d8ef42cd5f..0bd6e97019258 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -34,7 +34,7 @@ "dtype": "half", "max_tokens": 5, "tensor_parallel_size": 2, - "model_kwargs": {"device_map": "auto"}, + "hf_model_kwargs": {"device_map": "auto"}, "image_size_factors": [(.25, 0.5, 1.0)], "distributed_executor_backend": ( "ray", @@ -108,7 +108,7 @@ "cherry_blossom": "What is in the picture?", }), auto_cls=AutoModelForVision2Seq, - postprocess_inputs=model_utils.get_key_type_post_processor( + postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values" ), vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, @@ -148,7 +148,7 @@ prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", max_model_len=4096, auto_cls=AutoModelForVision2Seq, - postprocess_inputs=model_utils.get_key_type_post_processor( + postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values" ), # For chameleon, we only compare the sequences @@ -264,7 +264,7 @@ prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 num_video_frames=16, max_model_len=16384, - postprocess_inputs=model_utils.get_key_type_post_processor( + postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values_videos" ), auto_cls=AutoModelForVision2Seq, @@ -295,6 +295,20 @@ ) ], ), + "mantis": VLMTestInfo( + models=["TIGER-Lab/Mantis-8B-siglip-llama3"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 + max_model_len=4096, + postprocess_inputs=model_utils.cast_dtype_post_processor( + "pixel_values" + ), + vllm_runner_kwargs={"hf_overrides": {"architectures": ["MantisForConditionalGeneration"]}}, # noqa: E501 + get_stop_token_ids=lambda tok: [128009], + auto_cls=AutoModelForVision2Seq, + vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output, + patch_hf_runner=model_utils.mantis_patch_hf_runner, + ), "minicpmv": VLMTestInfo( models=["openbmb/MiniCPM-Llama3-V-2_5"], 
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), @@ -318,7 +332,7 @@ # max_num_seqs=2, # task="generate", # # use eager mode for hf runner since phi3v didn't work with flash_attn - # model_kwargs={"_attn_implementation": "eager"}, + # hf_model_kwargs={"_attn_implementation": "eager"}, # use_tokenizer_eos=True, # vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output, # num_logprobs=10, @@ -349,7 +363,7 @@ prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", max_model_len=4096, auto_cls=AutoModelForVision2Seq, - postprocess_inputs=model_utils.get_key_type_post_processor( + postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values" ), vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2], @@ -418,7 +432,7 @@ test_type=VLMTestType.CUSTOM_INPUTS, max_model_len=16384, max_num_seqs=2, - postprocess_inputs=model_utils.get_key_type_post_processor( + postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values" ), auto_cls=AutoModelForVision2Seq, diff --git a/tests/models/decoder_only/vision_language/vlm_utils/core.py b/tests/models/decoder_only/vision_language/vlm_utils/core.py index 7e8c6dabb15af..c161036312d89 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/core.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py @@ -3,7 +3,7 @@ import torch from PIL.Image import Image -from transformers import AutoTokenizer, BatchEncoding +from transformers import AutoTokenizer, BatchEncoding, PreTrainedTokenizerBase from transformers.models.auto.auto_factory import _BaseAutoModelClass from .....conftest import HfRunner, VllmRunner @@ -28,9 +28,11 @@ def run_test( use_tokenizer_eos: bool, postprocess_inputs: Callable[[BatchEncoding], BatchEncoding], comparator: Callable[..., None], - get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]], + get_stop_token_ids: Optional[Callable[[PreTrainedTokenizerBase], + List[int]]], limit_mm_per_prompt: Dict[str, int], - model_kwargs: Optional[Dict[str, Any]], + vllm_runner_kwargs: Optional[Dict[str, Any]], + hf_model_kwargs: Optional[Dict[str, Any]], patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]], task: str = "auto", runner_mm_key: str = "images", @@ -54,6 +56,9 @@ def run_test( if get_stop_token_ids is not None: vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer) + if vllm_runner_kwargs is None: + vllm_runner_kwargs = {} + with vllm_runner(model, max_model_len=max_model_len, max_num_seqs=max_num_seqs, @@ -62,7 +67,8 @@ def run_test( tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, enforce_eager=enforce_eager, - task=task) as vllm_model: + task=task, + **vllm_runner_kwargs) as vllm_model: for prompts, media in vllm_inputs: vllm_kwargs[runner_mm_key] = media vllm_output = vllm_model.generate_greedy_logprobs( @@ -73,7 +79,7 @@ def run_test( dtype=dtype, auto_cls=auto_cls, postprocess_inputs=postprocess_inputs, - model_kwargs=model_kwargs) + model_kwargs=hf_model_kwargs) # Some models need to patch things like the model processor, e.g., internvl if patch_hf_runner is not None: diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 849857b4232e7..2f238b4e18a74 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -126,6 +126,16 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput, 
return hf_output_ids, hf_output_str, out_logprobs +def mantis_vllm_to_hf_output(vllm_output: RunnerOutput, + model: str) -> RunnerOutput: + """Sanitize vllm output [mantis] to compare with hf output.""" + output_ids, output_str, out_logprobs = vllm_output + + hf_output_str = output_str + "<|eot_id|>" + + return output_ids, hf_output_str, out_logprobs + + def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput: """Sanitize vllm output [phi3v] to be comparable with hf output.""" @@ -184,7 +194,7 @@ def get_llava_embeddings(image_assets: _ImageAssets): ####### postprocessors to run on HF BatchEncoding -def get_key_type_post_processor( +def cast_dtype_post_processor( hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]: """Gets a handle to a post processor which converts a given key into a target data type.""" @@ -407,3 +417,26 @@ def _internvl_generate( ) return outputs + + +def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + from mantis.models.mllava import MLlavaProcessor + + hf_model.processor = MLlavaProcessor.from_pretrained(hf_model.model_name) + + orig_generate = hf_model.model.generate + tokenizer = hf_model.processor.tokenizer + + def _generate(self, *args, **kwargs): + return orig_generate( + *args, + **kwargs, + eos_token_id=[ + tokenizer.eos_token_id, + tokenizer.convert_tokens_to_ids("<|eot_id|>"), + ], + ) + + hf_model.model.generate = types.MethodType(_generate, hf_model.model) + + return hf_model diff --git a/tests/models/decoder_only/vision_language/vlm_utils/types.py b/tests/models/decoder_only/vision_language/vlm_utils/types.py index 8459476dc2d07..ad5095a3eaee1 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/types.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py @@ -7,7 +7,8 @@ import torch from PIL.Image import Image from pytest import MarkDecorator -from transformers import AutoModelForCausalLM, AutoTokenizer, BatchEncoding +from transformers import (AutoModelForCausalLM, BatchEncoding, + PreTrainedTokenizerBase) from transformers.models.auto.auto_factory import _BaseAutoModelClass from vllm.sequence import SampleLogprobs @@ -66,7 +67,7 @@ class ImageSizeWrapper(NamedTuple): class VLMTestInfo(NamedTuple): """Holds the configuration for 1+ tests for one model architecture.""" - models: Union[List[str]] + models: List[str] test_type: Union[VLMTestType, Iterable[VLMTestType]] # Should be None only if this is a CUSTOM_INPUTS test @@ -94,13 +95,15 @@ class VLMTestInfo(NamedTuple): max_num_seqs: int = 256 task: str = "auto" tensor_parallel_size: int = 1 + vllm_runner_kwargs: Optional[Dict[str, Any]] = None # Optional callable which gets a list of token IDs from the model tokenizer - get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]] = None + get_stop_token_ids: Optional[Callable[[PreTrainedTokenizerBase], + List[int]]] = None # Exposed options for HF runner - model_kwargs: Optional[Dict[str, Any]] = None - # Indicates we should explicitly pass the EOS from the tokeniezr + hf_model_kwargs: Optional[Dict[str, Any]] = None + # Indicates we should explicitly pass the EOS from the tokenizer use_tokenizer_eos: bool = False auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM # Callable to pass to the HF runner to run on inputs; for now, we also pass @@ -159,6 +162,7 @@ def get_non_parametrized_runner_kwargs(self): "max_num_seqs": self.max_num_seqs, "task": self.task, "tensor_parallel_size": self.tensor_parallel_size, + "vllm_runner_kwargs": self.vllm_runner_kwargs, 
"hf_output_post_proc": self.hf_output_post_proc, "vllm_output_post_proc": self.vllm_output_post_proc, "auto_cls": self.auto_cls, @@ -166,7 +170,7 @@ def get_non_parametrized_runner_kwargs(self): "postprocess_inputs": self.postprocess_inputs, "comparator": self.comparator, "get_stop_token_ids": self.get_stop_token_ids, - "model_kwargs": self.model_kwargs, + "hf_model_kwargs": self.hf_model_kwargs, "patch_hf_runner": self.patch_hf_runner, } diff --git a/tests/models/registry.py b/tests/models/registry.py index 865e90b3f8b0e..2fa61e5dd0095 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -173,6 +173,7 @@ class _HfExamplesInfo: "LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"), # noqa: E501 "LlavaNextVideoForConditionalGeneration": _HfExamplesInfo("llava-hf/LLaVA-NeXT-Video-7B-hf"), # noqa: E501 "LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 + "MantisForConditionalGeneration": _HfExamplesInfo("TIGER-Lab/Mantis-8B-siglip-llama3"), # noqa: E501 "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", trust_remote_code=True), "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924", diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index c1159ab8e4589..86918aec4c43d 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -7,7 +7,7 @@ from PIL.Image import Image from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig, PixtralVisionConfig, PretrainedConfig, - SiglipVisionConfig) + ProcessorMixin, SiglipVisionConfig) from vllm.attention import AttentionMetadata from vllm.config import VllmConfig @@ -21,7 +21,7 @@ from vllm.multimodal.processing import (InputProcessingContext, ModalityProcessingMetadata, MultiModalProcessingMetadata, - PromptReplacement) + MultiModalProcessor, PromptReplacement) from vllm.sequence import IntermediateTensors from .clip import CLIPVisionModel, get_max_clip_image_tokens @@ -499,3 +499,31 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) + + +class MantisProcessor(MultiModalProcessor): + + def _get_hf_processor(self) -> ProcessorMixin: + try: + from mantis.models.mllava import MLlavaProcessor + except ModuleNotFoundError as exc: + raise ModuleNotFoundError( + "You need to `pip install " + "git+https://github.com/TIGER-AI-Lab/Mantis.git` " + "to use this model") from exc + + processor = MLlavaProcessor.from_pretrained( + self.ctx.model_config.tokenizer) + assert isinstance(processor, ProcessorMixin) + return processor + + +# To use this model, please use +# `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens) +@MULTIMODAL_REGISTRY.register_processor(lambda ctx: MantisProcessor( + ctx=ctx, + metadata=create_metadata_for_llava(ctx), +)) +class MantisForConditionalGeneration(LlavaForConditionalGeneration): + pass diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index f5a02a5b25ca2..bae4acb03b711 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -147,6 +147,7 @@ "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "LlavaNextVideoForConditionalGeneration": ("llava_next_video", 
"LlavaNextVideoForConditionalGeneration"), # noqa: E501 "LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), # noqa: E501 + "MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"), # noqa: E501 "MiniCPMV": ("minicpmv", "MiniCPMV"), "MolmoForCausalLM": ("molmo", "MolmoForCausalLM"), "NVLM_D": ("nvlm_d", "NVLM_D_Model"),