From 7870f9cb8dfef4a380c45e86b5cab4d1b7386f12 Mon Sep 17 00:00:00 2001
From: Jiahao Li <liplus17@163.com>
Date: Thu, 7 Nov 2024 16:22:30 +0800
Subject: [PATCH] Use min_pixels and max_pixels arguments directly

Signed-off-by: Jiahao Li <liplus17@163.com>
---
 vllm/model_executor/models/qwen2_vl.py | 29 +++++++++++++++++---------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 6d3b57cf99945..0e820cf123139 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -22,8 +22,8 @@
 # limitations under the License.
 """Inference-only Qwen2-VL model compatible with HuggingFace weights."""
 from functools import partial
-from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional,
-                    Tuple, Type, TypedDict, Union)
+from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping,
+                    Optional, Tuple, Type, TypedDict, Union)
 
 import torch
 import torch.nn as nn
@@ -558,6 +558,17 @@ def forward(
 # === Vision input helpers === #
 
 
+def get_mm_processor_kwargs(
+        min_pixels: Optional[int] = None,
+        max_pixels: Optional[int] = None) -> Dict[str, int]:
+    """Collect the truthy pixel limits as image processor kwargs.
+
+    A limit that is ``None`` or ``0`` is left out of the returned dict.
+    """
+    limits = {"min_pixels": min_pixels, "max_pixels": max_pixels}
+    return {name: value for name, value in limits.items() if value}
+
+
 def mm_input_mapper_for_qwen2_vl(
     ctx: InputContext,
     data: MultiModalData[object],
@@ -575,12 +586,8 @@ def mm_input_mapper_for_qwen2_vl(
     model_config = ctx.model_config
     # Handle mm processor kwargs; we pass these at creation time
     # because preprocess() in transformers doesn't expose them
-    mm_processor_kwargs = {}
-    if min_pixels:
-        mm_processor_kwargs["min_pixels"] = min_pixels
-    if max_pixels:
-        mm_processor_kwargs["max_pixels"] = max_pixels
-
+    mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels,
+                                                  max_pixels=max_pixels)
     image_processor = cached_get_image_processor(
         model_config.model,
         trust_remote_code=model_config.trust_remote_code,
@@ -683,7 +690,8 @@ def get_max_qwen2_vl_mm_tokens(ctx: InputContext,
                                *,
                                min_pixels=None,
                                max_pixels=None) -> int:
-    mm_processor_kwargs = ctx.model_config.mm_processor_kwargs or {}
+    mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels,
+                                                  max_pixels=max_pixels)
     image_processor = cached_get_image_processor(ctx.model_config.model,
                                                  **mm_processor_kwargs)
     max_resized_height, max_resized_width, max_llm_image_tokens = \
@@ -707,7 +715,8 @@ def dummy_data_for_qwen2_vl(
     min_pixels: Optional[int] = None,
     max_pixels: Optional[int] = None
 ) -> Tuple[SequenceData, Optional[MultiModalDataDict]]:
-    mm_processor_kwargs = ctx.model_config.mm_processor_kwargs or {}
+    mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels,
+                                                  max_pixels=max_pixels)
     image_processor = cached_get_image_processor(ctx.model_config.model,
                                                  **mm_processor_kwargs)