From 7870f9cb8dfef4a380c45e86b5cab4d1b7386f12 Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Thu, 7 Nov 2024 16:22:30 +0800 Subject: [PATCH] Use min_pixels and max_pixels arguments directly Signed-off-by: Jiahao Li --- vllm/model_executor/models/qwen2_vl.py | 29 +++++++++++++++++--------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 6d3b57cf99945..0e820cf123139 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -22,8 +22,8 @@ # limitations under the License. """Inference-only Qwen2-VL model compatible with HuggingFace weights.""" from functools import partial -from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional, - Tuple, Type, TypedDict, Union) +from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, + Optional, Tuple, Type, TypedDict, Union) import torch import torch.nn as nn @@ -558,6 +558,17 @@ def forward( # === Vision input helpers === # +def get_mm_processor_kwargs( + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None) -> Dict[str, int]: + mm_processor_kwargs = {} + if min_pixels: + mm_processor_kwargs["min_pixels"] = min_pixels + if max_pixels: + mm_processor_kwargs["max_pixels"] = max_pixels + return mm_processor_kwargs + + def mm_input_mapper_for_qwen2_vl( ctx: InputContext, data: MultiModalData[object], @@ -575,12 +586,8 @@ def mm_input_mapper_for_qwen2_vl( model_config = ctx.model_config # Handle mm processor kwargs; we pass these at creation time # because preprocess() in transformers doesn't expose them - mm_processor_kwargs = {} - if min_pixels: - mm_processor_kwargs["min_pixels"] = min_pixels - if max_pixels: - mm_processor_kwargs["max_pixels"] = max_pixels - + mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels, + max_pixels=max_pixels) image_processor = cached_get_image_processor( model_config.model, trust_remote_code=model_config.trust_remote_code, @@ -683,7 +690,8 @@ def get_max_qwen2_vl_mm_tokens(ctx: InputContext, *, min_pixels=None, max_pixels=None) -> int: - mm_processor_kwargs = ctx.model_config.mm_processor_kwargs or {} + mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels, + max_pixels=max_pixels) image_processor = cached_get_image_processor(ctx.model_config.model, **mm_processor_kwargs) max_resized_height, max_resized_width, max_llm_image_tokens = \ @@ -707,7 +715,8 @@ def dummy_data_for_qwen2_vl( min_pixels: Optional[int] = None, max_pixels: Optional[int] = None ) -> Tuple[SequenceData, Optional[MultiModalDataDict]]: - mm_processor_kwargs = ctx.model_config.mm_processor_kwargs or {} + mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels, + max_pixels=max_pixels) image_processor = cached_get_image_processor(ctx.model_config.model, **mm_processor_kwargs)