From 8c3572bf7f5125c7e431606a5b8a1197c3c1918e Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Fri, 1 Nov 2024 21:18:18 +0800 Subject: [PATCH] [Misc] Multi-Modality Support for Loading Local image Files FIX #8730 Signed-off-by: chaunceyjiang --- vllm/config.py | 12 ++++--- vllm/engine/arg_utils.py | 32 ++++++++++--------- vllm/entrypoints/chat_utils.py | 13 +++++--- vllm/entrypoints/llm.py | 11 +++++-- vllm/multimodal/utils.py | 57 ++++++++++++++++++++++++++++------ 5 files changed, 91 insertions(+), 34 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 17e9b1c100498..1d587d5383f47 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -55,6 +55,10 @@ class ModelConfig: "mistral" will always use the tokenizer from `mistral_common`. trust_remote_code: Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer. + allowed_local_media_path: Allowing API requests to read local images or + videos from directories specified by the server file system. + This is a security risk. Should only be enabled in trusted + environments. dtype: Data type for model weights and activations. The "auto" option will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models. @@ -132,6 +136,7 @@ def __init__( tokenizer: str, tokenizer_mode: str, trust_remote_code: bool, + allowed_local_media_path: str, dtype: Union[str, torch.dtype], seed: int, revision: Optional[str] = None, @@ -164,6 +169,7 @@ def __init__( self.tokenizer = tokenizer self.tokenizer_mode = tokenizer_mode self.trust_remote_code = trust_remote_code + self.allowed_local_media_path = allowed_local_media_path self.seed = seed self.revision = revision self.code_revision = code_revision @@ -1386,10 +1392,8 @@ def maybe_create_spec_config( ngram_prompt_lookup_max, ngram_prompt_lookup_min, draft_token_acceptance_method=draft_token_acceptance_method, - typical_acceptance_sampler_posterior_threshold=\ - typical_acceptance_sampler_posterior_threshold, - typical_acceptance_sampler_posterior_alpha=\ - typical_acceptance_sampler_posterior_alpha, + typical_acceptance_sampler_posterior_threshold=typical_acceptance_sampler_posterior_threshold, + typical_acceptance_sampler_posterior_alpha=typical_acceptance_sampler_posterior_alpha, disable_logprobs=disable_logprobs, disable_log_stats=disable_log_stats, ) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index da06ab186821e..161344daf6a23 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -92,6 +92,7 @@ class EngineArgs: tokenizer_mode: str = 'auto' chat_template_text_format: str = 'string' trust_remote_code: bool = False + allowed_local_media_path: str = "" download_dir: Optional[str] = None load_format: str = 'auto' config_format: ConfigFormat = ConfigFormat.AUTO @@ -269,6 +270,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument('--trust-remote-code', action='store_true', help='Trust remote code from huggingface.') + parser.add_argument( + '--allowed-local-media-path', + type=str, + help="Allowing API requests to read local images or videos" + "from directories specified by the server file system." + "This is a security risk." + "Should only be enabled in trusted environments") parser.add_argument('--download-dir', type=nullable_str, default=EngineArgs.download_dir, @@ -658,8 +666,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: '--speculative-model', type=nullable_str, default=EngineArgs.speculative_model, - help= - 'The name of the draft model to be used in speculative decoding.') + help='The name of the draft model to be used in speculative decoding.') # Quantization settings for speculative model. parser.add_argument( '--speculative-model-quantization', @@ -681,8 +688,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument( '--speculative-disable-mqa-scorer', action='store_true', - help= - 'If set to True, the MQA scorer will be disabled in speculative ' + help='If set to True, the MQA scorer will be disabled in speculative ' ' and fall back to batch expansion') parser.add_argument( '--speculative-draft-tensor-parallel-size', @@ -920,6 +926,7 @@ def create_model_config(self) -> ModelConfig: tokenizer_mode=self.tokenizer_mode, chat_template_text_format=self.chat_template_text_format, trust_remote_code=self.trust_remote_code, + allowed_local_media_path=self.allowed_local_media_path, dtype=self.dtype, seed=self.seed, revision=self.revision, @@ -971,8 +978,8 @@ def create_engine_config(self) -> VllmConfig: f"'bitsandbytes' load format, but got {self.load_format}") if (self.load_format == "bitsandbytes" or - self.qlora_adapter_name_or_path is not None) and \ - self.quantization != "bitsandbytes": + self.qlora_adapter_name_or_path is not None) and \ + self.quantization != "bitsandbytes": raise ValueError( "BitsAndBytes load format and QLoRA adapter only support " f"'bitsandbytes' quantization, but got {self.quantization}") @@ -1059,10 +1066,8 @@ def create_engine_config(self) -> VllmConfig: target_parallel_config=parallel_config, target_dtype=self.dtype, speculative_model=self.speculative_model, - speculative_model_quantization = \ - self.speculative_model_quantization, - speculative_draft_tensor_parallel_size = \ - self.speculative_draft_tensor_parallel_size, + speculative_model_quantization=self.speculative_model_quantization, + speculative_draft_tensor_parallel_size=self.speculative_draft_tensor_parallel_size, num_speculative_tokens=self.num_speculative_tokens, speculative_disable_mqa_scorer=self.speculative_disable_mqa_scorer, speculative_disable_by_batch_size=self. @@ -1072,8 +1077,7 @@ def create_engine_config(self) -> VllmConfig: disable_log_stats=self.disable_log_stats, ngram_prompt_lookup_max=self.ngram_prompt_lookup_max, ngram_prompt_lookup_min=self.ngram_prompt_lookup_min, - draft_token_acceptance_method=\ - self.spec_decoding_acceptance_method, + draft_token_acceptance_method=self.spec_decoding_acceptance_method, typical_acceptance_sampler_posterior_threshold=self. typical_acceptance_sampler_posterior_threshold, typical_acceptance_sampler_posterior_alpha=self. @@ -1136,7 +1140,7 @@ def create_engine_config(self) -> VllmConfig: and self.max_cpu_loras > 0 else None) if self.enable_lora else None if self.qlora_adapter_name_or_path is not None and \ - self.qlora_adapter_name_or_path != "": + self.qlora_adapter_name_or_path != "": if self.model_loader_extra_config is None: self.model_loader_extra_config = {} self.model_loader_extra_config[ @@ -1147,7 +1151,7 @@ def create_engine_config(self) -> VllmConfig: prompt_adapter_config = PromptAdapterConfig( max_prompt_adapters=self.max_prompt_adapters, max_prompt_adapter_token=self.max_prompt_adapter_token) \ - if self.enable_prompt_adapter else None + if self.enable_prompt_adapter else None decoding_config = DecodingConfig( guided_decoding_backend=self.guided_decoding_backend) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index c9552977710d1..6684a4225608b 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -62,7 +62,7 @@ class CustomChatCompletionContentPartParam(TypedDict, total=False): class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): """A simpler version of the param that only accepts a plain image_url. This is supported by OpenAI API, although it is not documented. - + Example: { "image_url": "https://example.com/image.jpg" @@ -73,7 +73,7 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): """A simpler version of the param that only accepts a plain audio_url. - + Example: { "audio_url": "https://example.com/audio.mp3" @@ -307,7 +307,9 @@ def __init__(self, tracker: MultiModalItemTracker) -> None: self._tracker = tracker def parse_image(self, image_url: str) -> None: - image = get_and_parse_image(image_url) + image = get_and_parse_image( + image_url, + allowed_local_media_path=self._tracker._model_config.allowed_local_media_path) placeholder = self._tracker.add("image", image) self._add_placeholder(placeholder) @@ -327,7 +329,10 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: self._tracker = tracker def parse_image(self, image_url: str) -> None: - image_coro = async_get_and_parse_image(image_url) + image_coro = async_get_and_parse_image( + image_url, + allowed_local_media_path=self._tracker._model_config.allowed_local_media_path + ) placeholder = self._tracker.add("image", image_coro) self._add_placeholder(placeholder) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 3d62cb3598477..78a57041b4089 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -58,6 +58,10 @@ class LLM: from the input. trust_remote_code: Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer. + allowed_local_media_path: Allowing API requests to read local images + or videos from directories specified by the server file system. + This is a security risk. Should only be enabled in trusted + environments. tensor_parallel_size: The number of GPUs to use for distributed execution with tensor parallelism. dtype: The data type for the model weights and activations. Currently, @@ -139,6 +143,7 @@ def __init__( tokenizer_mode: str = "auto", skip_tokenizer_init: bool = False, trust_remote_code: bool = False, + allowed_local_media_path: str = "", tensor_parallel_size: int = 1, dtype: str = "auto", quantization: Optional[str] = None, @@ -179,6 +184,7 @@ def __init__( tokenizer_mode=tokenizer_mode, skip_tokenizer_init=skip_tokenizer_init, trust_remote_code=trust_remote_code, + allowed_local_media_path=allowed_local_media_path, tensor_parallel_size=tensor_parallel_size, dtype=dtype, quantization=quantization, @@ -475,7 +481,7 @@ def sort_beams_key(x: BeamSearchSequence) -> float: logprob_obj.logprob) if token_id == tokenizer.eos_token_id and \ - not ignore_eos: + not ignore_eos: instance.completed.append(new_beam) else: instance_new_beams.append(new_beam) @@ -931,7 +937,8 @@ def _run_engine( # Calculate tokens only for RequestOutput assert output.prompt_token_ids is not None total_in_toks += len(output.prompt_token_ids) - in_spd = total_in_toks / pbar.format_dict["elapsed"] + in_spd = total_in_toks / \ + pbar.format_dict["elapsed"] total_out_toks += sum( len(stp.token_ids) for stp in output.outputs) out_spd = (total_out_toks / diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index c5ff552e06099..d1742ab3ce9c3 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -2,6 +2,7 @@ from functools import lru_cache from io import BytesIO from typing import Any, List, Optional, Tuple, TypeVar, Union +from os import path import numpy as np import numpy.typing as npt @@ -18,19 +19,49 @@ cached_get_tokenizer = lru_cache(get_tokenizer) -def _load_image_from_bytes(b: bytes): +def _load_image_from_bytes(b: bytes) -> Image.Image: image = Image.open(BytesIO(b)) image.load() return image -def _load_image_from_data_url(image_url: str): +def _is_subpath(image_path, allowed_local_media_path): + # Get the common path + common_path = path.commonpath([image_path, allowed_local_media_path]) + # Check if the common path is the same as allowed_local_media_path + return common_path == path.abspath(allowed_local_media_path) + + +def _load_image_from_file(image_url: str, allowed_local_media_path: str) -> Image.Image: + if not allowed_local_media_path: + raise ValueError( + "Invalid 'image_url': Cannot load local files without '--allowed-local-media-path'.") + if allowed_local_media_path: + if not path.exists(allowed_local_media_path): + raise ValueError( + "Invalid '--allowed-local-media-path': The path does not exist.") + if not path.isdir(allowed_local_media_path): + raise ValueError( + "Invalid '--allowed-local-media-path': The path must be a directory.") + + # Only split once and assume the second part is the image path + _, image_path = image_url.split("file://", 1) + if not _is_subpath(image_path, allowed_local_media_path): + raise ValueError( + "Invalid 'image_url': The file path must be a subpath of '--allowed-local-media-path'.") + + image = Image.open(image_path) + image.load() + return image + + +def _load_image_from_data_url(image_url: str) -> Image.Image: # Only split once and assume the second part is the base64 encoded image _, image_base64 = image_url.split(",", 1) return load_image_from_base64(image_base64) -def fetch_image(image_url: str, *, image_mode: str = "RGB") -> Image.Image: +def fetch_image(image_url: str, *, image_mode: str = "RGB", allowed_local_media_path: str = "") -> Image.Image: """ Load a PIL image from a HTTP or base64 data URL. @@ -43,16 +74,18 @@ def fetch_image(image_url: str, *, image_mode: str = "RGB") -> Image.Image: elif image_url.startswith('data:image'): image = _load_image_from_data_url(image_url) + elif image_url.startswith('file://'): + image = _load_image_from_file(image_url, allowed_local_media_path) else: raise ValueError("Invalid 'image_url': A valid 'image_url' must start " - "with either 'data:image' or 'http'.") + "with either 'data:image', 'file://' or 'http'.") return image.convert(image_mode) async def async_fetch_image(image_url: str, *, - image_mode: str = "RGB") -> Image.Image: + image_mode: str = "RGB", allowed_local_media_path: str = "") -> Image.Image: """ Asynchronously load a PIL image from a HTTP or base64 data URL. @@ -65,9 +98,11 @@ async def async_fetch_image(image_url: str, elif image_url.startswith('data:image'): image = _load_image_from_data_url(image_url) + elif image_url.startswith('file://'): + image = _load_image_from_file(image_url, allowed_local_media_path) else: raise ValueError("Invalid 'image_url': A valid 'image_url' must start " - "with either 'data:image' or 'http'.") + "with either 'data:image', 'file://' or 'http'.") return image.convert(image_mode) @@ -126,8 +161,9 @@ def get_and_parse_audio(audio_url: str) -> MultiModalDataDict: return {"audio": (audio, sr)} -def get_and_parse_image(image_url: str) -> MultiModalDataDict: - image = fetch_image(image_url) +def get_and_parse_image(image_url: str, *, allowed_local_media_path: str = "") -> MultiModalDataDict: + image = fetch_image( + image_url, allowed_local_media_path=allowed_local_media_path) return {"image": image} @@ -136,8 +172,9 @@ async def async_get_and_parse_audio(audio_url: str) -> MultiModalDataDict: return {"audio": (audio, sr)} -async def async_get_and_parse_image(image_url: str) -> MultiModalDataDict: - image = await async_fetch_image(image_url) +async def async_get_and_parse_image(image_url: str, *, allowed_local_media_path: str = "") -> MultiModalDataDict: + image = await async_fetch_image( + image_url, allowed_local_media_path=allowed_local_media_path) return {"image": image}