vllm-project · jeejeelee · Dec 4, 2024 · Nov 30, 2024 · Dec 3, 2024 · Dec 4, 2024
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
@@ -46,6 +46,7 @@ def phi3v_tokenizer():
         tokenizer_id=PHI3V_MODEL_ID,
         enable_lora=False,
         max_num_seqs=5,
+        max_loras=0,
         max_input_length=None,
     )
 
@@ -70,6 +71,7 @@ def mllama_tokenizer():
         MLLAMA_MODEL_ID,
         enable_lora=False,
         max_num_seqs=5,
+        max_loras=0,
         max_input_length=None,
     )
 
@@ -682,6 +684,7 @@ def get_conversation(is_hf: bool):
         MLLAMA_MODEL_ID,
         enable_lora=False,
         max_num_seqs=5,
+        max_loras=0,
         max_input_length=None,
     )
     tokenizer = tokenizer_group.tokenizer
@@ -728,6 +731,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
         model,
         enable_lora=False,
         max_num_seqs=5,
+        max_loras=0,
         max_input_length=None,
     )
     tokenizer = tokenizer_group.tokenizer
@@ -777,6 +781,7 @@ def test_resolve_content_format_examples(template_path, expected_format):
         PHI3V_MODEL_ID,
         enable_lora=False,
         max_num_seqs=5,
+        max_loras=0,
         max_input_length=None,
     )
     dummy_tokenizer = tokenizer_group.tokenizer

diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py
@@ -49,6 +49,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int,
         tokenizer_id="facebook/opt-125m",
         enable_lora=False,
         max_num_seqs=max_num_seqs,
+        max_loras=0,
         max_input_length=None,
     )
 

diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py
@@ -36,6 +36,7 @@ async def test_tokenizer_group(tokenizer_group_type):
         tokenizer_id="gpt2",
         enable_lora=False,
         max_num_seqs=1,
+        max_loras=0,
         max_input_length=None,
     )
     assert reference_tokenizer.encode("prompt") == tokenizer_group.encode(
@@ -60,6 +61,7 @@ async def test_tokenizer_group_pool(tokenizer_group_type):
         tokenizer_id="gpt2",
         enable_lora=False,
         max_num_seqs=1,
+        max_loras=0,
         max_input_length=None,
     )
     # Send multiple requests to the tokenizer group pool
@@ -102,6 +104,7 @@ class EnvVarCheckerRayTokenizerGroupPool(RayTokenizerGroupPool):
         tokenizer_id="gpt2",
         enable_lora=False,
         max_num_seqs=1,
+        max_loras=0,
         max_input_length=None)
     with pytest.raises(AssertionError):
         tokenizer_pool.ping()
@@ -113,6 +116,7 @@ class EnvVarCheckerRayTokenizerGroupPool(RayTokenizerGroupPool):
             tokenizer_id="gpt2",
             enable_lora=False,
             max_num_seqs=1,
+            max_loras=0,
             max_input_length=None)
         tokenizer_pool.ping()
 
@@ -150,6 +154,7 @@ class FailingRayTokenizerGroupPool(RayTokenizerGroupPool):
         tokenizer_id="gpt2",
         enable_lora=False,
         max_num_seqs=1,
+        max_loras=0,
         max_input_length=None,
         fail_at=fail_at)
     tokenizer_actors = tokenizer_group_pool.tokenizer_actors.copy()
@@ -177,6 +182,7 @@ class FailingRayTokenizerGroupPool(RayTokenizerGroupPool):
         tokenizer_id="gpt2",
         enable_lora=False,
         max_num_seqs=1,
+        max_loras=0,
         max_input_length=None,
         fail_at=fail_at)
 
@@ -198,6 +204,7 @@ class FailingRayTokenizerGroupPool(RayTokenizerGroupPool):
         tokenizer_id="gpt2",
         enable_lora=False,
         max_num_seqs=1,
+        max_loras=0,
         max_input_length=2,
         fail_at=fail_at)
     tokenizer_actors = tokenizer_group_pool.tokenizer_actors.copy()

@@ -619,7 +619,7 @@ def _init_tokenizer(self) -> BaseTokenizerGroup:
             model_config=self.model_config,
             scheduler_config=self.scheduler_config,
             parallel_config=self.parallel_config,
-            enable_lora=bool(self.lora_config))
+            lora_config=self.lora_config)
 
     def _verify_args(self) -> None:
         self.model_config.verify_with_parallel_config(self.parallel_config)

diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
@@ -93,8 +93,7 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig,
             model_config=self.model_config,
             scheduler_config=engine_config.scheduler_config,
             parallel_config=engine_config.parallel_config,
-            enable_lora=bool(engine_config.lora_config),
-        )
+            lora_config=engine_config.lora_config)
         self.input_preprocessor = InputPreprocessor(self.model_config,
                                                     self.tokenizer)
 

diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py
@@ -1,7 +1,7 @@
 from typing import Optional, Type
 
-from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig,
-                         TokenizerPoolConfig)
+from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig, TokenizerPoolConfig)
 from vllm.executor.ray_utils import ray
 
 from .base_tokenizer_group import AnyTokenizer, BaseTokenizerGroup
@@ -16,10 +16,11 @@
 def init_tokenizer_from_configs(model_config: ModelConfig,
                                 scheduler_config: SchedulerConfig,
                                 parallel_config: ParallelConfig,
-                                enable_lora: bool):
+                                lora_config: LoRAConfig):
     init_kwargs = dict(tokenizer_id=model_config.tokenizer,
-                       enable_lora=enable_lora,
+                       enable_lora=bool(lora_config),
                        max_num_seqs=scheduler_config.max_num_seqs,
+                       max_loras=lora_config.max_loras if lora_config else 0,
                        max_input_length=None,
                        tokenizer_mode=model_config.tokenizer_mode,
                        trust_remote_code=model_config.trust_remote_code,

diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
@@ -51,14 +51,15 @@ def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
         return cls(**init_kwargs)
 
     def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
-                 max_input_length: Optional[int], num_actors: int,
-                 ray_actor_options: dict, **tokenizer_config):
+                 max_loras: int, max_input_length: Optional[int],
+                 num_actors: int, ray_actor_options: dict, **tokenizer_config):
         # Store a local copy of the TokenizerGroup for quick access
         # to underlying HF tokenizers.
         self._tokenizer_config = {
             "tokenizer_id": tokenizer_id,
             "enable_lora": enable_lora,
             "max_num_seqs": max_num_seqs,
+            "max_loras": max_loras,
             "max_input_length": max_input_length,
             **tokenizer_config
         }

diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
@@ -15,14 +15,15 @@ class TokenizerGroup(BaseTokenizerGroup):
     """A group of tokenizers that can be used for LoRA adapters."""
 
     def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
-                 max_input_length: Optional[int], **tokenizer_config):
+                 max_loras: int, max_input_length: Optional[int],
+                 **tokenizer_config):
         self.tokenizer_id = tokenizer_id
         self.tokenizer_config = tokenizer_config
         self.enable_lora = enable_lora
         self.max_input_length = max_input_length
         self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
         self.lora_tokenizers = LRUCache[AnyTokenizer](
-            capacity=max_num_seqs if enable_lora else 0)
+            capacity=max(max_loras, max_num_seqs) if enable_lora else 0)
 
     @classmethod
     def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],

@@ -51,7 +51,7 @@ def __init__(
             model_config=vllm_config.model_config,
             scheduler_config=vllm_config.scheduler_config,
             parallel_config=vllm_config.parallel_config,
-            enable_lora=bool(vllm_config.lora_config))
+            lora_config=vllm_config.lora_config)
         self.tokenizer.ping()
 
         # Request streams (map of request_id -> AsyncStream).

@@ -46,7 +46,7 @@ def __init__(
             model_config=vllm_config.model_config,
             scheduler_config=vllm_config.scheduler_config,
             parallel_config=vllm_config.parallel_config,
-            enable_lora=bool(vllm_config.lora_config))
+            lora_config=vllm_config.lora_config)
         self.tokenizer.ping()
 
         # Processor (convert Inputs --> EngineCoreRequests)