[FEATURE] Enables offline /score for embedding models #12021

Open · wants to merge 3 commits into main
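In short, this change lets the offline `LLM.score()` API fall back to embedding-based scoring (cosine similarity between pooled embeddings) when the model is not a cross-encoder. A rough usage sketch of what this enables (the model name mirrors the new tests below; the exact output fields are assumptions based on `ScoringRequestOutput`):

```python
from vllm import LLM

# An ordinary embedding model, not a cross-encoder; this is the model
# exercised by the new tests below.
llm = LLM(model="sentence-transformers/all-MiniLM-L12-v2", task="embed")

# With this change, score() embeds both sides and returns their cosine
# similarity instead of requiring a cross-encoder checkpoint.
outputs = llm.score(
    "What is the capital of France?",
    ["The capital of France is Paris.", "The capital of Germany is Berlin."],
)

for output in outputs:
    print(output.outputs.score)
```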
101 changes: 101 additions & 0 deletions tests/models/embedding/language/test_scoring.py
@@ -5,12 +5,18 @@
import math

import pytest
import torch
import torch.nn.functional as F

MODELS = [
    "cross-encoder/ms-marco-MiniLM-L-6-v2",  # Bert
    "BAAI/bge-reranker-v2-m3",  # Roberta
]

EMBEDDING_MODELS = [
    "sentence-transformers/all-MiniLM-L12-v2",
]

TEXTS_1 = [
    "What is the capital of France?",
    "What is the capital of Germany?",
@@ -87,3 +93,98 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str):

    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)


@pytest.fixture(scope="module", params=EMBEDDING_MODELS)
def emb_model_name(request):
    yield request.param


@pytest.mark.parametrize("dtype", ["half"])
def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name,
                              dtype: str):

    text_pair = [TEXTS_1[0], TEXTS_2[0]]

    with hf_runner(emb_model_name, dtype=dtype,
                   is_sentence_transformer=True) as hf_model:
        hf_embeddings = hf_model.encode(text_pair)
        hf_outputs = [
            F.cosine_similarity(*map(torch.tensor, hf_embeddings),
                                dim=0)
        ]

    with vllm_runner(emb_model_name,
                     task="embed",
                     dtype=dtype,
                     max_model_len=None) as vllm_model:
        vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])

    assert len(vllm_outputs) == 1
    assert len(hf_outputs) == 1

    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)


@pytest.mark.parametrize("dtype", ["half"])
def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
                              dtype: str):

    text_pairs = [
        [TEXTS_1[0], TEXTS_2[0]],
        [TEXTS_1[0], TEXTS_2[1]],
    ]

    with hf_runner(emb_model_name, dtype=dtype,
                   is_sentence_transformer=True) as hf_model:
        hf_embeddings = [
            hf_model.encode(text_pair) for text_pair in text_pairs
        ]
        hf_outputs = [
            F.cosine_similarity(*map(torch.tensor, pair),
                                dim=0) for pair in hf_embeddings
        ]

    with vllm_runner(emb_model_name,
                     task="embed",
                     dtype=dtype,
                     max_model_len=None) as vllm_model:
        vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)

    assert len(vllm_outputs) == 2
    assert len(hf_outputs) == 2

    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)


@pytest.mark.parametrize("dtype", ["half"])
def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
                              dtype: str):

    text_pairs = [
        [TEXTS_1[0], TEXTS_2[0]],
        [TEXTS_1[1], TEXTS_2[1]],
    ]

    with hf_runner(emb_model_name, dtype=dtype,
                   is_sentence_transformer=True) as hf_model:
        hf_embeddings = [
            hf_model.encode(text_pair) for text_pair in text_pairs
        ]
        hf_outputs = [
            F.cosine_similarity(*map(torch.tensor, pair),
                                dim=0) for pair in hf_embeddings
        ]

    with vllm_runner(emb_model_name,
                     task="embed",
                     dtype=dtype,
                     max_model_len=None) as vllm_model:
        vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)

    assert len(vllm_outputs) == 2
    assert len(hf_outputs) == 2

    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
116 changes: 79 additions & 37 deletions vllm/entrypoints/llm.py
@@ -5,6 +5,7 @@
Tuple, Type, Union, cast, overload)

import cloudpickle
import torch
import torch.nn as nn
from tqdm import tqdm
from typing_extensions import TypeVar, deprecated
@@ -1032,6 +1033,7 @@ def score(
            A list of ``ScoringRequestOutput`` objects containing the
            generated scores in the same order as the input prompts.
        """

        runner_type = self.llm_engine.model_config.runner_type
        if runner_type != "pooling":
            messages = ["LLM.score() is only supported for pooling models."]
@@ -1047,25 +1049,20 @@

            raise ValueError(" ".join(messages))

        if not self.llm_engine.model_config.is_cross_encoder:
            raise ValueError("Your model does not support cross encoding")
        if self.llm_engine.model_config.task != "score":
            raise ValueError("Score API is only enabled for `--task score`")

        tokenizer = self.llm_engine.get_tokenizer()

        if isinstance(tokenizer, MistralTokenizer):
        if self.llm_engine.model_config.task not in ("embed", "score"):
            raise ValueError(
                "MistralTokenizer not supported for cross-encoding")
                "Score API is only enabled for `--task embed or score`")

        # the tokenizer for models such as
        # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing
        # lists of tokens to the `text` and `text_pair` kwargs
        tokenizer = self.llm_engine.get_tokenizer()

        def ensure_str(prompt: SingletonPrompt):
            if isinstance(prompt, dict):
                if "multi_modal_data" in prompt:
                    raise ValueError("Multi-modal prompt is not "
                                     "supported for cross encoding")
                                     "supported for scoring")
                elif "prompt_token_ids" in prompt:
                    prompt = tokenizer.decode(
                        cast(TokensPrompt, prompt)["prompt_token_ids"])
@@ -1091,40 +1088,85 @@ def ensure_str(prompt: SingletonPrompt):
        if len(text_2) == 0:
            raise ValueError("At least one text_pair element must be given")

        if len(text_1) == 1:
            text_1 = text_1 * len(text_2)
        if self.llm_engine.model_config.is_cross_encoder:
            if isinstance(tokenizer, MistralTokenizer):
                raise ValueError(
                    "MistralTokenizer not supported for cross-encoding")

        input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)]
        pooling_params = PoolingParams()
            if len(text_1) == 1:
                text_1 = text_1 * len(text_2)

        tokenization_kwargs: Dict[str, Any] = {}
        if truncate_prompt_tokens is not None:
            tokenization_kwargs["truncation"] = True
            tokenization_kwargs["max_length"] = truncate_prompt_tokens
            input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)]

        parsed_prompts = []
            pooling_params = PoolingParams()

        for q, t in input_pairs:
            prompt_inputs = tokenizer(text=q,
                                      text_pair=t,
                                      **tokenization_kwargs)
            engine_prompt = TokensPrompt(
                prompt_token_ids=prompt_inputs["input_ids"],
                token_type_ids=prompt_inputs.get("token_type_ids"))
            parsed_prompts.append(engine_prompt)
            tokenization_kwargs: Dict[str, Any] = {}
            if truncate_prompt_tokens is not None:
                tokenization_kwargs["truncation"] = True
                tokenization_kwargs["max_length"] = truncate_prompt_tokens

            parsed_prompts = []

            for q, t in input_pairs:
                prompt_inputs = tokenizer(text=q,
                                          text_pair=t,
                                          **tokenization_kwargs)
                engine_prompt = TokensPrompt(
                    prompt_token_ids=prompt_inputs["input_ids"],
                    token_type_ids=prompt_inputs.get("token_type_ids"))
                parsed_prompts.append(engine_prompt)

        self._validate_and_add_requests(
            prompts=parsed_prompts,
            params=pooling_params,
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
        )

            self._validate_and_add_requests(
                prompts=parsed_prompts,
                params=pooling_params,
                lora_request=lora_request,
                prompt_adapter_request=prompt_adapter_request,
            )
            outputs = self._run_engine(use_tqdm=use_tqdm)
            items = self.engine_class.validate_outputs(outputs,
                                                       PoolingRequestOutput)

        outputs = self._run_engine(use_tqdm=use_tqdm)
        items = self.engine_class.validate_outputs(outputs,
                                                   PoolingRequestOutput)
            return [ScoringRequestOutput.from_base(item) for item in items]

        elif self.llm_engine.model_config.runner_type == "pooling":
            encoded_output = self.encode(text_1 + text_2)
            encoded_output_1 = encoded_output[0:len(text_1)]
            encoded_output_2 = encoded_output[len(text_1):]

            if len(encoded_output_1) == 1:
                encoded_output_1 = encoded_output_1 * len(encoded_output_2)

            output_pairs = [
                (t1, t2) for t1, t2 in zip(encoded_output_1, encoded_output_2)
            ]

            scores = []
            scorer = torch.nn.CosineSimilarity(0)

            for embed_1, embed_2 in output_pairs:
                pair_score = scorer(embed_1.outputs.data, embed_2.outputs.data)

                if getattr(tokenizer, "pad_token", None) is None:
                    tokens = embed_1.prompt_token_ids + embed_2.prompt_token_ids
                else:
                    tokens = embed_1.prompt_token_ids + [
                        tokenizer.pad_token_type_id
                    ] + embed_2.prompt_token_ids

                scores.append(
                    PoolingRequestOutput(
                        request_id=f"{embed_1.request_id}_{embed_2.request_id}",
                        outputs=pair_score,
                        prompt_token_ids=tokens,
                        finished=True))

            items = self.engine_class.validate_outputs(scores,
                                                       PoolingRequestOutput)
            return [ScoringRequestOutput.from_base(item) for item in items]

        return [ScoringRequestOutput.from_base(item) for item in items]
        raise ValueError(
            "Your model does not support cross encoding and pooling.")

    def start_profile(self) -> None:
        self.llm_engine.start_profile()
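For reference, the embedding branch added above boils down to pairing up the pooled embeddings and taking their cosine similarity. A minimal standalone sketch of that computation (plain torch, outside of vLLM's request plumbing):

```python
import torch
import torch.nn.functional as F


def embedding_score(embed_1: torch.Tensor, embed_2: torch.Tensor) -> float:
    # Same operation as the torch.nn.CosineSimilarity(0) call applied to the
    # two pooled embedding vectors in the new branch of LLM.score().
    return F.cosine_similarity(embed_1, embed_2, dim=0).item()
```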