Commit dae3c13

Rename num_token_ids to num_output_tokens

kthui committed Nov 7, 2024
1 parent 5e9b09f

Showing 3 changed files with 30 additions and 26 deletions.
34 changes: 19 additions & 15 deletions ci/L0_additional_outputs_vllm/additional_outputs_test.py
@@ -44,7 +44,7 @@ def _get_inputs(
         sampling_parameters=None,
         return_finish_reason=None,
         return_cumulative_logprob=None,
-        return_num_token_ids=None,
+        return_num_output_tokens=None,
     ):
         inputs = []
 
@@ -76,9 +76,13 @@ def _get_inputs(
                 np.array([return_cumulative_logprob], dtype=bool)
             )
 
-        if return_num_token_ids is not None:
-            inputs.append(grpcclient.InferInput("return_num_token_ids", [1], "BOOL"))
-            inputs[-1].set_data_from_numpy(np.array([return_num_token_ids], dtype=bool))
+        if return_num_output_tokens is not None:
+            inputs.append(
+                grpcclient.InferInput("return_num_output_tokens", [1], "BOOL")
+            )
+            inputs[-1].set_data_from_numpy(
+                np.array([return_num_output_tokens], dtype=bool)
+            )
 
         return inputs
 
@@ -131,15 +135,15 @@ def _assert_cumulative_logprob(self, return_cumulative_logprob):
             assert cumulative_logprob != prev_cumulative_logprob
             prev_cumulative_logprob = cumulative_logprob
 
-    def _assert_num_token_ids(self, return_num_token_ids):
+    def _assert_num_output_tokens(self, return_num_output_tokens):
         for response in self._responses:
             result, error = response["result"], response["error"]
             assert error is None
-            num_token_ids_np = result.as_numpy(name="num_token_ids")
-            if return_num_token_ids is None or return_num_token_ids == False:
-                assert num_token_ids_np is None
+            num_output_tokens_np = result.as_numpy(name="num_output_tokens")
+            if return_num_output_tokens is None or return_num_output_tokens == False:
+                assert num_output_tokens_np is None
                 continue
-            num_token_ids = num_token_ids_np[0].astype(int)
+            num_output_tokens = num_output_tokens_np[0].astype(int)
             # TODO: vLLM may return token ids identical to the previous one when
             # streaming, for example:
             #
@@ -156,30 +160,30 @@ def _assert_num_token_ids(self, return_num_token_ids):
             # curr: text=' the term “', token_ids=array('l', [5, 1385, 44, 48])
             #
             # If this is no longer the case in a future release, change the assert
-            # to assert num_token_ids > 0.
-            assert num_token_ids >= 0
+            # to assert num_output_tokens > 0.
+            assert num_output_tokens >= 0
 
     @pytest.mark.parametrize("stream", [True, False])
     @pytest.mark.parametrize("return_finish_reason", [None, True, False])
     @pytest.mark.parametrize("return_cumulative_logprob", [None, True, False])
-    @pytest.mark.parametrize("return_num_token_ids", [None, True, False])
+    @pytest.mark.parametrize("return_num_output_tokens", [None, True, False])
     def test_additional_outputs(
         self,
         stream,
         return_finish_reason,
         return_cumulative_logprob,
-        return_num_token_ids,
+        return_num_output_tokens,
     ):
         inputs = self._get_inputs(
             self._prompt,
             stream=stream,
             sampling_parameters=self._sampling_parameters,
             return_finish_reason=return_finish_reason,
             return_cumulative_logprob=return_cumulative_logprob,
-            return_num_token_ids=return_num_token_ids,
+            return_num_output_tokens=return_num_output_tokens,
        )
         self._llm_infer(inputs)
         self._assert_text_output_valid()
         self._assert_finish_reason(return_finish_reason)
         self._assert_cumulative_logprob(return_cumulative_logprob)
-        self._assert_num_token_ids(return_num_token_ids)
+        self._assert_num_output_tokens(return_num_output_tokens)
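For context on how a client exercises the renamed flag, here is a minimal sketch mirroring the request construction in the test above. The server address, model name (`vllm_model`), prompt, and sampling parameters are illustrative assumptions, not part of this commit; the `text_input`, `stream`, and `sampling_parameters` tensor names follow the Triton vLLM backend's standard interface.

```python
# Minimal sketch, assuming a Triton server with the vLLM backend at
# localhost:8001 and a model named "vllm_model" (deployment-specific).
import json
import queue

import numpy as np
import tritonclient.grpc as grpcclient

results = queue.Queue()


def callback(result, error):
    # Collect streamed responses; each one carries the delta outputs.
    results.put((result, error))


def bool_input(name, value):
    tensor = grpcclient.InferInput(name, [1], "BOOL")
    tensor.set_data_from_numpy(np.array([value], dtype=bool))
    return tensor


text = grpcclient.InferInput("text_input", [1], "BYTES")
text.set_data_from_numpy(np.array(["What is Triton?".encode()], dtype=np.object_))
params = grpcclient.InferInput("sampling_parameters", [1], "BYTES")
params.set_data_from_numpy(
    np.array([json.dumps({"max_tokens": 32}).encode()], dtype=np.object_)
)
inputs = [
    text,
    params,
    bool_input("stream", True),
    bool_input("return_num_output_tokens", True),
]

with grpcclient.InferenceServerClient("localhost:8001") as client:
    client.start_stream(callback=callback)
    client.async_stream_infer("vllm_model", inputs)
    client.stop_stream()  # waits for the stream handler to drain

while not results.empty():
    result, error = results.get()
    assert error is None
    # One uint32 per response: tokens generated since the previous response.
    print(result.as_numpy("num_output_tokens"))
```

Because the vLLM backend model is decoupled, the gRPC streaming API is used for the request even when the `stream` input is `False`.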
6 changes: 3 additions & 3 deletions docs/additional_outputs.md
@@ -59,7 +59,7 @@ point value will be sent on the `cumulative_logprob` output tensor.
 
 Supported since r24.11.
 
-### Number of token IDs
+### Number of Output Tokens
 
 The number of token IDs of the generated output text sent on this response. It
 is the difference in length of the token IDs generated from the last response to
@@ -68,8 +68,8 @@ presumed to be zero. See
 [here](https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/outputs.py#L21)
 for more details on the token IDs of the generated output text.
 
-To enable, set `return_num_token_ids` input tensor to `True`. The unsigned
-integer value will be sent on the `num_token_ids` output tensor.
+To enable, set `return_num_output_tokens` input tensor to `True`. The unsigned
+integer value will be sent on the `num_output_tokens` output tensor.
 
 Supported since r24.11.
 
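A quick worked example of the delta semantics described in the docs above, using hypothetical lengths chosen for illustration: if a streamed request produces responses whose cumulative `token_ids` lengths are 3, 5, and 5, then `num_output_tokens` is 3, 2, and 0 respectively, the zero matching the duplicate-token-ids caveat in the test's TODO comment.

```python
# Hypothetical cumulative token_ids lengths across three streamed responses.
cumulative_lens = [3, 5, 5]
prev_len = 0
for curr_len in cumulative_lens:
    print(curr_len - prev_len)  # num_output_tokens per response: 3, then 2, then 0
    prev_len = curr_len
```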
16 changes: 8 additions & 8 deletions src/model.py
@@ -101,7 +101,7 @@ def _auto_complete_inputs_and_outputs(auto_complete_model_config):
                 "optional": True,
             },
             {
-                "name": "return_num_token_ids",
+                "name": "return_num_output_tokens",
                 "data_type": "TYPE_BOOL",
                 "dims": [1],
                 "optional": True,
@@ -111,7 +111,7 @@ def _auto_complete_inputs_and_outputs(auto_complete_model_config):
            {"name": "text_output", "data_type": "TYPE_STRING", "dims": [-1]},
            {"name": "finish_reason", "data_type": "TYPE_STRING", "dims": [-1]},
            {"name": "cumulative_logprob", "data_type": "TYPE_FP32", "dims": [-1]},
-           {"name": "num_token_ids", "data_type": "TYPE_UINT32", "dims": [-1]},
+           {"name": "num_output_tokens", "data_type": "TYPE_UINT32", "dims": [-1]},
        ]
 
        # Collect input and output names from the provided model config.
@@ -348,11 +348,11 @@ def _get_input_tensors(self, request):
         else:
             parameters = request.parameters()
 
-        # return_finish_reason, return_cumulative_logprob, return_num_token_ids
+        # return_finish_reason, return_cumulative_logprob, return_num_output_tokens
         additional_outputs = {
             "return_finish_reason": None,
             "return_cumulative_logprob": None,
-            "return_num_token_ids": None,
+            "return_num_output_tokens": None,
         }
         for tensor_name in additional_outputs.keys():
             tensor = pb_utils.get_input_tensor_by_name(request, tensor_name)
@@ -467,8 +467,8 @@ def _create_response(
                 )
             )
 
-        # num_token_ids
-        if additional_outputs["return_num_token_ids"]:
+        # num_output_tokens
+        if additional_outputs["return_num_output_tokens"]:
             if prev_request_output is None:
                 # this is the first response
                 prev_lens = [0] * len(request_output.outputs)
@@ -478,13 +478,13 @@ def _create_response(
                     len(prev_output.token_ids)
                     for prev_output in prev_request_output.outputs
                 ]
-            num_token_ids = [
+            num_output_tokens = [
                 (len(output.token_ids) - prev_len)
                 for output, prev_len in zip(request_output.outputs, prev_lens)
             ]
             output_tensors.append(
                 pb_utils.Tensor(
-                    "num_token_ids", np.asarray(num_token_ids, dtype=np.uint32)
+                    "num_output_tokens", np.asarray(num_output_tokens, dtype=np.uint32)
                 )
             )
 
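To see the `_create_response` bookkeeping in isolation: the renamed output reports, per sequence, how many token IDs were appended since the previous response for the same request. Below is a self-contained sketch of that computation; the `CompletionOutput` stand-in and the sample data are hypothetical.

```python
from dataclasses import dataclass
from typing import List, Optional

import numpy as np


@dataclass
class CompletionOutput:
    # Stand-in for vLLM's CompletionOutput; only token_ids matters here.
    token_ids: List[int]


def num_output_tokens(
    curr_outputs: List[CompletionOutput],
    prev_outputs: Optional[List[CompletionOutput]],
) -> np.ndarray:
    """Per-sequence count of token IDs added since the previous response."""
    if prev_outputs is None:
        # First response: nothing has been sent yet.
        prev_lens = [0] * len(curr_outputs)
    else:
        prev_lens = [len(p.token_ids) for p in prev_outputs]
    deltas = [
        len(c.token_ids) - prev_len for c, prev_len in zip(curr_outputs, prev_lens)
    ]
    return np.asarray(deltas, dtype=np.uint32)


# The first response carries two tokens; the second adds one more.
first = [CompletionOutput(token_ids=[5, 1385])]
second = [CompletionOutput(token_ids=[5, 1385, 44])]
print(num_output_tokens(first, None))    # [2]
print(num_output_tokens(second, first))  # [1]
```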
