Skip to content

Commit

Permalink
Revert "Return token ids instead of number of token ids"
Browse files Browse the repository at this point in the history
This reverts commit 457eeaa.
  • Loading branch information
kthui committed Nov 7, 2024
1 parent 457eeaa commit 5e9b09f
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 30 deletions.
32 changes: 16 additions & 16 deletions ci/L0_additional_outputs_vllm/additional_outputs_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def _get_inputs(
sampling_parameters=None,
return_finish_reason=None,
return_cumulative_logprob=None,
return_token_ids=None,
return_num_token_ids=None,
):
inputs = []

Expand Down Expand Up @@ -76,9 +76,9 @@ def _get_inputs(
np.array([return_cumulative_logprob], dtype=bool)
)

if return_token_ids is not None:
inputs.append(grpcclient.InferInput("return_token_ids", [1], "BOOL"))
inputs[-1].set_data_from_numpy(np.array([return_token_ids], dtype=bool))
if return_num_token_ids is not None:
inputs.append(grpcclient.InferInput("return_num_token_ids", [1], "BOOL"))
inputs[-1].set_data_from_numpy(np.array([return_num_token_ids], dtype=bool))

return inputs

Expand Down Expand Up @@ -131,15 +131,15 @@ def _assert_cumulative_logprob(self, return_cumulative_logprob):
assert cumulative_logprob != prev_cumulative_logprob
prev_cumulative_logprob = cumulative_logprob

def _assert_token_ids(self, return_token_ids):
def _assert_num_token_ids(self, return_num_token_ids):
for response in self._responses:
result, error = response["result"], response["error"]
assert error is None
token_ids_np = result.as_numpy(name="token_ids")
if return_token_ids is None or return_token_ids == False:
assert token_ids_np is None
num_token_ids_np = result.as_numpy(name="num_token_ids")
if return_num_token_ids is None or return_num_token_ids == False:
assert num_token_ids_np is None
continue
token_ids = token_ids_np[0].astype(int)
num_token_ids = num_token_ids_np[0].astype(int)
# TODO: vLLM may return token ids identical to the previous one when
# streaming, for example:
#
Expand All @@ -155,31 +155,31 @@ def _assert_token_ids(self, return_token_ids):
# prev: text=' the term', token_ids=array('l', [5, 1385, 44, 48])
# curr: text=' the term “', token_ids=array('l', [5, 1385, 44, 48])
#
# If this is no longer the case in a future release, change to
# assert len(token_ids) > 0.
assert len(token_ids) >= 0
# If this is no longer the case in a future release, change the assert
# to assert num_token_ids > 0.
assert num_token_ids >= 0

@pytest.mark.parametrize("stream", [True, False])
@pytest.mark.parametrize("return_finish_reason", [None, True, False])
@pytest.mark.parametrize("return_cumulative_logprob", [None, True, False])
@pytest.mark.parametrize("return_token_ids", [None, True, False])
@pytest.mark.parametrize("return_num_token_ids", [None, True, False])
def test_additional_outputs(
self,
stream,
return_finish_reason,
return_cumulative_logprob,
return_token_ids,
return_num_token_ids,
):
inputs = self._get_inputs(
self._prompt,
stream=stream,
sampling_parameters=self._sampling_parameters,
return_finish_reason=return_finish_reason,
return_cumulative_logprob=return_cumulative_logprob,
return_token_ids=return_token_ids,
return_num_token_ids=return_num_token_ids,
)
self._llm_infer(inputs)
self._assert_text_output_valid()
self._assert_finish_reason(return_finish_reason)
self._assert_cumulative_logprob(return_cumulative_logprob)
self._assert_token_ids(return_token_ids)
self._assert_num_token_ids(return_num_token_ids)
13 changes: 8 additions & 5 deletions docs/additional_outputs.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,14 +59,17 @@ point value will be sent on the `cumulative_logprob` output tensor.

Supported since r24.11.

### Token IDs
### Number of token IDs

The token IDs of the generated output text sent on this response. See
The number of token IDs of the generated output text sent on this response. It
is the difference between the length of the token IDs generated up to this
response and the length generated up to the previous response. For the first
response, the previous length is taken to be zero. See
[here](https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/outputs.py#L21)
for more details.
for more details on the token IDs of the generated output text.

To enable, set the `return_token_ids` input tensor to `True`. The array of
integer values will be sent on the `token_ids` output tensor.
To enable, set the `return_num_token_ids` input tensor to `True`. The unsigned
integer value will be sent on the `num_token_ids` output tensor.

Supported since r24.11.

Expand Down
20 changes: 11 additions & 9 deletions src/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def _auto_complete_inputs_and_outputs(auto_complete_model_config):
"optional": True,
},
{
"name": "return_token_ids",
"name": "return_num_token_ids",
"data_type": "TYPE_BOOL",
"dims": [1],
"optional": True,
Expand All @@ -111,7 +111,7 @@ def _auto_complete_inputs_and_outputs(auto_complete_model_config):
{"name": "text_output", "data_type": "TYPE_STRING", "dims": [-1]},
{"name": "finish_reason", "data_type": "TYPE_STRING", "dims": [-1]},
{"name": "cumulative_logprob", "data_type": "TYPE_FP32", "dims": [-1]},
{"name": "token_ids", "data_type": "TYPE_INT64", "dims": [-1, -1]},
{"name": "num_token_ids", "data_type": "TYPE_UINT32", "dims": [-1]},
]

# Collect input and output names from the provided model config.
Expand Down Expand Up @@ -348,11 +348,11 @@ def _get_input_tensors(self, request):
else:
parameters = request.parameters()

# return_finish_reason, return_cumulative_logprob, return_token_ids
# return_finish_reason, return_cumulative_logprob, return_num_token_ids
additional_outputs = {
"return_finish_reason": None,
"return_cumulative_logprob": None,
"return_token_ids": None,
"return_num_token_ids": None,
}
for tensor_name in additional_outputs.keys():
tensor = pb_utils.get_input_tensor_by_name(request, tensor_name)
Expand Down Expand Up @@ -467,8 +467,8 @@ def _create_response(
)
)

# token_ids
if additional_outputs["return_token_ids"]:
# num_token_ids
if additional_outputs["return_num_token_ids"]:
if prev_request_output is None:
# this is the first response
prev_lens = [0] * len(request_output.outputs)
Expand All @@ -478,12 +478,14 @@ def _create_response(
len(prev_output.token_ids)
for prev_output in prev_request_output.outputs
]
token_ids = [
output.token_ids[prev_len:]
num_token_ids = [
(len(output.token_ids) - prev_len)
for output, prev_len in zip(request_output.outputs, prev_lens)
]
output_tensors.append(
pb_utils.Tensor("token_ids", np.asarray(token_ids, dtype=np.int64))
pb_utils.Tensor(
"num_token_ids", np.asarray(num_token_ids, dtype=np.uint32)
)
)

return pb_utils.InferenceResponse(output_tensors=output_tensors)
Expand Down

0 comments on commit 5e9b09f

Please sign in to comment.