diff --git a/ci/L0_additional_outputs_vllm/additional_outputs_test.py b/ci/L0_additional_outputs_vllm/additional_outputs_test.py index a8dfb24d..2826a4ca 100644 --- a/ci/L0_additional_outputs_vllm/additional_outputs_test.py +++ b/ci/L0_additional_outputs_vllm/additional_outputs_test.py @@ -44,7 +44,7 @@ def _get_inputs( sampling_parameters=None, return_finish_reason=None, return_cumulative_logprob=None, - return_num_token_ids=None, + return_token_ids=None, ): inputs = [] @@ -76,9 +76,9 @@ def _get_inputs( np.array([return_cumulative_logprob], dtype=bool) ) - if return_num_token_ids is not None: - inputs.append(grpcclient.InferInput("return_num_token_ids", [1], "BOOL")) - inputs[-1].set_data_from_numpy(np.array([return_num_token_ids], dtype=bool)) + if return_token_ids is not None: + inputs.append(grpcclient.InferInput("return_token_ids", [1], "BOOL")) + inputs[-1].set_data_from_numpy(np.array([return_token_ids], dtype=bool)) return inputs @@ -131,15 +131,15 @@ def _assert_cumulative_logprob(self, return_cumulative_logprob): assert cumulative_logprob != prev_cumulative_logprob prev_cumulative_logprob = cumulative_logprob - def _assert_num_token_ids(self, return_num_token_ids): + def _assert_token_ids(self, return_token_ids): for response in self._responses: result, error = response["result"], response["error"] assert error is None - num_token_ids_np = result.as_numpy(name="num_token_ids") - if return_num_token_ids is None or return_num_token_ids == False: - assert num_token_ids_np is None + token_ids_np = result.as_numpy(name="token_ids") + if return_token_ids is None or return_token_ids == False: + assert token_ids_np is None continue - num_token_ids = num_token_ids_np[0].astype(int) + token_ids = token_ids_np[0].astype(int) # TODO: vLLM may return token ids identical to the previous one when # streaming, for example: # @@ -155,20 +155,20 @@ def _assert_num_token_ids(self, return_num_token_ids): # prev: text=' the term', token_ids=array('l', [5, 1385, 44, 48]) # 
curr: text=' the term “', token_ids=array('l', [5, 1385, 44, 48]) # - # If this is no longer the case in a future release, change the assert - # to assert num_token_ids > 0. - assert num_token_ids >= 0 + # If this is no longer the case in a future release, change to + # assert len(token_ids) > 0. + assert len(token_ids) >= 0 @pytest.mark.parametrize("stream", [True, False]) @pytest.mark.parametrize("return_finish_reason", [None, True, False]) @pytest.mark.parametrize("return_cumulative_logprob", [None, True, False]) - @pytest.mark.parametrize("return_num_token_ids", [None, True, False]) + @pytest.mark.parametrize("return_token_ids", [None, True, False]) def test_additional_outputs( self, stream, return_finish_reason, return_cumulative_logprob, - return_num_token_ids, + return_token_ids, ): inputs = self._get_inputs( self._prompt, @@ -176,10 +176,10 @@ def test_additional_outputs( sampling_parameters=self._sampling_parameters, return_finish_reason=return_finish_reason, return_cumulative_logprob=return_cumulative_logprob, - return_num_token_ids=return_num_token_ids, + return_token_ids=return_token_ids, ) self._llm_infer(inputs) self._assert_text_output_valid() self._assert_finish_reason(return_finish_reason) self._assert_cumulative_logprob(return_cumulative_logprob) - self._assert_num_token_ids(return_num_token_ids) + self._assert_token_ids(return_token_ids) diff --git a/docs/additional_outputs.md b/docs/additional_outputs.md index dcca0dc4..fdc631dd 100644 --- a/docs/additional_outputs.md +++ b/docs/additional_outputs.md @@ -59,17 +59,14 @@ point value will be sent on the `cumulative_logprob` output tensor. Supported since r24.11. -### Number of token IDs +### Token IDs -The number of token IDs of the generated output text sent on this response. It -is the difference in length of the token IDs generated from the last response to -this response. If this is the first response, the last response length is -presumed to be zero. 
See +The token IDs of the generated output text sent on this response. See [here](https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/outputs.py#L21) -for more details on the token IDs of the generated output text. +for more details. -To enable, set `return_num_token_ids` input tensor to `True`. The unsigned -integer value will be sent on the `num_token_ids` output tensor. +To enable, set `return_token_ids` input tensor to `True`. The array of integer +values will be sent on the `token_ids` output tensor. Supported since r24.11. diff --git a/src/model.py b/src/model.py index dfaebf61..bd073156 100644 --- a/src/model.py +++ b/src/model.py @@ -101,7 +101,7 @@ def _auto_complete_inputs_and_outputs(auto_complete_model_config): "optional": True, }, { - "name": "return_num_token_ids", + "name": "return_token_ids", "data_type": "TYPE_BOOL", "dims": [1], "optional": True, @@ -111,7 +111,7 @@ def _auto_complete_inputs_and_outputs(auto_complete_model_config): {"name": "text_output", "data_type": "TYPE_STRING", "dims": [-1]}, {"name": "finish_reason", "data_type": "TYPE_STRING", "dims": [-1]}, {"name": "cumulative_logprob", "data_type": "TYPE_FP32", "dims": [-1]}, - {"name": "num_token_ids", "data_type": "TYPE_UINT32", "dims": [-1]}, + {"name": "token_ids", "data_type": "TYPE_INT64", "dims": [-1, -1]}, ] # Collect input and output names from the provided model config. 
@@ -348,11 +348,11 @@ def _get_input_tensors(self, request): else: parameters = request.parameters() - # return_finish_reason, return_cumulative_logprob, return_num_token_ids + # return_finish_reason, return_cumulative_logprob, return_token_ids additional_outputs = { "return_finish_reason": None, "return_cumulative_logprob": None, - "return_num_token_ids": None, + "return_token_ids": None, } for tensor_name in additional_outputs.keys(): tensor = pb_utils.get_input_tensor_by_name(request, tensor_name) @@ -467,8 +467,8 @@ def _create_response( ) ) - # num_token_ids - if additional_outputs["return_num_token_ids"]: + # token_ids + if additional_outputs["return_token_ids"]: if prev_request_output is None: # this is the first response prev_lens = [0] * len(request_output.outputs) @@ -478,14 +478,12 @@ def _create_response( len(prev_output.token_ids) for prev_output in prev_request_output.outputs ] - num_token_ids = [ - (len(output.token_ids) - prev_len) + token_ids = [ + output.token_ids[prev_len:] for output, prev_len in zip(request_output.outputs, prev_lens) ] output_tensors.append( - pb_utils.Tensor( - "num_token_ids", np.asarray(num_token_ids, dtype=np.uint32) - ) + pb_utils.Tensor("token_ids", np.asarray(token_ids, dtype=np.int64)) ) return pb_utils.InferenceResponse(output_tensors=output_tensors)