Skip to content

Commit

Permalink
Allow health check to be enabled for all instance counts
Browse files Browse the repository at this point in the history
  • Loading branch information
kthui committed Nov 21, 2024
1 parent 4803ee0 commit 33367f6
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 55 deletions.
10 changes: 0 additions & 10 deletions ci/L0_check_health_vllm/check_health_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,8 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import json
import os

import numpy as np
import pytest
import tritonclient.grpc as grpcclient


Expand Down Expand Up @@ -118,11 +116,3 @@ def test_vllm_not_healthy(self):
"Request for unknown model: 'vllm_opt' has no available versions"
)
self._assert_model_ready(False)

def test_vllm_enable_health_check_multi_instance(self):
    """Check the multi-instance warning is logged and the model stays ready.

    Reads the server log whose path is given by the ``SERVER_LOG``
    environment variable, asserts that it contains the vLLM warning about
    the instance count, then re-runs ``test_vllm_is_healthy``.
    """
    expected_vllm_warning = "[vllm] Health check may only be enabled when the model has exactly 1 instance but 2 are found"
    log_path = os.environ["SERVER_LOG"]
    with open(log_path) as log_file:
        assert expected_vllm_warning in log_file.read()
    # Health check should be disabled
    self.test_vllm_is_healthy()
17 changes: 1 addition & 16 deletions ci/L0_check_health_vllm/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,6 @@ function setup_model_repository {
cp -r $sample_model_repo_path/vllm_model models/vllm_opt
}

function setup_model_repository_with_multi_instances {
    # Start from the standard model repository, then overwrite the model
    # config with one that declares two KIND_MODEL instance groups.
    local config_path="models/vllm_opt/config.pbtxt"
    setup_model_repository
    # A single grouped redirect truncates the file once and writes all lines.
    {
        echo -e "backend: \"vllm\""
        echo -e "instance_group ["
        echo -e " { kind: KIND_MODEL },"
        echo -e " { kind: KIND_MODEL \n count: 1 }"
        echo -e "]"
    } > "$config_path"
}

function enable_health_check {
local enable_vllm_health_check="$1"
echo -e "parameters: {" >> models/vllm_opt/config.pbtxt
Expand Down Expand Up @@ -82,7 +73,7 @@ function test_check_health {
fi

set +e
SERVER_LOG=$SERVER_LOG python3 -m pytest --junitxml=$test_name.report.xml -s -v check_health_test.py::TestCheckHealth::$unit_test_name > $test_name.log
python3 -m pytest --junitxml=$test_name.report.xml -s -v check_health_test.py::TestCheckHealth::$unit_test_name > $test_name.log
if [ $? -ne 0 ]; then
echo -e "\n***\n*** $test_name FAILED. \n***"
RET=1
Expand Down Expand Up @@ -124,12 +115,6 @@ setup_model_repository
enable_health_check "true"
test_check_health "health_check_enabled_mocked_failure" "test_vllm_not_healthy"

# Test health check enabled with mocked vLLM check_health() failure when there
# are multiple instances
setup_model_repository_with_multi_instances
enable_health_check "true"
test_check_health "health_check_enabled_multi_instance_mocked_failure" "test_vllm_enable_health_check_multi_instance"

# Unmock check_health()
unmock_vllm_async_llm_engine

Expand Down
40 changes: 11 additions & 29 deletions src/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,10 @@ def initialize(self, args):
self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"])

# Setup vLLM engine health check
self._setup_health_check()
self._enable_health_check = self._get_bool_config_param(
"ENABLE_VLLM_HEALTH_CHECK"
)
self._is_healthy = True

# Prepare vLLM engine
self.init_engine()
Expand All @@ -134,31 +137,6 @@ def initialize(self, args):
self._shutdown_event = asyncio.Event()
self._event_thread.start()

def _setup_health_check(self):
    """Decide whether the vLLM health check is enabled for this model.

    Sets ``self._enable_health_check`` from the ``ENABLE_VLLM_HEALTH_CHECK``
    model-config parameter (true when its ``string_value``, lower-cased, is
    ``"yes"`` or ``"true"``). When enabled, the instance counts of all
    instance groups are summed: if the total is exactly 1 the
    ``self._is_healthy`` flag is initialised to ``True``; otherwise a warning
    is logged and the health check is switched back off.
    """
    params = self.model_config["parameters"]
    enabled = False
    if "ENABLE_VLLM_HEALTH_CHECK" in params:
        flag = params["ENABLE_VLLM_HEALTH_CHECK"]["string_value"].lower()
        enabled = flag in ("yes", "true")
    self._enable_health_check = enabled

    # Nothing more to set up when the check was not requested.
    if not enabled:
        return

    # Total number of instances across every instance group.
    num_instances = sum(
        group["count"] for group in self.model_config["instance_group"]
    )
    if num_instances == 1:
        # Set is healthy flag
        self._is_healthy = True
        return

    # Only a single instance is supported: warn and disable the check.
    self.logger.log_warn(
        f"[vllm] Health check may only be enabled when the model has exactly 1 instance but {num_instances} are found"
    )
    self._enable_health_check = False

def init_engine(self):
# Currently, Triton needs to use decoupled policy for asynchronously
# forwarding requests to vLLM engine, so assert it.
Expand Down Expand Up @@ -191,9 +169,7 @@ def init_engine(self):
# Create vLLM custom metrics
self.vllm_metrics = None
if (
"REPORT_CUSTOM_METRICS" in self.model_config["parameters"]
and self.model_config["parameters"]["REPORT_CUSTOM_METRICS"]["string_value"]
== "yes"
self._get_bool_config_param("REPORT_CUSTOM_METRICS")
and not aync_engine_args.disable_log_stats
):
try:
Expand All @@ -214,6 +190,12 @@ def init_engine(self):
else:
raise e

def _get_bool_config_param(self, param_name: str) -> bool:
    """Return whether a model-config parameter is set to a truthy string.

    Args:
        param_name: Key to look up in ``self.model_config["parameters"]``.

    Returns:
        ``True`` when the parameter is present and its ``string_value``,
        compared case-insensitively, is ``"yes"`` or ``"true"``; ``False``
        when the parameter is absent or holds any other value.
    """
    parameters = self.model_config["parameters"]
    # A parameter that is not configured counts as disabled.
    if param_name not in parameters:
        return False
    raw_value = parameters[param_name]["string_value"]
    return raw_value.lower() in ("yes", "true")

def setup_lora(self):
self.enable_lora = False

Expand Down

0 comments on commit 33367f6

Please sign in to comment.