Merge branch 'main' into r24.10

triton-inference-server · Jan 21, 2025 · f172906 · f172906
2 parents dab2ebf + 153ba03
commit f172906
Show file tree

Hide file tree

Showing 3 changed files with 125 additions and 19 deletions.
diff --git a/qa/L0_io/gen_libtorch_model.py b/qa/L0_io/gen_libtorch_model.py
@@ -0,0 +1,90 @@
+#!/usr/bin/python
+# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import torch
+import torch.nn as nn
+
+
+class SumModule(nn.Module):  # Adds its two inputs after moving both onto one configured device.
+    def __init__(self, device):  # device: the value later handed to INPUT.to() in forward()
+        super(SumModule, self).__init__()
+        self.device = device
+
+    def forward(self, INPUT0, INPUT1):  # returns INPUT0 + INPUT1 after both were moved to self.device
+        INPUT0 = INPUT0.to(self.device)  # rebinds the local name to the moved copy
+        INPUT1 = INPUT1.to(self.device)
+        print(  # reports the device of each input after the .to() calls above
+            "SumModule - INPUT0 device: {}, INPUT1 device: {}\n".format(
+                INPUT0.device, INPUT1.device
+            )
+        )
+        return INPUT0 + INPUT1
+
+
+class DiffModule(nn.Module):  # Subtracts INPUT1 from INPUT0 after moving both onto one configured device.
+    def __init__(self, device):  # device: the value later handed to INPUT.to() in forward()
+        super(DiffModule, self).__init__()
+        self.device = device
+
+    def forward(self, INPUT0, INPUT1):  # returns INPUT0 - INPUT1 after both were moved to self.device
+        INPUT0 = INPUT0.to(self.device)  # rebinds the local name to the moved copy
+        INPUT1 = INPUT1.to(self.device)
+        print(  # reports the device of each input after the .to() calls above
+            "DiffModule - INPUT0 device: {}, INPUT1 device: {}\n".format(
+                INPUT0.device, INPUT1.device
+            )
+        )
+        return INPUT0 - INPUT1
+
+
+class TestModel(nn.Module):  # Runs SumModule and DiffModule on the same inputs, each with its own device.
+    def __init__(self, device0, device1):  # device0 is given to SumModule, device1 to DiffModule
+        super(TestModel, self).__init__()
+        self.device0 = device0
+        self.device1 = device1
+
+        self.layer1 = SumModule(self.device0)
+        self.layer2 = DiffModule(self.device1)
+
+    def forward(self, INPUT0, INPUT1):  # returns the tuple (sum, difference)
+        op0 = self.layer1(INPUT0, INPUT1)
+        op1 = self.layer2(INPUT0, INPUT1)  # receives the original inputs, not op0
+        return op0, op1
+
+
+if torch.cuda.device_count() < 2:  # both device pairs below reference "cuda:1"
+    print("Need at least 2 GPUs to run this test")
+    exit(1)  # non-zero exit status; NOTE(review): exit() is the site-provided builtin, sys.exit is the usual script form
+
+devices = [("cuda:1", "cuda:0"), ("cpu", "cuda:1")]  # (device0, device1) per model, in the same order as model_names
+model_names = ["libtorch_multi_gpu", "libtorch_multi_device"]
+
+for device_pair, model_name in zip(devices, model_names):  # pairs each device tuple with its model name by position
+    model = TestModel(device_pair[0], device_pair[1])
+    model_path = "models/" + model_name + "/1/model.pt"  # relative path; NOTE(review): nothing here creates this directory - confirm the caller does
+    scripted_model = torch.jit.script(model)  # script the module so it can be saved as model.pt
+    scripted_model.save(model_path)
diff --git a/qa/L0_io/test.sh b/qa/L0_io/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -38,7 +38,8 @@ if [ ! -z "$TEST_REPO_ARCH" ]; then
     REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH}
 fi
 
-export CUDA_VISIBLE_DEVICES=0,1,2,3
+# This test requires at least 2 GPUs to test h2d and d2d transfer combinations
+export CUDA_VISIBLE_DEVICES=0,1
 
 IO_TEST_UTIL=./memory_alloc
 CLIENT_LOG="./client.log"
@@ -147,7 +148,6 @@ cp -r $ENSEMBLEDIR/nop_TYPE_FP32_-1 $MODELSDIR/. && \
 
 # prepare libtorch multi-device and multi-gpu models
 cp -r ../L0_libtorch_instance_group_kind_model/models/libtorch_multi_device $MODELSDIR/.
-cp ../L0_libtorch_instance_group_kind_model/gen_models.py ./gen_libtorch_model.py
 mkdir -p $MODELSDIR/libtorch_multi_device/1
 mkdir -p $MODELSDIR/libtorch_multi_gpu/1
 cp $MODELSDIR/libtorch_multi_device/config.pbtxt $MODELSDIR/libtorch_multi_gpu/.

diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -53,22 +53,38 @@ function prepare_tensorrtllm() {
     # FIXME: Remove when testing TRT-LLM containers built from source
     pip install -r requirements.txt
 
-    MODEL="llama-3-8b-instruct"
+    MODEL="meta-llama/Meta-Llama-3.1-8B-Instruct"
     MODEL_REPO="tests/tensorrtllm_models"
-    rm -rf ${MODEL_REPO}
-
-    # FIXME: This may require an upgrade each release to match the TRT-LLM version,
-    # and would likely be easier to use trtllm-build directly for test purposes.
-    # Use Triton CLI to prepare model repository for testing
-    pip install git+https://github.com/triton-inference-server/[email protected]
-    # NOTE: Could use ENGINE_DEST_PATH set to NFS mount for pre-built engines in future
-    triton import \
-        --model ${MODEL}  \
-        --backend tensorrtllm \
-        --model-repository "${MODEL_REPO}"
-
-    # WAR for tests expecting default name of "tensorrt_llm_bls"
-    mv "${MODEL_REPO}/${MODEL}" "${MODEL_REPO}/tensorrt_llm_bls"
+    mkdir -p ${MODEL_REPO}
+    cp /app/all_models/inflight_batcher_llm/* "${MODEL_REPO}" -r
+    # Ensemble model is not needed for the test
+    rm -rf ${MODEL_REPO}/ensemble
+
+    # 1. Download model from HF
+    huggingface-cli download ${MODEL}
+
+    HF_LLAMA_MODEL=`python3 -c "from pathlib import Path; from huggingface_hub import hf_hub_download; print(Path(hf_hub_download('${MODEL}', filename='config.json')).parent)"`
+    CKPT_PATH=/tmp/ckpt/llama/3.1-8b-instruct/
+    ENGINE_PATH=/tmp/engines/llama/3.1-8b-instruct/
+
+    # 2. Convert weights
+    python3 /app/examples/llama/convert_checkpoint.py --model_dir ${HF_LLAMA_MODEL} \
+        --output_dir ${CKPT_PATH} \
+        --dtype float16
+
+    # 3. Build engine
+    # max_batch_size set to 128 to avoid OOM errors
+    trtllm-build --checkpoint_dir ${CKPT_PATH} \
+        --gemm_plugin auto \
+        --max_batch_size 128 \
+        --output_dir ${ENGINE_PATH}
+
+    # 4. Prepare model repository
+    FILL_TEMPLATE="/app/tools/fill_template.py"
+    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1,max_queue_size:0
+    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
+    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
+    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},batching_strategy:inflight_fused_batching,max_queue_size:0,max_queue_delay_microseconds:1000,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,exclude_input_in_output:True
 }
 
 function pre_test() {