From 4872ae39bee2f0482db75bb943ef1547749af92c Mon Sep 17 00:00:00 2001
From: Onkar Chougule <168134249+ochougul@users.noreply.github.com>
Date: Thu, 7 Nov 2024 17:11:14 +0530
Subject: [PATCH 1/2] Vllm test (#168)

* [vLLM] Added vLLM install stage and output-consistency test.

Signed-off-by: Mahesh Balasubramanian <quic_mahebala@quicinc.com>

* lint checked

Signed-off-by: Mahesh Balasubramanian <quic_mahebala@quicinc.com>

* Changes made to meet compliance requirements

Signed-off-by: Mahesh Balasubramanian <quic_mahebala@quicinc.com>

* Updated the copyrights

Signed-off-by: Mahesh Balasubramanian <quic_mahebala@quicinc.com>

* Updated test and Jenkins file according to the comments

Signed-off-by: Mahesh Balasubramanian <quic_mahebala@quicinc.com>

---------

Signed-off-by: Mahesh Balasubramanian <quic_mahebala@quicinc.com>
Co-authored-by: Mahesh Balasubramanian <quic_mahebala@quicinc.com>
Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
---
 scripts/Jenkinsfile                        |  37 +++++++-
 tests/vllm/test_qaic_output_consistency.py | 102 +++++++++++++++++++++
 2 files changed, 138 insertions(+), 1 deletion(-)
 create mode 100644 tests/vllm/test_qaic_output_consistency.py

diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile
index ce2e66780..5b081a5c6 100644
--- a/scripts/Jenkinsfile
+++ b/scripts/Jenkinsfile
@@ -48,6 +48,41 @@ pipeline
                 }
             }
         }
+
+
+        stage('Install vLLM')
+        {
+            steps
+            {
+                sh '''
+                    . preflight_qeff/bin/activate
+                    git clone https://github.com/vllm-project/vllm.git
+                    cd vllm
+                    git checkout v0.6.0
+                    git apply /opt/qti-aic/integrations/vllm/qaic_vllm.patch
+                    export VLLM_TARGET_DEVICE="qaic"
+                    pip install -e .
+                '''
+            }
+        }
+
+
+        stage('vLLM Test')
+            {
+                steps
+                {
+                    
+            timeout(time: 660, unit: 'MINUTES') {
+                    sh '''
+                    . preflight_qeff/bin/activate
+                    pytest --disable-warnings -s -v tests/vllm --junitxml=tests/tests_log4.xml
+                    junitparser merge tests/tests_log1.xml tests/tests_log2.xml tests/tests_log3.xml tests/tests_log4.xml tests/tests_log.xml
+                    deactivate
+                    exit
+                    '''     
+                }
+            }
+        }
     }
     post 
     {
@@ -59,4 +94,4 @@ pipeline
          }
     }
        
-}
+}
\ No newline at end of file
diff --git a/tests/vllm/test_qaic_output_consistency.py b/tests/vllm/test_qaic_output_consistency.py
new file mode 100644
index 000000000..e4c2b1a6a
--- /dev/null
+++ b/tests/vllm/test_qaic_output_consistency.py
@@ -0,0 +1,102 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import random
+
+import pytest
+from vllm import LLM, SamplingParams
+
+# Model to test
+test_models = [
+    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+]
+
+# Constants for configuration
+SEQ_LEN = 128
+CTX_LEN = 256
+DECOE_BSZ = 4
+DTYPE = "mxfp6"
+KV_DTYPE = "mxint8"
+DEVICE_GROUP = [0]
+
+
+@pytest.mark.parametrize("model_name", test_models)
+def test_output_consistency(model_name):
+    """This pytest function is used to check the consistency of vLLM.
+       1) Single prompt test to check if the output generated in 5 different
+          runs yields the same results
+       2) Multiple prompt check to test if multiple prompts yield same results
+          if run in different slots.
+
+    Parameters
+    ----------
+    model_name : string
+        Huggingface model card name.
+    """
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=None)
+
+    # Creating LLM Object
+    qllm = LLM(
+        model=model_name,
+        device_group=DEVICE_GROUP,
+        max_num_seqs=DECOE_BSZ,
+        max_model_len=CTX_LEN,
+        max_seq_len_to_capture=SEQ_LEN,
+        quantization=DTYPE,
+        kv_cache_dtype=KV_DTYPE,
+        device="qaic",
+    )
+
+    # Single prompt test
+    prompt1 = ["My name is"]
+
+    output1 = qllm.generate(prompt1 * 5, sampling_params)
+
+    check_output1 = []
+    for i, op in enumerate(output1):
+        check_output1.append(op.outputs[0].text)
+
+
+    # Multiple prompt test
+    outputDict = dict()
+    prompt2 = [
+        "My name is",
+        "How to eat mangosteen?",
+        "How many people died in World War II",
+        "Hello ",
+        "Who is the president of United States",
+        "Who is the president of India",
+        "When it snowfalls in San Diego",
+        "In which country yamana river flows",
+        "How many people died in World War II",
+        "Thy youth is proud livery, so gazed on now",
+        "Will be a tattered weed, of small worth held:" "Then being asked where all thy beauty lies",
+        "Where all the treasure of thy lusty days",
+        "To say, within thine own deep-sunken eyes",
+        "Where is Statue of Liberty located?",
+    ]
+
+    for p in prompt2:
+        outputDict[p] = []
+
+    for _ in range(5):
+        random.shuffle(prompt2)
+        output2 = qllm.generate(prompt2, sampling_params)
+        for i, op in enumerate(output2):
+            generated_text = op.outputs[0].text
+            outputDict[prompt2[i]].append(str(prompt2[i] + generated_text))
+
+    
+    # Assertion to check the consistency of single prompt.
+    assert len(set(check_output1)) == 1, "Outputs from different slots for same prompt does not match!!"
+
+    # Assertion to check multiple prompts.
+    for key in outputDict.keys():
+        assert len(set(outputDict[key])) == 1, "Outputs from different slots for same prompt does not match!!"
+
+    # Assertion to check if any prompts are missed.
+    assert len(prompt2) == len(output2), "Number of Generated Tokens do not match the number of valid inputs!!"
\ No newline at end of file

From 4fdc2a6b949ddf41726d4d1bdb90423f44769474 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Thu, 7 Nov 2024 17:17:14 +0530
Subject: [PATCH 2/2] Exclude tests/vllm from the main Jenkins pytest runs and run formatter

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
---
 scripts/Jenkinsfile                        | 8 ++++----
 tests/vllm/test_qaic_output_consistency.py | 4 +---
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile
index 5b081a5c6..a522002e1 100644
--- a/scripts/Jenkinsfile
+++ b/scripts/Jenkinsfile
@@ -37,10 +37,10 @@ pipeline
                     . preflight_qeff/bin/activate
                     export TOKENIZERS_PARALLELISM=false
                     export QEFF_HOME=$PWD  
-                    pytest tests -m "not cli and not on_qaic" -n auto --junitxml=tests/tests_log1.xml &
-                    pytest tests -m "not cli and on_qaic" -n 4 --junitxml=tests/tests_log2.xml &
+                    pytest tests -m "not cli and not on_qaic" --ignore tests/vllm -n auto --junitxml=tests/tests_log1.xml &
+                    pytest tests -m "not cli and on_qaic" --ignore tests/vllm -n 4 --junitxml=tests/tests_log2.xml &
                     wait
-                    pytest tests -m cli --junitxml=tests/tests_log3.xml
+                    pytest tests -m cli --ignore tests/vllm --junitxml=tests/tests_log3.xml
                     junitparser merge tests/tests_log1.xml tests/tests_log2.xml tests/tests_log3.xml tests/tests_log.xml
                     deactivate
                     exit
@@ -94,4 +94,4 @@ pipeline
          }
     }
        
-}
\ No newline at end of file
+}
diff --git a/tests/vllm/test_qaic_output_consistency.py b/tests/vllm/test_qaic_output_consistency.py
index e4c2b1a6a..00cd5765a 100644
--- a/tests/vllm/test_qaic_output_consistency.py
+++ b/tests/vllm/test_qaic_output_consistency.py
@@ -60,7 +60,6 @@ def test_output_consistency(model_name):
     for i, op in enumerate(output1):
         check_output1.append(op.outputs[0].text)
 
-
     # Multiple prompt test
     outputDict = dict()
     prompt2 = [
@@ -90,7 +89,6 @@ def test_output_consistency(model_name):
             generated_text = op.outputs[0].text
             outputDict[prompt2[i]].append(str(prompt2[i] + generated_text))
 
-    
     # Assertion to check the consistency of single prompt.
     assert len(set(check_output1)) == 1, "Outputs from different slots for same prompt does not match!!"
 
@@ -99,4 +97,4 @@ def test_output_consistency(model_name):
         assert len(set(outputDict[key])) == 1, "Outputs from different slots for same prompt does not match!!"
 
     # Assertion to check if any prompts are missed.
-    assert len(prompt2) == len(output2), "Number of Generated Tokens do not match the number of valid inputs!!"
\ No newline at end of file
+    assert len(prompt2) == len(output2), "Number of Generated Tokens do not match the number of valid inputs!!"