Jenkins Test Time Reduction Expe
Signed-off-by: Abukhoyer Shaik <[email protected]>
abukhoy committed Jan 20, 2025
1 parent 2904183 commit 5b75a90
Showing 8 changed files with 251 additions and 255 deletions.
44 changes: 22 additions & 22 deletions scripts/Jenkinsfile
@@ -56,12 +56,30 @@ pipeline {
mkdir -p $PWD/Non_qaic &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/Non_qaic &&
pytest tests -m '(not cli) and (on_qaic) and (not qnn)' -n 4 --junitxml=tests/tests_log2.xml &&
pytest tests -m '(not cli) and (on_qaic) and (not qnn)' -n auto --junitxml=tests/tests_log2.xml &&
deactivate"
'''
}
}
}
stage('QNN Non-CLI Tests') {
steps {
timeout(time: 60, unit: 'MINUTES') {
sh '''
sudo docker exec ${BUILD_TAG} bash -c "
source /qnn_sdk/bin/envsetup.sh &&
source /qnn_sdk/bin/envcheck -c &&
cd /efficient-transformers &&
. preflight_qeff/bin/activate &&
mkdir -p $PWD/Qnn_non_cli &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/Qnn_non_cli &&
pytest tests -m '(not cli) and (qnn) and (on_qaic)' -n auto --junitxml=tests/tests_log3.xml &&
deactivate"
'''
}
}
}
}
}
stage('CLI Tests') {
@@ -74,7 +92,7 @@ pipeline {
mkdir -p $PWD/cli &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/cli &&
pytest tests -m '(cli and not qnn)' --junitxml=tests/tests_log3.xml &&
pytest tests -m '(cli and not qnn)' --junitxml=tests/tests_log4.xml &&
deactivate"
'''
}
@@ -92,31 +110,13 @@ pipeline {
mkdir -p $PWD/Qnn_cli &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/Qnn_cli &&
pytest tests -m '(cli and qnn)' --junitxml=tests/tests_log4.xml &&
deactivate"
'''
}
}
}
stage('QNN Non-CLI Tests') {
steps {
timeout(time: 60, unit: 'MINUTES') {
sh '''
sudo docker exec ${BUILD_TAG} bash -c "
source /qnn_sdk/bin/envsetup.sh &&
source /qnn_sdk/bin/envcheck -c &&
cd /efficient-transformers &&
. preflight_qeff/bin/activate &&
mkdir -p $PWD/Qnn_non_cli &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/Qnn_non_cli &&
pytest tests -m '(not cli) and (qnn) and (on_qaic)' --junitxml=tests/tests_log5.xml &&
pytest tests -m '(cli and qnn)' --junitxml=tests/tests_log5.xml &&
junitparser merge tests/tests_log1.xml tests/tests_log2.xml tests/tests_log3.xml tests/tests_log4.xml tests/tests_log5.xml tests/tests_log.xml &&
deactivate"
'''
}
}
}
}
}

post {
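The Jenkinsfile changes switch the non-CLI test run from a fixed "-n 4" to pytest-xdist's "-n auto" (one worker per available CPU), move the QNN non-CLI stage earlier in the pipeline with the same "-n auto" parallelism and its own 60-minute timeout, renumber the JUnit XML reports, and merge all five of them into tests/tests_log.xml with junitparser at the end of the QNN CLI stage. A minimal Python sketch of that final merge step, using junitparser's library API rather than its CLI; the file names are the ones used in the pipeline above:

from junitparser import JUnitXml

# Reports produced by the individual stages (tests_log1.xml .. tests_log5.xml).
report_files = [f"tests/tests_log{i}.xml" for i in range(1, 6)]

merged = JUnitXml()
for path in report_files:
    merged += JUnitXml.fromfile(path)  # append the test suites from each report

merged.write("tests/tests_log.xml")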
25 changes: 12 additions & 13 deletions tests/peft/lora/test_lora_model.py
@@ -14,7 +14,6 @@

from QEfficient import QEffAutoPeftModelForCausalLM
from QEfficient.peft.lora import QEffAutoLoraModelForCausalLM
from QEfficient.utils import load_hf_tokenizer

configs = [
pytest.param(
@@ -227,12 +226,12 @@ def test_auto_lora_model_for_causal_lm_noncb_export_compile_generate(
assert Path(qeff_model.qpc_path).is_dir()

# test generate
prompts = ["hello!", "hi", "hello, my name is", "hey"]
qeff_model.generate(
tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name),
prompts=prompts,
prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"],
)
# prompts = ["hello!", "hi", "hello, my name is", "hey"]
# qeff_model.generate(
# tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name),
# prompts=prompts,
# prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"],
# )


# test the compile and generate workflow in cb mode
@@ -251,9 +250,9 @@ def test_auto_lora_model_for_causal_lm_cb_compile_generate(base_model_name, adap
assert Path(qeff_model.qpc_path).is_dir()

# test generate
prompts = ["hello!", "hi", "hello, my name is", "hey"]
qeff_model.generate(
tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name),
prompts=prompts,
prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"],
)
# prompts = ["hello!", "hi", "hello, my name is", "hey"]
# qeff_model.generate(
# tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name),
# prompts=prompts,
# prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"],
# )
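The LoRA tests now stop after export and compile: the on-device generate() calls, and the load_hf_tokenizer import they needed, are commented out, which is where this file's share of the time saving comes from. If the generation checks should stay runnable on demand, one alternative (not what this commit does, shown only as a sketch) is an opt-in pytest gate; RUN_DEVICE_GENERATE and the marker name below are illustrative:

import os

import pytest

# Opt-in switch: set RUN_DEVICE_GENERATE=1 to exercise the on-device generate path.
requires_device_generate = pytest.mark.skipif(
    os.getenv("RUN_DEVICE_GENERATE") != "1",
    reason="on-device generation checks are opt-in to keep CI time down",
)

The commented-out generate blocks above could then move into separate tests decorated with @requires_device_generate instead of being disabled in place.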
23 changes: 11 additions & 12 deletions tests/peft/test_peft_model.py
@@ -7,7 +7,6 @@

from time import perf_counter

import numpy as np
import onnx
import pytest
import torch
@@ -170,17 +169,17 @@ def test_auto_peft_model_for_causal_lm_compile_generate(base_config, adapter_con
end = perf_counter()
compile_time_0 = end - start

qeff_model.generate(
input_ids=np.zeros((batch_size, 32), dtype="int64"),
attention_mask=np.concatenate(
[
np.ones((batch_size, 10), dtype="int64"),
np.zeros((batch_size, 22), dtype="int64"),
],
axis=1,
),
max_new_tokens=10,
)
# qeff_model.generate(
# input_ids=np.zeros((batch_size, 32), dtype="int64"),
# attention_mask=np.concatenate(
# [
# np.ones((batch_size, 10), dtype="int64"),
# np.zeros((batch_size, 22), dtype="int64"),
# ],
# axis=1,
# ),
# max_new_tokens=10,
# )

start = perf_counter()
qeff_model.compile(batch_size=batch_size, prefill_seq_len=32, ctx_len=128)
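test_peft_model.py keeps its compile-cache timing check (compile once, time it, compile a second time, time it again) and only disables the generate() call in between. A small sketch of that pattern, assuming, since the assertion itself falls outside this hunk, that the second compile is expected to be served from the cache and therefore much faster; compile_fn and the 10x margin are illustrative:

from time import perf_counter

def assert_second_compile_hits_cache(compile_fn, margin: float = 10.0) -> None:
    start = perf_counter()
    compile_fn()  # cold compile, populates the on-disk cache
    cold = perf_counter() - start

    start = perf_counter()
    compile_fn()  # same arguments, expected to be a cache hit
    warm = perf_counter() - start

    assert warm * margin < cold, f"cache hit not faster: warm={warm:.2f}s cold={cold:.2f}s"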
34 changes: 17 additions & 17 deletions tests/qnn_tests/test_causal_lm_models_qnn.py
@@ -86,9 +86,9 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(

pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model)

assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), (
"Tokens don't match for HF PyTorch model output and KV PyTorch model output"
)
assert (
pytorch_hf_tokens == pytorch_kv_tokens
).all(), "Tokens don't match for HF PyTorch model output and KV PyTorch model output"

onnx_model_path = qeff_model.export()
ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path)
@@ -106,12 +106,12 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
aic_enable_depth_first=False,
enable_qnn=True,
)
exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR)
cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size
gen_len = ort_tokens.shape[-1]
assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), (
"Tokens don't match for ONNXRT output and Cloud AI 100 output."
)
# exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR)
# cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size
# gen_len = ort_tokens.shape[-1]
# assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), (
# "Tokens don't match for ONNXRT output and Cloud AI 100 output."
# )

# testing for CB models
model_hf, _ = load_causal_lm_model(model_config)
@@ -145,14 +145,14 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
full_batch_size=full_batch_size,
enable_qnn=True,
)
exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts)

assert all(
[
all(pt_token[:24] == cloud_token[:24])
for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids)
]
), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output."
# exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts)

# assert all(
# [
# all(pt_token[:24] == cloud_token[:24])
# for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids)
# ]
# ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output."


@pytest.mark.on_qaic
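The -m expressions in the Jenkinsfile, for example '(not cli) and (qnn) and (on_qaic)', select tests through markers such as the @pytest.mark.on_qaic visible above. For reference, a sketch of how custom markers like these are typically registered so pytest does not warn about unknown marks; the repository's actual registration (pytest.ini or conftest.py) is not part of this diff, so the descriptions are placeholders:

# conftest.py (sketch)
def pytest_configure(config):
    for marker, description in [
        ("on_qaic", "tests that require a Cloud AI 100 device"),
        ("cli", "tests that exercise the command-line interface"),
        ("qnn", "tests that go through the QNN SDK path"),
    ]:
        config.addinivalue_line("markers", f"{marker}: {description}")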
41 changes: 20 additions & 21 deletions tests/text_generation/test_text_generation.py
@@ -8,7 +8,6 @@
import pytest
from transformers import AutoModelForCausalLM

from QEfficient.generation.text_generation_inference import TextGeneration
from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
from QEfficient.utils import hf_download
from QEfficient.utils._utils import load_hf_tokenizer
@@ -65,7 +64,7 @@ def test_generate_text_stream(
model_config = {"model_name": model_name, "n_layer": n_layer}
model_hf, _ = load_causal_lm_model(model_config)

tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name)
tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name) # noqa: F841

qeff_model = QEFFAutoModelForCausalLM(model_hf)

@@ -75,7 +74,7 @@
if not device_id:
pytest.skip("No available devices to run model on Cloud AI 100")

qpc_path = qeff_model.compile(
qpc_path = qeff_model.compile( # noqa: F841
prefill_seq_len=prompt_len,
ctx_len=ctx_len,
num_cores=14,
@@ -84,21 +83,21 @@
full_batch_size=full_batch_size,
)

exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR, generation_len=max_gen_len)
cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size
cloud_ai_100_output = [tokenizer.decode(token, skip_special_tokens=True) for token in cloud_ai_100_tokens[0]]

text_generator = TextGeneration(
tokenizer=tokenizer,
qpc_path=qpc_path,
device_id=device_id,
ctx_len=ctx_len,
full_batch_size=full_batch_size,
)
stream_tokens = []
for decoded_tokens in text_generator.generate_stream_tokens(Constants.INPUT_STR, generation_len=max_gen_len):
stream_tokens.extend(decoded_tokens)

assert cloud_ai_100_output == stream_tokens, (
f"Deviation in output observed while comparing regular execution and streamed output: {cloud_ai_100_output} != {stream_tokens}"
)
# exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR, generation_len=max_gen_len)
# cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size
# cloud_ai_100_output = [tokenizer.decode(token, skip_special_tokens=True) for token in cloud_ai_100_tokens[0]]

# text_generator = TextGeneration(
# tokenizer=tokenizer,
# qpc_path=qpc_path,
# device_id=device_id,
# ctx_len=ctx_len,
# full_batch_size=full_batch_size,
# )
# stream_tokens = []
# for decoded_tokens in text_generator.generate_stream_tokens(Constants.INPUT_STR, generation_len=max_gen_len):
# stream_tokens.extend(decoded_tokens)

# assert cloud_ai_100_output == stream_tokens, (
# f"Deviation in output observed while comparing regular execution and streamed output: {cloud_ai_100_output} != {stream_tokens}"
# )
38 changes: 19 additions & 19 deletions tests/transformers/models/test_causal_lm_models.py
@@ -110,9 +110,9 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(

pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model)

assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), (
"Tokens don't match for HF PyTorch model output and KV PyTorch model output"
)
assert (
pytorch_hf_tokens == pytorch_kv_tokens
).all(), "Tokens don't match for HF PyTorch model output and KV PyTorch model output"

onnx_model_path = qeff_model.export()
ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path, is_tlm=is_tlm)
@@ -130,12 +130,12 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
aic_enable_depth_first=False,
num_speculative_tokens=num_speculative_tokens,
)
exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR)
cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size
gen_len = ort_tokens.shape[-1]
assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), (
"Tokens don't match for ONNXRT output and Cloud AI 100 output."
)
# exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR)
# cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size
# gen_len = ort_tokens.shape[-1]
# assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), (
# "Tokens don't match for ONNXRT output and Cloud AI 100 output."
# )

# testing for CB models
model_hf, _ = load_causal_lm_model(model_config)
@@ -169,14 +169,14 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
full_batch_size=full_batch_size,
num_speculative_tokens=num_speculative_tokens,
)
exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts)
# exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts)

assert all(
[
all(pt_token[:24] == cloud_token[:24])
for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids)
]
), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output."
# assert all(
# [
# all(pt_token[:24] == cloud_token[:24])
# for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids)
# ]
# ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output."


# FIXME: there should be a CB test here
@@ -204,9 +204,9 @@ def test_causal_lm_export_with_deprecated_api(model_name):
new_api_ort_tokens = api_runner.run_kv_model_on_ort(new_api_onnx_model_path)
old_api_ort_tokens = api_runner.run_kv_model_on_ort(old_api_onnx_model_path)

assert (new_api_ort_tokens == old_api_ort_tokens).all(), (
"New API output does not match old API output for ONNX export function"
)
assert (
new_api_ort_tokens == old_api_ort_tokens
).all(), "New API output does not match old API output for ONNX export function"


@pytest.mark.on_qaic
18 changes: 9 additions & 9 deletions tests/transformers/models/test_embedding_models.py
@@ -43,11 +43,11 @@ def check_embed_pytorch_vs_ort_vs_ai100(
pt_embeddings = pt_outputs[0][0].detach().numpy()
# Pytorch transformed model
qeff_model = QEFFAutoModel(pt_model)
qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False)
qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy()
mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings))
print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad)
assert mad <= 0, f"MAD is too high for onnx and Pytorch: {mad}"
# qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False)
# qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy()
# mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings))
# print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad)
# assert mad <= 0, f"MAD is too high for onnx and Pytorch: {mad}"

onnx_model = qeff_model.export()
ort_session = ort.InferenceSession(str(onnx_model))
@@ -71,12 +71,12 @@
qeff_model.compile(
num_cores=14,
)
ai100_output = qeff_model.generate(inputs=inputs)
# ai100_output = qeff_model.generate(inputs=inputs)

# Compare ONNX and AI 100 outputs
mad = np.mean(np.abs(ai100_output - onnx_outputs[0]))
print("Mad for onnx and AI 100 output is ", mad)
assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}"
# mad = np.mean(np.abs(ai100_output - onnx_outputs[0]))
# print("Mad for onnx and AI 100 output is ", mad)
# assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}"


@pytest.mark.on_qaic
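The embedding test disables the PyTorch-vs-transformed-model comparison and the AI 100 output comparison; both rely on a mean-absolute-deviation (MAD) check against a small tolerance. A reusable sketch of that check; the helper name and the 1e-3 default are illustrative:

import numpy as np

def check_mad(reference: np.ndarray, candidate: np.ndarray, tol: float = 1e-3) -> float:
    # Mean absolute deviation between two output arrays.
    mad = float(np.mean(np.abs(reference - candidate)))
    print("MAD between reference and candidate outputs:", mad)
    assert mad <= tol, f"MAD is too high: {mad}"
    return mad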