From 31b14667deb9322f16f58958751c3e0d1717ae21 Mon Sep 17 00:00:00 2001
From: Mamta Singh
Date: Mon, 29 Apr 2024 19:22:26 +0530
Subject: [PATCH] Update README.md to fix broken links and extend hf_download
 to support allow and ignore patterns when downloading model files from
 Hugging Face

Signed-off-by: vbaddi
---
 QEfficient/cloud/execute.py                   |  4 +--
 QEfficient/cloud/infer.py                     | 27 ++++++++++---------
 .../exporter/export_hf_to_cloud_ai_100.py     | 21 ++++++++++-----
 QEfficient/exporter/export_utils.py           | 18 ++++++++-----
 .../generation/text_generation_inference.py   |  4 +--
 QEfficient/utils/__init__.py                  | 11 +++++---
 README.md                                     | 18 ++++++-------
 notebooks/QEfficientGPT2.ipynb                | 11 ++++++--
 notebooks/QEfficientMPT.ipynb                 | 10 +++++--
 tests/utils.py                                | 19 ++++++++++---
 10 files changed, 92 insertions(+), 51 deletions(-)

diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py
index b83ff0324..6c53ce73c 100644
--- a/QEfficient/cloud/execute.py
+++ b/QEfficient/cloud/execute.py
@@ -8,8 +8,8 @@
 import argparse
 from typing import List

-from transformers import AutoTokenizer
 from huggingface_hub import login
+from transformers import AutoTokenizer

 from QEfficient.generation.text_generation_inference import latency_stats_kv
 from QEfficient.utils import hf_download
@@ -35,7 +35,7 @@ def main(
     if hf_token is not None:
         login(hf_token)
     # Download tokenizer along with model if it doesn't exist
-    model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir)
+    model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json"])
     tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True)

     latency_stats_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=devices, prompt=prompt)
diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 376eb1e4d..687a4872c 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -5,20 +5,20 @@
 #
 # -----------------------------------------------------------------------------

-import os
-import shutil
 import argparse
+import os
 from typing import List

-from transformers import AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import login
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
 import QEfficient
-from QEfficient.utils import hf_download
 from QEfficient.cloud.compile import main as compile
-from QEfficient.utils.constants import Constants, QEFF_MODELS_DIR
-from QEfficient.utils.logging_utils import logger
 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
 from QEfficient.generation.text_generation_inference import latency_stats_kv
+from QEfficient.utils import hf_download
+from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
+from QEfficient.utils.logging_utils import logger

 """
 1. Check if compiled qpc for given config already exists, if it does jump to execute, else
@@ -35,9 +35,7 @@ def qpc_exists(qpc_dir_path: str) -> bool:
     :param dir_path: str. Path of qpc directory.
     :return: bool.
     """
-    return (os.path.isdir(qpc_dir_path) and
-            os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin")))
-
+    return os.path.isdir(qpc_dir_path) and os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin"))


 def onnx_exists(onnx_file_path: str) -> bool:
@@ -81,7 +79,11 @@ def main(
     # Get tokenizer
     if hf_token is not None:
         login(hf_token)
-    model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir)
+    model_hf_path = hf_download(
+        repo_id=model_name,
+        cache_dir=cache_dir,
+        ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"],
+    )
     tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True)

     if qpc_exists(qpc_dir_path):
@@ -128,7 +130,7 @@ def main(
         kv=True,
         form_factor="cloud",
         return_path=True,
-        tokenizer=tokenizer
+        tokenizer=tokenizer,
     )
     assert (
         generated_onnx_path == onnx_model_path
@@ -194,7 +196,8 @@ def main(
         help="Input prompt, if executing for batch size>1, pass input promprs in single string but seperate with pipe (|) symbol",
     )
     parser.add_argument(
-        "--aic_enable_depth_first", "--aic-enable-depth-first",
+        "--aic_enable_depth_first",
+        "--aic-enable-depth-first",
         action="store_true",
         help="If passed, this option will be enabled during compilation, disabled by default",
     )
diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py
index df9912bfa..cbd2495eb 100644
--- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py
+++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py
@@ -6,18 +6,17 @@
 # -----------------------------------------------------------------------------

 import os
-from typing import Tuple, Optional
 import shutil
+from typing import Optional, Tuple

 import torch
-from transformers import AutoTokenizer
 from huggingface_hub import login
+from transformers import AutoTokenizer

 from QEfficient.exporter.export_utils import export_onnx, fix_onnx_fp16, generate_input_files, run_model_on_ort
-from QEfficient.utils.constants import Constants
-from QEfficient.utils import hf_download
 from QEfficient.transformers.modeling_utils import transform
-from QEfficient.utils.constants import QEFF_MODELS_DIR
+from QEfficient.utils import hf_download
+from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
 from QEfficient.utils.logging_utils import logger


@@ -80,7 +79,11 @@ def convert_to_cloud_bertstyle(
     try:
         if hf_token:
             login(hf_token)
-        model_hf_path = hf_download(repo_id=model_name, cache_dir=Constants.CACHE_DIR)
+        model_hf_path = hf_download(
+            repo_id=model_name,
+            cache_dir=Constants.CACHE_DIR,
+            ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"],
+        )
         model = model_class.from_pretrained(model_hf_path, cache_dir=Constants.CACHE_DIR, use_cache=True)
     except Exception as e:
         print(f"Failed to download the {model_name} model from Huggingface:%s", e)
@@ -238,7 +241,11 @@ def convert_to_cloud_kvstyle(
     try:
         if hf_token:
             login(hf_token)
-        model_hf_path = hf_download(repo_id=model_name, cache_dir=Constants.CACHE_DIR)
+        model_hf_path = hf_download(
+            repo_id=model_name,
+            cache_dir=Constants.CACHE_DIR,
+            ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"],
+        )
         model = model_class.from_pretrained(model_hf_path, cache_dir=Constants.CACHE_DIR, use_cache=True)
     except Exception as e:
         print(f"Failed to download the {model_name} model from Huggingface:%s", e)
diff --git a/QEfficient/exporter/export_utils.py b/QEfficient/exporter/export_utils.py
index 775677d09..a2f3e9782 100644
--- a/QEfficient/exporter/export_utils.py
+++ 
b/QEfficient/exporter/export_utils.py @@ -329,12 +329,14 @@ def run_model_on_ort( past_value_mean = past_key_sum / num print(f"past_keys (mean) \t\t {past_key_mean}") print(f"past_value (mean) \t\t {past_value_mean}") + print("\n=============================================================\n") return input_names, ort_outputs except Exception as e: model = onnx.load(onnx_path, load_external_data=False) input_names = [x.name for x in model.graph.input] print(f"Failed to run the onnx {onnx_path} model in onnx runtime:%s", e) + print("\n=============================================================\n") return input_names, None @@ -373,17 +375,19 @@ def compile_kv_model_on_cloud_ai_100( custom_io_path: str, aic_enable_depth_first: bool, mos: int = -1, - device_group: List[int]=[0], + device_group: List[int] = [0], **kwargs, ) -> bool: import shutil - aic_binary_dir = os.path.join(base_path, "qpcs") + aic_binary_dir = os.path.join(base_path, "qpcs") if os.path.isdir(aic_binary_dir): shutil.rmtree(aic_binary_dir) - assert os.path.isfile(specializations_json), f"Please use 'from QEfficient.cloud.compile import main as compile', as {specializations_json} file was not found" + assert os.path.isfile( + specializations_json + ), f"Please use 'from QEfficient.cloud.compile import main as compile', as {specializations_json} file was not found" assert os.path.isfile(custom_io_path), f"{custom_io_path} file was not found!" command = [ "/opt/qti-aic/exec/qaic-exec", @@ -400,7 +404,7 @@ def compile_kv_model_on_cloud_ai_100( ] if mxfp6: command.append("-mxfp6-matmul") - if (mos>0): + if mos > 0: command.append(f"-mos={mos}") if aic_enable_depth_first: command.append("-aic-enable-depth-first") @@ -414,14 +418,14 @@ def compile_kv_model_on_cloud_ai_100( } ], } - mdp_ts_config_path = os.path.join(base_path, f"mdp_ts_config.json") + mdp_ts_config_path = os.path.join(base_path, "mdp_ts_config.json") with open(mdp_ts_config_path, "w") as file: json.dump(mdp_ts_config, file, indent=4) command.append(f"-mdp-load-partition-config={mdp_ts_config_path}") print("Running AI 100 compiler:", " ".join(command)) result = subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) - if result.returncode !=0: + if result.returncode != 0: raise RuntimeError("Compilation Failed!!, please check compilation arguments.") - print(f"\n=============== Compilation Done! ===============\n") + print("\n===================== Compilation Done! 
=====================\n") return result.returncode == 0, aic_binary_dir diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 142395235..32237a2b9 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -204,7 +204,7 @@ def latency_stats_kv( print("Total (E2E) inference time is=", round(total_time, 2)) return print() - print("*****************Performance Stats**********************") + print("===================== Performance Stats =====================") if batch_size > 1: print("Prefill time a.k.a TTFT (batch) is :", round(prefill_time, 2), "s") print("Decode (batch):", round(decode_perf * batch_size, 2), "tok/s") @@ -215,4 +215,4 @@ def latency_stats_kv( print("Decode:", round(decode_perf, 2), "tok/s") print("E2E:", round(total_perf, 2), "tok/s") print("Total (E2E) inference time is=", round(total_time, 2), "s") - print("********************************************************") + print("=============================================================") diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 833c4539a..1929b8d4a 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -6,17 +6,19 @@ # ----------------------------------------------------------------------------- import os -import requests -from typing import Optional +from typing import List, Optional -from requests.exceptions import HTTPError +import requests from huggingface_hub import snapshot_download +from requests.exceptions import HTTPError def hf_download( repo_id: Optional[str] = None, cache_dir: Optional[str] = None, hf_token: Optional[str] = None, + allow_patterns: Optional[List[str]] = None, + ignore_patterns: Optional[List[str]] = None, ): # Setup cache and local dir local_dir = None @@ -37,7 +39,8 @@ def hf_download( revision="main", resume_download=True, token=hf_token, - ignore_patterns=["*.txt", "*.msgpack", "*.h5", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"], + allow_patterns=allow_patterns, + ignore_patterns=ignore_patterns, ) break except requests.ReadTimeout as e: diff --git a/README.md b/README.md index 1017041d7..ed83b6182 100644 --- a/README.md +++ b/README.md @@ -99,11 +99,11 @@ In summary: | High Level APIs | Sample use | Arguments | |-----------------|------------|-------------------| -| QEfficient.cloud.infer | [click here](#qeff-python-infer-api-e2e) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • prompt : Optinoal [Default-"My name is"]
  • | -| QEfficient.cloud.execute | [click here](#qeff-python-execute-api) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • prompt : Optional [Default-"My name is"]
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • | +| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • prompt : Optional [Default-"My name is"]
  • | +| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexcute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • prompt : Optional [Default-"My name is"]
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • | -### 1. Use QEfficient.cloud.infer for +### 1. Use QEfficient.cloud.infer This is the single e2e python api in the library, which takes model_card name as input along with other compile args if necessary and does everything in one go. @@ -160,10 +160,10 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 | Low Level APIs | Sample use | Arguments | |-----------------|------------|-------------------| -| QEfficient.transform | [click here](#) |
  • model : $\color{green} {Mandatory}$
  • Type : Optional [Default- "Transformers"]
  • form_factor : Optional [Default-"cloud"]
  • | -| qualcomm_efficient_converter | [click here](#qeff-python-execute-api) |
  • mode_name : $\color{green} {Mandatory}$
  • model_kv : $\color{green} {Mandatory}$ [Optional when model_class passed]
  • model_class : $\color{green} {Mandatory}$ [Optional when model_kv passed]
  • tokenizer : Optional
  • onnx_path : Optional
  • hf_token : Optional
  • seq_length : Optional [Default-128]
  • input_str : Optional [Default-"My name is"]
  • kv : Optional [Default-True]
  • return_path : Optional [Default-False]
  • form_factor : Optional [Default-"cloud"]
  • save_fp32_onnx : Optional [Default-False]
  • save_fp16_onnx : Optional [Default-True]
  • *Both save_fp32_onnx and save_fp16_onnx can't be false*
  • | -| compile_kv_model_on_cloud_ai_100 | [click here](#3-compile-on-cloud-ai-100) |
  • onnx_path : $\color{green} {Mandatory}$
  • specializations_json : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • base_path : $\color{green} {Mandatory}$
  • mxfp6 : $\color{green} {Mandatory}$
  • custom_io_path : $\color{green} {Mandatory}$
  • device_group : Optional [Default -[0]]
  • | -|latency_stats_kv | [click here](#4print-benchmark) |
  • tokenizer : $\color{green} {Mandatory}$
  • qpc : $\color{green} {Mandatory}$
  • prompt : $\color{green} {Mandatory}$
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • | +| QEfficient.transform | [click here](#1-model-download-and-transform) |
  • model : $\color{green} {Mandatory}$
  • Type : Optional [Default-"Transformers"]
  • form_factor : Optional [Default-"cloud"]
  • | +| qualcomm_efficient_converter | [click here](#2-onnx-export-of-transformed-model) |
  • model_name : $\color{green} {Mandatory}$
  • model_kv : $\color{green} {Mandatory}$ [Optional when model_class passed]
  • model_class : $\color{green} {Mandatory}$ [Optional when model_kv passed]
  • tokenizer : Optional
  • onnx_path : Optional
  • hf_token : Optional
  • seq_length : Optional [Default-128]
  • input_str : Optional [Default-"My name is"]
  • kv : Optional [Default-$\color{green} {True}$]
  • return_path : Optional [Default-False]
  • form_factor : Optional [Default-"cloud"]
  • save_fp32_onnx : Optional [Default-False]
  • save_fp16_onnx : Optional [Default-True]
  • *Both save_fp32_onnx and save_fp16_onnx can't be false*
  • | +| compile | [click here](#3-compile-on-cloud-ai-100) |
  • onnx_path : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional [Default-True]
  • | +|latency_stats_kv | [click here](#4-run-benchmark) |
  • tokenizer : $\color{green} {Mandatory}$
  • qpc : $\color{green} {Mandatory}$
  • prompt : $\color{green} {Mandatory}$
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • |

 ### 1. Model download and transform

@@ -189,7 +189,7 @@ model_name = "gpt2"  # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.


-model_hf_path = hf_download(repo_id=model_name, cache_dir=Constants.CACHE_DIR)
+model_hf_path = hf_download(repo_id=model_name, cache_dir=Constants.CACHE_DIR, ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"])
 model_hf = GPT2LMHeadModel.from_pretrained(model_hf_path, use_cache=True)
 model_hf.eval()
 print(f"{model_name} from hugging-face \n", model_hf)
@@ -243,8 +243,8 @@ generated_qpc_path = compile(
     onnx_path=onnx_path,
     num_cores=14,
     qpc_path=base_path,
-    mxfp6=True,
     device_group=[0],
+    mxfp6=True,
 )
 ```
 ### 4. Run Benchmark
diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb
index 9883e52ad..537475dbf 100644
--- a/notebooks/QEfficientGPT2.ipynb
+++ b/notebooks/QEfficientGPT2.ipynb
@@ -25,10 +25,13 @@
   "source": [
    "# Initiate the Orignal Transformer model\n",
    "import os\n",
-   "from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel\n",
+   "\n",
    "from transformers import AutoTokenizer\n",
+   "from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel\n",
+   "\n",
    "from QEfficient.utils import hf_download\n",
    "from QEfficient.utils.constants import Constants\n",
+   "\n",
    "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n",
    "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n",
    "\n",
@@ -37,7 +40,11 @@
    "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n",
    "model_name = \"gpt2\"  # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n",
    "\n",
-   "model_hf_path = hf_download(repo_id=model_name, cache_dir=Constants.CACHE_DIR)\n",
+   "model_hf_path = hf_download(\n",
+   "    repo_id=model_name,\n",
+   "    cache_dir=Constants.CACHE_DIR,\n",
+   "    ignore_patterns=[\"*.txt\", \"*.onnx\", \"*.ot\", \"*.md\", \"*.tflite\", \"*.pdf\"],\n",
+   ")\n",
    "model_hf = GPT2LMHeadModel.from_pretrained(model_hf_path, use_cache=True)\n",
    "model_hf.eval()\n",
    "print(f\"{model_name} from hugging-face \\n\", model_hf)"
diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb
index f0968ff12..ecce36b23 100644
--- a/notebooks/QEfficientMPT.ipynb
+++ b/notebooks/QEfficientMPT.ipynb
@@ -25,8 +25,10 @@
   "source": [
    "# Initiate the Orignal Transformer model\n",
    "import os\n",
-   "from transformers.models.mpt.modeling_mpt import MptForCausalLM\n",
+   "\n",
    "from transformers import AutoTokenizer\n",
+   "from transformers.models.mpt.modeling_mpt import MptForCausalLM\n",
+   "\n",
    "from QEfficient.utils import hf_download\n",
    "from QEfficient.utils.constants import Constants\n",
    "\n",
@@ -37,7 +39,11 @@
    "\n",
    "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n",
    "model_name = \"mosaicml/mpt-7b\"  # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n",
-   "model_hf_path = hf_download(repo_id=model_name, cache_dir=Constants.CACHE_DIR)\n",
+   "model_hf_path = hf_download(\n",
+   "    repo_id=model_name,\n",
+   "    cache_dir=Constants.CACHE_DIR,\n",
+   "    ignore_patterns=[\"*.txt\", \"*.onnx\", \"*.ot\", \"*.md\", \"*.tflite\", \"*.pdf\"],\n",
+   ")\n",
    "model_hf = MptForCausalLM.from_pretrained(model_hf_path, use_cache=True)\n",
    "model_hf.eval()\n",
    "print(f\"{model_name} from 
hugging-face \\n\", model_hf)" diff --git a/tests/utils.py b/tests/utils.py index dd6edc407..8ff1f627d 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -14,7 +14,7 @@ from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100 from QEfficient.utils import hf_download -from QEfficient.utils.constants import Constants, QEFF_MODELS_DIR, ROOT_DIR +from QEfficient.utils.constants import QEFF_MODELS_DIR, ROOT_DIR, Constants from QEfficient.utils.device_utils import get_available_device_id from QEfficient.utils.run_utils import ApiRunner @@ -32,6 +32,7 @@ def prepare_work_dir(work_dir): # create empty temp dir os.makedirs(temp_dir) + def remove_temp_dir(work_dir): """ Function to remove the temp work directory location @@ -42,18 +43,20 @@ def remove_temp_dir(work_dir): if os.path.exists(temp_dir): shutil.rmtree(temp_dir) + def get_tokenizer(model_name): """ Function to get tokenizer info from transformers.AutoTokenizer :param model_name: str :return tokenizer """ - model_hf_path = hf_download(repo_id=model_name) + model_hf_path = hf_download(repo_id=model_name, allow_patterns=["*.json"]) tokenizer = transformers.AutoTokenizer.from_pretrained(model_hf_path, padding_side="left") if tokenizer.pad_token_id is None: tokenizer.pad_token_id = tokenizer.eos_token_id return tokenizer + def load_pytorch_model(model_name, model_class): """ Function to load model from huggingface and transform to KV model @@ -61,11 +64,14 @@ def load_pytorch_model(model_name, model_class): :param model_class: type :return model_hf """ - model_path = hf_download(repo_id=model_name) + model_path = hf_download( + repo_id=model_name, ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"] + ) model_hf = model_class.from_pretrained(model_path, use_cache=True) model_hf.eval() return model_hf + def transform_pt_model_with_qeff(model_hf): """ Function to take huggingface model and transform to KV model @@ -76,6 +82,7 @@ def transform_pt_model_with_qeff(model_hf): model_kv.eval() return model_kv + def export_onnx(model_kv, tokenizer, model_name, model_class): """ Function to export onnx model @@ -91,9 +98,11 @@ def export_onnx(model_kv, tokenizer, model_name, model_class): tokenizer=tokenizer, onnx_dir_path=onnx_dir_path, kv=True, - return_path=True) + return_path=True, + ) return base_path, onnx_model_path + def set_up(model_config): """ Set up function to set up the test environment for TestQEfficientModel class @@ -148,6 +157,7 @@ def set_up(model_config): setup_info["ort_tokens"] = ort_tokens return setup_info + def get_cloud_ai_100_tokens(setup_info): """ Test function to validate the llama model before and after KV changes on Cloud AI 100 @@ -168,6 +178,7 @@ def get_cloud_ai_100_tokens(setup_info): device_group=[0], ) from QEfficient.generation.cloud_infer import QAICInferenceSession + session = QAICInferenceSession(test_qpcs_path, device_id, enable_debug_logs=False) try: cloud_ai_100_tokens = setup_info["api_runner"].run_kv_model_on_cloud_ai_100(