From 09a939538a95862a809d37326c16233d62db0243 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <168134249+ochougul@users.noreply.github.com>
Date: Thu, 6 Jun 2024 20:50:11 +0530
Subject: [PATCH] Adding QEFFAutoModel i.e. model loader for loading any type of model. (#31)

* all changes

Signed-off-by: Onkar Chougule

* only loader changes

Signed-off-by: Onkar Chougule

* removed unused imports

Signed-off-by: Onkar Chougule

* allowed to initialize QEFFAutoLMModel

Signed-off-by: Onkar Chougule

* fixed test bugs

Signed-off-by: Onkar Chougule

* renamed utils.py to _utils.py

Signed-off-by: Onkar Chougule

* added more type hinting and docstrings

Signed-off-by: Onkar Chougule

* addressed review comments, added test file for new interface

Signed-off-by: Onkar Chougule

* enabled CLI APIs

Signed-off-by: Onkar Chougule

* updated README and notebooks
* removed circular import
* added comments on loader files
* separated cross-compile script
* separated utils funcs

Signed-off-by: Onkar Chougule

* bug-fix infer

Signed-off-by: Onkar Chougule

* using QEfficient.export, compile in cloud APIs

Signed-off-by: Onkar Chougule

* cleaner infer, export APIs

Signed-off-by: Onkar Chougule

* addressed review comments

Signed-off-by: Onkar Chougule

* updated notebooks, README
* moved class description to base.py
* added Runtime Enum

Signed-off-by: Onkar Chougule

* updated cloud_ai_100_exec_kv to be callable from QEfficient package

Signed-off-by: Onkar Chougule

* fixed tests

Signed-off-by: Onkar Chougule

* cleaned notebook

Signed-off-by: Onkar Chougule

* added transform call within init
* renamed cross_compile
* updated notebooks
* updated README

Signed-off-by: Onkar Chougule

* addressed review comments

Signed-off-by: Onkar Chougule

---------

Signed-off-by: Onkar Chougule
---
 QEfficient/cloud/compile.py                  |  1 -
 QEfficient/cloud/export.py                   | 18 +-----------
 .../exporter/export_hf_to_cloud_ai_100.py    | 14 ++-------
 .../generation/text_generation_inference.py  |  9 ------
 QEfficient/transformers/modeling_utils.py    | 29 +++++--------------
 notebooks/QEfficientGPT2.ipynb               |  9 ++++++
 notebooks/QEfficientMPT.ipynb                |  9 ++++++
 7 files changed, 29 insertions(+), 60 deletions(-)

diff --git a/QEfficient/cloud/compile.py b/QEfficient/cloud/compile.py
index e2fd3a65c..4e446de05 100644
--- a/QEfficient/cloud/compile.py
+++ b/QEfficient/cloud/compile.py
@@ -6,7 +6,6 @@
 # -----------------------------------------------------------------------------
 
 import argparse
-
 import QEfficient
 
 if __name__ == "__main__":
diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py
index 31f2a814b..1cd7223c1 100644
--- a/QEfficient/cloud/export.py
+++ b/QEfficient/cloud/export.py
@@ -12,24 +12,12 @@
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 
 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
-<<<<<<< HEAD
-<<<<<<< HEAD
-from QEfficient.utils import onnx_exists
-=======
-<<<<<<< HEAD
-=======
->>>>>>> a7db36f (Fixed bug)
-from QEfficient.utils import hf_download, onnx_exists
->>>>>>> a62b22d (Rebased)
 from QEfficient.utils.constants import Constants
 from QEfficient.utils.logging_utils import logger
 
 # Specifically for Docker images.
 ROOT_DIR = os.path.dirname(os.path.abspath(""))
-
-<<<<<<< HEAD
-<<<<<<< HEAD
 def get_onnx_model_path(model_name: str, cache_dir: str, tokenizer: Optional[Union[PreTrainedTokenizerFast, PreTrainedTokenizer]]=None, hf_token: Optional[str] = None):
     """
     exports the model to onnx if pre-exported file is not found and returns onnx_model_path
     """
@@ -60,11 +48,7 @@ def get_onnx_model_path(model_name: str, cache_dir: str, tokenizer: Optional[Uni
     return onnx_model_path
 
 
-=======
-<<<<<<< HEAD
->>>>>>> a62b22d (Rebased)
-=======
->>>>>>> a7db36f (Fixed bug)
+
 def main(
     model_name: str,
     cache_dir: str,
diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py
index 7f000e6f6..a1cc43245 100644
--- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py
+++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py
@@ -56,12 +56,6 @@ def convert_to_cloud_bertstyle(
     Return:
         Path of exported ONNX file.
     """
-    # todo (amitraj) Optimize the onnx export
-    if onnx_dir_path is None:
-        model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name))
-        onnx_dir_path = os.path.join(model_card_dir, "onnx_bertstyle")
-
-    # Check if ONNX already exist
     if os.path.exists(onnx_dir_path):
         logger.warning(f"Overriding {onnx_dir_path}")
         shutil.rmtree(onnx_dir_path)
@@ -459,7 +453,8 @@ def qualcomm_efficient_converter(
     form_factor: str="cloud",
     save_fp32_onnx: bool = False,
     save_fp16_onnx: bool = True,
-) -> str:
+) -> Tuple[str, str]:
+
     """
     API to convert torch Bert style and KV style model to ONNX.
     ---------
@@ -480,13 +475,8 @@
     Returns:
         Path of exported ONNX file.
     """
-<<<<<<< HEAD
     # Get model_kv first
     model_kv = model_kv if model_kv else QEFFCommonLoader.from_pretrained(pretrained_model_name_or_path=model_name, hf_token=hf_token, cache_dir=cache_dir)
-=======
-    if model_kv is not None and not kv:
-        raise AttributeError("For transformed model kv must be True")
->>>>>>> a7db36f (Fixed bug)
 
     # Transform if required
     if model_kv.is_transformed and not kv:
diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py
index f6f7009c7..2225750a3 100755
--- a/QEfficient/generation/text_generation_inference.py
+++ b/QEfficient/generation/text_generation_inference.py
@@ -108,25 +108,16 @@ def latency_stats_bertstyle(
 
 
 def get_compilation_batch_size(qpc_path: str):
-<<<<<<< HEAD
     qpc_base_path = os.path.dirname(os.path.normpath(qpc_path))
     specialization_file_path = os.path.join(qpc_base_path, "specializations.json")
     logger.info(f"specialization_file_path : {specialization_file_path}")
-=======
-    qpc_base_path = os.path.dirname(qpc_path)
-    specialization_file_path = os.path.join(qpc_base_path, "specializations.json")
->>>>>>> 22a285a (Update infer and execute API to take prompts from txt file for BS>=1 (#11))
     with open(specialization_file_path, "r") as file:
         data = json.load(file)
     compilation_batch_size = int(data["specializations"][0]["batch_size"])
     return compilation_batch_size
 
 
-<<<<<<< HEAD
 def check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) -> List[str]:
-=======
-def check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size):
->>>>>>> 22a285a (Update infer and execute API to take prompts from txt file for BS>=1 (#11))
     assert (
         prompt is not None or prompts_txt_file_path is not None
     ), "Please pass atleast one argument either using --prompt or --prompts_txt_file_path"
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index cb7c1bc50..753d08204 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -33,31 +33,18 @@
 )
 from transformers.models.mixtral.modeling_mixtral import (
     MixtralAttention,
+    MixtralBLockSparseTop2MLP,
+    MixtralDecoderLayer,
     MixtralForCausalLM,
     MixtralModel,
-    MixtralDecoderLayer,
-    MixtralSparseMoeBlock,
-    MixtralBLockSparseTop2MLP,
-    MixtralRotaryEmbedding,
     MixtralRMSNorm,
+    MixtralRotaryEmbedding,
+    MixtralSparseMoeBlock,
 )
 from transformers.models.mpt.modeling_mpt import MptAttention, MptBlock, MptForCausalLM, MptModel
 
 from QEfficient.customop import CustomRMSNormAIC
-from .modeling_attn_mask_utils import (
-    QEffAttentionMaskConverter,
-    _qeff_prepare_4d_attention_mask,
-    _qeff_prepare_4d_causal_attention_mask,
-)
-from .modeling_outputs import (
-    QEffBaseModelOutputWithPast,
-    QEffBaseModelOutputWithPastAndCrossAttentions,
-    QEffCausalLMOutputWithCrossAttentions,
-    QEffCausalLMOutputWithPast,
-    QEffMoeCausalLMOutputWithPast,
-    QEffMoeModelOutputWithPast,
-)
 from .models.codegen.modeling_codegen import (
     QEffCodeGenAttention,
     QEffCodeGenBlock,
@@ -79,13 +66,13 @@
     QEffMistralRotaryEmbedding,
 )
 from .models.mixtral_moe.modeling_mixtral import (
-    QEffMixtralModel,
-    QEffMixtralRotaryEmbedding,
     QEffMixtralAttention,
-    QEffMixtralForCausalLM,
+    QEffMixtralBLockSparseTop2MLP,
     QEffMixtralDecoderLayer,
+    QEffMixtralForCausalLM,
+    QEffMixtralModel,
+    QEffMixtralRotaryEmbedding,
     QEffMixtralSparseMoeBlock,
-    QEffMixtralBLockSparseTop2MLP,
 )
 from .models.mpt.modeling_mpt import QEffMptAttention, QEffMptBlock, QEffMptForCausalLM, QEFfMptModel
diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb
index cd65109e5..3d0c10fab 100644
--- a/notebooks/QEfficientGPT2.ipynb
+++ b/notebooks/QEfficientGPT2.ipynb
@@ -130,6 +130,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+<<<<<<< HEAD
 <<<<<<< HEAD
     "from QEfficient.generation.text_generation_inference import get_compilation_batch_size\n",
     "\n",
@@ -145,6 +146,14 @@
     "batch_size = get_compilation_batch_size(generated_qpc_path)\n"
     "cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")"
 >>>>>>> 22a285a (Update infer and execute API to take prompts from txt file for BS>=1 (#11))
+=======
+    "from QEfficient.generation.text_generation_inference import get_compilation_batch_size\n",
+    "\n",
+    "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n",
+    "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n",
+    "batch_size = get_compilation_batch_size(generated_qpc_path)\n",
+    "QEfficient.cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])"
+>>>>>>> cfb3776 (Adding QEFFAutoModel i.e. model loader for loading any type of model. (#31))
    ]
   }
 ],
diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb
index 185f7678e..ad9daea54 100644
--- a/notebooks/QEfficientMPT.ipynb
+++ b/notebooks/QEfficientMPT.ipynb
@@ -128,15 +128,20 @@
    "metadata": {},
    "outputs": [],
    "source": [
+<<<<<<< HEAD
 <<<<<<< HEAD
    "from QEfficient.generation.text_generation_inference import get_compilation_batch_size\n",
 =======
    "from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size\n",
 >>>>>>> 22a285a (Update infer and execute API to take prompts from txt file for BS>=1 (#11))
+=======
+    "from QEfficient.generation.text_generation_inference import get_compilation_batch_size\n",
+>>>>>>> cfb3776 (Adding QEFFAutoModel i.e. model loader for loading any type of model. (#31))
    "\n",
    "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n",
    "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n",
    "\n",
+<<<<<<< HEAD
 <<<<<<< HEAD
    "batch_size = get_compilation_batch_size(generated_qpc_path)\n",
    "QEfficient.cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])"
@@ -144,6 +149,10 @@
    "batch_size = get_compilation_batch_size(generated_qpc_path)"
    "cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")"
 >>>>>>> 22a285a (Update infer and execute API to take prompts from txt file for BS>=1 (#11))
+=======
+    "batch_size = get_compilation_batch_size(generated_qpc_path)\n",
+    "QEfficient.cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])"
+>>>>>>> cfb3776 (Adding QEFFAutoModel i.e. model loader for loading any type of model. (#31))
    ]
   }
 ],
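
For reference, the cells this patch adds to both notebooks boil down to the short flow below. This is a minimal sketch assembled only from calls visible in the patch (get_compilation_batch_size and QEfficient.cloud_ai_100_exec_kv); the model card name, the QPC path, and the AutoTokenizer step are illustrative placeholders, and the ONNX export and compile steps are assumed to have been run beforehand via the QEfficient cloud APIs mentioned in the commit message.

# Minimal usage sketch based on the notebook cells in this patch.
# Assumptions: "gpt2" and generated_qpc_path are placeholders, and the QPC
# binaries have already been produced by the separate export/compile steps.
from transformers import AutoTokenizer

import QEfficient
from QEfficient.generation.text_generation_inference import get_compilation_batch_size

model_name = "gpt2"                           # placeholder HF model card
generated_qpc_path = "/path/to/compiled/qpcs" # placeholder path to compiled QPC binaries

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Read the batch size the QPC was compiled with (taken from specializations.json).
batch_size = get_compilation_batch_size(generated_qpc_path)

# Run KV-cache text generation on Cloud AI 100, as in the updated notebook cells.
QEfficient.cloud_ai_100_exec_kv(
    batch_size=batch_size,
    tokenizer=tokenizer,
    qpc_path=generated_qpc_path,
    device_id=[0],
    prompt=["My name is"],
)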