Adding QEFFAutoModel i.e. model loader for loading any type of model. (quic#31)

* all changes

Signed-off-by: Onkar Chougule <[email protected]>

* only loader changes

Signed-off-by: Onkar Chougule <[email protected]>

* removed unused imports

Signed-off-by: Onkar Chougule <[email protected]>

* allowed to initialize QEFFAutoLMModel

Signed-off-by: Onkar Chougule <[email protected]>

* fixed test bugs

Signed-off-by: Onkar Chougule <[email protected]>

* renamed utils.py to _utils.py

Signed-off-by: Onkar Chougule <[email protected]>

* added more type hinting and docstrings

Signed-off-by: Onkar Chougule <[email protected]>

* addressed review comments, added test file for new interface

Signed-off-by: Onkar Chougule <[email protected]>

* enabled CLI APIs

Signed-off-by: Onkar Chougule <[email protected]>

* Updated README and notebooks; removed circular import; added comments on loader files; separated cross-compile script; separated utils functions

Signed-off-by: Onkar Chougule <[email protected]>

* bug-fix infer

Signed-off-by: Onkar Chougule <[email protected]>

* using QEfficient.export, compile in cloud APIs

Signed-off-by: Onkar Chougule <[email protected]>

* cleaner infer, export APIs

Signed-off-by: Onkar Chougule <[email protected]>

* addressed review comments

Signed-off-by: Onkar Chougule <[email protected]>

* Updated notebooks and README; moved class description to base.py; added Runtime enum

Signed-off-by: Onkar Chougule <[email protected]>

* updated cloud_ai_100_exec_kv to be callable from QEfficient package

Signed-off-by: Onkar Chougule <[email protected]>

* fixed tests

Signed-off-by: Onkar Chougule <[email protected]>

* cleaned notebook

Signed-off-by: Onkar Chougule <[email protected]>

* Added transform call within init; renamed cross_compile; updated notebooks; updated README

Signed-off-by: Onkar Chougule <[email protected]>

* addressed review comments

Signed-off-by: Onkar Chougule <[email protected]>

---------

Signed-off-by: Onkar Chougule <[email protected]>
ochougul authored and quic-amitraj committed Jun 7, 2024
1 parent 8fb7bd5 commit 09a9395
Showing 7 changed files with 29 additions and 60 deletions.
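
For readers skimming the diffs, a minimal usage sketch of the new loader follows. QEFFCommonLoader.from_pretrained and its three keyword arguments are taken verbatim from the export_hf_to_cloud_ai_100.py diff below; the import path and model card name are assumptions.

# Minimal usage sketch of the new common loader.
# QEFFCommonLoader.from_pretrained and its keyword arguments appear in the
# diff to export_hf_to_cloud_ai_100.py below; the import path is an assumption.
from QEfficient import QEFFCommonLoader  # import path assumed

qeff_model = QEFFCommonLoader.from_pretrained(
    pretrained_model_name_or_path="gpt2",  # any HF model card or local path (assumed example)
    hf_token=None,                         # optional HuggingFace auth token
    cache_dir=None,                        # optional download cache directory
)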
1 change: 0 additions & 1 deletion QEfficient/cloud/compile.py
@@ -6,7 +6,6 @@
# -----------------------------------------------------------------------------

import argparse

import QEfficient

if __name__ == "__main__":
18 changes: 1 addition & 17 deletions QEfficient/cloud/export.py
@@ -12,24 +12,12 @@
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.utils import onnx_exists
from QEfficient.utils.constants import Constants
from QEfficient.utils.logging_utils import logger

# Specifically for Docker images.
ROOT_DIR = os.path.dirname(os.path.abspath(""))


def get_onnx_model_path(model_name: str, cache_dir: str, tokenizer: Optional[Union[PreTrainedTokenizerFast, PreTrainedTokenizer]]=None, hf_token: Optional[str] = None):
"""
Exports the model to ONNX if a pre-exported file is not found and returns the ONNX model path.
@@ -60,11 +48,7 @@ def get_onnx_model_path(model_name: str, cache_dir: str, tokenizer: Optional[Uni
return onnx_model_path



def main(
model_name: str,
cache_dir: str,
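
A hedged usage sketch of the helper above; the signature is copied from this diff, while the concrete argument values are illustrative assumptions.

# Sketch: export an HF model card to ONNX, or reuse a pre-exported file.
# Signature per the diff above; argument values are assumed examples.
from QEfficient.cloud.export import get_onnx_model_path  # module path per this diff's file

onnx_model_path = get_onnx_model_path(
    model_name="gpt2",       # HF model card name (assumed example)
    cache_dir="./hf_cache",  # local cache directory (assumed example)
    tokenizer=None,          # optional pre-loaded tokenizer
    hf_token=None,           # optional HuggingFace auth token
)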
14 changes: 2 additions & 12 deletions QEfficient/exporter/export_hf_to_cloud_ai_100.py
@@ -56,12 +56,6 @@ def convert_to_cloud_bertstyle(
Return:
Path of exported ONNX file.
"""
# todo (amitraj) Optimize the onnx export
if onnx_dir_path is None:
model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name))
onnx_dir_path = os.path.join(model_card_dir, "onnx_bertstyle")

# Check if ONNX already exist
if os.path.exists(onnx_dir_path):
logger.warning(f"Overriding {onnx_dir_path}")
shutil.rmtree(onnx_dir_path)
@@ -459,7 +453,8 @@ def qualcomm_efficient_converter(
form_factor: str="cloud",
save_fp32_onnx: bool = False,
save_fp16_onnx: bool = True,
) -> str:
) -> Tuple[str, str]:

"""
API to convert torch Bert style and KV style model to ONNX.
---------
Expand All @@ -480,13 +475,8 @@ def qualcomm_efficient_converter(
Returns:
Paths of the exported ONNX files.
"""
# Get model_kv first
model_kv = model_kv if model_kv else QEFFCommonLoader.from_pretrained(pretrained_model_name_or_path=model_name, hf_token=hf_token, cache_dir=cache_dir)

# Transform if required
if model_kv.is_transformed and not kv:
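
A sketch of a direct call to the converter. The keyword names and the Tuple[str, str] return annotation come from the diff above; which returned path is fp32 and which is fp16 is an assumption.

# Sketch: convert a model to ONNX via the public converter.
# Keyword names per the diff above; the meaning of the two returned
# paths (fp32 vs fp16) is an assumption.
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter

fp32_path, fp16_path = qualcomm_efficient_converter(
    model_name="gpt2",    # loads via QEFFCommonLoader when model_kv is not given
    kv=True,              # export the KV-cache (transformed) variant
    form_factor="cloud",
    save_fp32_onnx=False,
    save_fp16_onnx=True,
)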
9 changes: 0 additions & 9 deletions QEfficient/generation/text_generation_inference.py
@@ -108,25 +108,16 @@ def latency_stats_bertstyle(


def get_compilation_batch_size(qpc_path: str):
qpc_base_path = os.path.dirname(os.path.normpath(qpc_path))
specialization_file_path = os.path.join(qpc_base_path, "specializations.json")
logger.info(f"specialization_file_path : {specialization_file_path}")
with open(specialization_file_path, "r") as file:
data = json.load(file)
compilation_batch_size = int(data["specializations"][0]["batch_size"])
return compilation_batch_size


def check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) -> List[str]:
assert (
prompt is not None or prompts_txt_file_path is not None
), "Please pass atleast one argument either using --prompt or --prompts_txt_file_path"
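
For reference, a self-contained mirror of the lookup above: the compiled batch size is read from specializations.json one directory above the QPC path. The JSON access shown is exactly what the function body indexes; any other fields in the real file are simply not shown here.

# Self-contained mirror of get_compilation_batch_size above.
# specializations.json sits one level above the qpc directory and is
# indexed as data["specializations"][0]["batch_size"].
import json
import os

def read_compiled_batch_size(qpc_path: str) -> int:
    qpc_base_path = os.path.dirname(os.path.normpath(qpc_path))
    specialization_file_path = os.path.join(qpc_base_path, "specializations.json")
    with open(specialization_file_path, "r") as file:
        data = json.load(file)
    return int(data["specializations"][0]["batch_size"])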
29 changes: 8 additions & 21 deletions QEfficient/transformers/modeling_utils.py
@@ -33,31 +33,18 @@
)
from transformers.models.mixtral.modeling_mixtral import (
MixtralAttention,
MixtralBLockSparseTop2MLP,
MixtralDecoderLayer,
MixtralForCausalLM,
MixtralModel,
MixtralDecoderLayer,
MixtralSparseMoeBlock,
MixtralBLockSparseTop2MLP,
MixtralRotaryEmbedding,
MixtralRMSNorm,
MixtralRotaryEmbedding,
MixtralSparseMoeBlock,
)
from transformers.models.mpt.modeling_mpt import MptAttention, MptBlock, MptForCausalLM, MptModel

from QEfficient.customop import CustomRMSNormAIC

from .modeling_attn_mask_utils import (
QEffAttentionMaskConverter,
_qeff_prepare_4d_attention_mask,
_qeff_prepare_4d_causal_attention_mask,
)
from .modeling_outputs import (
QEffBaseModelOutputWithPast,
QEffBaseModelOutputWithPastAndCrossAttentions,
QEffCausalLMOutputWithCrossAttentions,
QEffCausalLMOutputWithPast,
QEffMoeCausalLMOutputWithPast,
QEffMoeModelOutputWithPast,
)
from .models.codegen.modeling_codegen import (
QEffCodeGenAttention,
QEffCodeGenBlock,
@@ -79,13 +66,13 @@
QEffMistralRotaryEmbedding,
)
from .models.mixtral_moe.modeling_mixtral import (
QEffMixtralModel,
QEffMixtralRotaryEmbedding,
QEffMixtralAttention,
QEffMixtralForCausalLM,
QEffMixtralBLockSparseTop2MLP,
QEffMixtralDecoderLayer,
QEffMixtralForCausalLM,
QEffMixtralModel,
QEffMixtralRotaryEmbedding,
QEffMixtralSparseMoeBlock,
QEffMixtralBLockSparseTop2MLP,
)
from .models.mpt.modeling_mpt import QEffMptAttention, QEffMptBlock, QEffMptForCausalLM, QEFfMptModel

9 changes: 9 additions & 0 deletions notebooks/QEfficientGPT2.ipynb
@@ -130,6 +130,7 @@
"metadata": {},
"outputs": [],
"source": [
"from QEfficient.generation.text_generation_inference import get_compilation_batch_size\n",
"\n",
"# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n",
"# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n",
"batch_size = get_compilation_batch_size(generated_qpc_path)\n",
"QEfficient.cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])"
]
}
],
9 changes: 9 additions & 0 deletions notebooks/QEfficientMPT.ipynb
@@ -128,22 +128,31 @@
"metadata": {},
"outputs": [],
"source": [
"from QEfficient.generation.text_generation_inference import get_compilation_batch_size\n",
"\n",
"# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n",
"# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n",
"\n",
"batch_size = get_compilation_batch_size(generated_qpc_path)\n",
"QEfficient.cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])"
]
}
],
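
The same flow as the notebook cells above, as a plain script. The cloud_ai_100_exec_kv call is copied from the notebooks; the tokenizer construction and the QPC path are assumed stand-ins for the export and compile steps not shown here.

# Script form of the notebook cell above.  The exec call mirrors the
# notebooks; the tokenizer and QPC path are assumed stand-ins for the
# export/compile steps not shown here.
from transformers import AutoTokenizer

import QEfficient
from QEfficient.generation.text_generation_inference import get_compilation_batch_size

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # assumed model card
generated_qpc_path = "qpc/gpt2_qpc"                # assumed compiled-QPC directory

batch_size = get_compilation_batch_size(generated_qpc_path)
QEfficient.cloud_ai_100_exec_kv(
    batch_size=batch_size,
    tokenizer=tokenizer,
    qpc_path=generated_qpc_path,
    device_id=[0],
    prompt=["My name is"],
)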
