Adding QEFFAutoModel i.e. model loader for loading any type of model. (quic#31)

* all changes

Signed-off-by: Onkar Chougule <[email protected]>

* only loader changes

Signed-off-by: Onkar Chougule <[email protected]>

* removed unused imports

Signed-off-by: Onkar Chougule <[email protected]>

* allowed to initialize QEFFAutoLMModel

Signed-off-by: Onkar Chougule <[email protected]>

* fixed test bugs

Signed-off-by: Onkar Chougule <[email protected]>

* renamed utils.py to _utils.py

Signed-off-by: Onkar Chougule <[email protected]>

* added more type hinting and docstrings

Signed-off-by: Onkar Chougule <[email protected]>

* addressed review comments, added test file for new interface

Signed-off-by: Onkar Chougule <[email protected]>

* enabled CLI APIs

Signed-off-by: Onkar Chougule <[email protected]>

* Updated README and notebooks; removed circular import; added comments on loader files; separated cross-compile script; separated utils functions

Signed-off-by: Onkar Chougule <[email protected]>

* bug-fix infer

Signed-off-by: Onkar Chougule <[email protected]>

* using QEfficient.export, compile in cloud APIs

Signed-off-by: Onkar Chougule <[email protected]>

* cleaner infer, export APIs

Signed-off-by: Onkar Chougule <[email protected]>

* addressed review comments

Signed-off-by: Onkar Chougule <[email protected]>

* Updated notebooks and README; moved class description to base.py; added Runtime enum

Signed-off-by: Onkar Chougule <[email protected]>

* updated cloud_ai_100_exec_kv to be callable from QEfficient package

Signed-off-by: Onkar Chougule <[email protected]>

* fixed tests

Signed-off-by: Onkar Chougule <[email protected]>

* cleaned notebook

Signed-off-by: Onkar Chougule <[email protected]>

* Added transform call within init; renamed cross_compile; updated notebooks; updated README

Signed-off-by: Onkar Chougule <[email protected]>

* addressed review comments

Signed-off-by: Onkar Chougule <[email protected]>

---------

Signed-off-by: Onkar Chougule <[email protected]>
ochougul authored and quic-amitraj committed Jun 7, 2024
1 parent 8fb7bd5 commit 09a9395
Showing 7 changed files with 29 additions and 60 deletions.
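
For readers skimming the diffs, a minimal usage sketch of the new loader follows. QEFFCommonLoader.from_pretrained and its three keyword arguments are taken verbatim from the export_hf_to_cloud_ai_100.py diff below; the import path and model card name are assumptions.

# Minimal usage sketch of the new common loader.
# QEFFCommonLoader.from_pretrained and its keyword arguments appear in the
# diff to export_hf_to_cloud_ai_100.py below; the import path is an assumption.
from QEfficient import QEFFCommonLoader  # import path assumed

qeff_model = QEFFCommonLoader.from_pretrained(
    pretrained_model_name_or_path="gpt2",  # any HF model card or local path (assumed example)
    hf_token=None,                         # optional HuggingFace auth token
    cache_dir=None,                        # optional download cache directory
)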
1 change: 0 additions & 1 deletion QEfficient/cloud/compile.py
@@ -6,7 +6,6 @@
# -----------------------------------------------------------------------------

import argparse

import QEfficient

if __name__ == "__main__":
18 changes: 1 addition & 17 deletions QEfficient/cloud/export.py
@@ -12,24 +12,12 @@
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.utils import onnx_exists
from QEfficient.utils.constants import Constants
from QEfficient.utils.logging_utils import logger

# Specifically for Docker images.
ROOT_DIR = os.path.dirname(os.path.abspath(""))


def get_onnx_model_path(model_name: str, cache_dir: str, tokenizer: Optional[Union[PreTrainedTokenizerFast, PreTrainedTokenizer]]=None, hf_token: Optional[str] = None):
"""
Exports the model to ONNX if a pre-exported file is not found and returns the ONNX model path.
@@ -60,11 +48,7 @@ def get_onnx_model_path(model_name: str, cache_dir: str, tokenizer: Optional[Uni
return onnx_model_path



def main(
model_name: str,
cache_dir: str,
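
A hedged usage sketch of the helper above; the signature is copied from this diff, while the concrete argument values are illustrative assumptions.

# Sketch: export an HF model card to ONNX, or reuse a pre-exported file.
# Signature per the diff above; argument values are assumed examples.
from QEfficient.cloud.export import get_onnx_model_path  # module path per this diff's file

onnx_model_path = get_onnx_model_path(
    model_name="gpt2",       # HF model card name (assumed example)
    cache_dir="./hf_cache",  # local cache directory (assumed example)
    tokenizer=None,          # optional pre-loaded tokenizer
    hf_token=None,           # optional HuggingFace auth token
)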
14 changes: 2 additions & 12 deletions QEfficient/exporter/export_hf_to_cloud_ai_100.py
@@ -56,12 +56,6 @@ def convert_to_cloud_bertstyle(
Return:
Path of exported ONNX file.
"""
# todo (amitraj) Optimize the onnx export
if onnx_dir_path is None:
model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name))
onnx_dir_path = os.path.join(model_card_dir, "onnx_bertstyle")

# Check if ONNX already exist
if os.path.exists(onnx_dir_path):
logger.warning(f"Overriding {onnx_dir_path}")
shutil.rmtree(onnx_dir_path)
@@ -459,7 +453,8 @@ def qualcomm_efficient_converter(
form_factor: str="cloud",
save_fp32_onnx: bool = False,
save_fp16_onnx: bool = True,
) -> str:
) -> Tuple[str, str]:

"""
API to convert torch Bert style and KV style model to ONNX.
---------
Expand All @@ -480,13 +475,8 @@ def qualcomm_efficient_converter(
Returns:
Paths of the exported ONNX files.
"""
# Get model_kv first
model_kv = model_kv if model_kv else QEFFCommonLoader.from_pretrained(pretrained_model_name_or_path=model_name, hf_token=hf_token, cache_dir=cache_dir)

# Transform if required
if model_kv.is_transformed and not kv:
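
A sketch of a direct call to the converter. The keyword names and the Tuple[str, str] return annotation come from the diff above; which returned path is fp32 and which is fp16 is an assumption.

# Sketch: convert a model to ONNX via the public converter.
# Keyword names per the diff above; the meaning of the two returned
# paths (fp32 vs fp16) is an assumption.
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter

fp32_path, fp16_path = qualcomm_efficient_converter(
    model_name="gpt2",    # loads via QEFFCommonLoader when model_kv is not given
    kv=True,              # export the KV-cache (transformed) variant
    form_factor="cloud",
    save_fp32_onnx=False,
    save_fp16_onnx=True,
)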
9 changes: 0 additions & 9 deletions QEfficient/generation/text_generation_inference.py
@@ -108,25 +108,16 @@ def latency_stats_bertstyle(


def get_compilation_batch_size(qpc_path: str):
qpc_base_path = os.path.dirname(os.path.normpath(qpc_path))
specialization_file_path = os.path.join(qpc_base_path, "specializations.json")
logger.info(f"specialization_file_path : {specialization_file_path}")
with open(specialization_file_path, "r") as file:
data = json.load(file)
compilation_batch_size = int(data["specializations"][0]["batch_size"])
return compilation_batch_size


def check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) -> List[str]:
assert (
prompt is not None or prompts_txt_file_path is not None
), "Please pass atleast one argument either using --prompt or --prompts_txt_file_path"
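
For reference, a self-contained mirror of the lookup above: the compiled batch size is read from specializations.json one directory above the QPC path. The JSON access shown is exactly what the function body indexes; any other fields in the real file are simply not shown here.

# Self-contained mirror of get_compilation_batch_size above.
# specializations.json sits one level above the qpc directory and is
# indexed as data["specializations"][0]["batch_size"].
import json
import os

def read_compiled_batch_size(qpc_path: str) -> int:
    qpc_base_path = os.path.dirname(os.path.normpath(qpc_path))
    specialization_file_path = os.path.join(qpc_base_path, "specializations.json")
    with open(specialization_file_path, "r") as file:
        data = json.load(file)
    return int(data["specializations"][0]["batch_size"])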
29 changes: 8 additions & 21 deletions QEfficient/transformers/modeling_utils.py
@@ -33,31 +33,18 @@
)
from transformers.models.mixtral.modeling_mixtral import (
MixtralAttention,
MixtralBLockSparseTop2MLP,
MixtralDecoderLayer,
MixtralForCausalLM,
MixtralModel,
MixtralDecoderLayer,
MixtralSparseMoeBlock,
MixtralBLockSparseTop2MLP,
MixtralRotaryEmbedding,
MixtralRMSNorm,
MixtralRotaryEmbedding,
MixtralSparseMoeBlock,
)
from transformers.models.mpt.modeling_mpt import MptAttention, MptBlock, MptForCausalLM, MptModel

from QEfficient.customop import CustomRMSNormAIC

from .modeling_attn_mask_utils import (
QEffAttentionMaskConverter,
_qeff_prepare_4d_attention_mask,
_qeff_prepare_4d_causal_attention_mask,
)
from .modeling_outputs import (
QEffBaseModelOutputWithPast,
QEffBaseModelOutputWithPastAndCrossAttentions,
QEffCausalLMOutputWithCrossAttentions,
QEffCausalLMOutputWithPast,
QEffMoeCausalLMOutputWithPast,
QEffMoeModelOutputWithPast,
)
from .models.codegen.modeling_codegen import (
QEffCodeGenAttention,
QEffCodeGenBlock,
@@ -79,13 +66,13 @@
QEffMistralRotaryEmbedding,
)
from .models.mixtral_moe.modeling_mixtral import (
QEffMixtralModel,
QEffMixtralRotaryEmbedding,
QEffMixtralAttention,
QEffMixtralForCausalLM,
QEffMixtralBLockSparseTop2MLP,
QEffMixtralDecoderLayer,
QEffMixtralForCausalLM,
QEffMixtralModel,
QEffMixtralRotaryEmbedding,
QEffMixtralSparseMoeBlock,
QEffMixtralBLockSparseTop2MLP,
)
from .models.mpt.modeling_mpt import QEffMptAttention, QEffMptBlock, QEffMptForCausalLM, QEFfMptModel

9 changes: 9 additions & 0 deletions notebooks/QEfficientGPT2.ipynb
@@ -130,6 +130,7 @@
"metadata": {},
"outputs": [],
"source": [
"from QEfficient.generation.text_generation_inference import get_compilation_batch_size\n",
"\n",
"# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n",
"# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n",
"batch_size = get_compilation_batch_size(generated_qpc_path)\n",
"QEfficient.cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])"
]
}
],
9 changes: 9 additions & 0 deletions notebooks/QEfficientMPT.ipynb
@@ -128,22 +128,31 @@
"metadata": {},
"outputs": [],
"source": [
"from QEfficient.generation.text_generation_inference import get_compilation_batch_size\n",
"\n",
"# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n",
"# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n",
"\n",
"batch_size = get_compilation_batch_size(generated_qpc_path)\n",
"QEfficient.cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])"
]
}
],
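
The same flow as the notebook cells above, as a plain script. The cloud_ai_100_exec_kv call is copied from the notebooks; the tokenizer construction and the QPC path are assumed stand-ins for the export and compile steps not shown here.

# Script form of the notebook cell above.  The exec call mirrors the
# notebooks; the tokenizer and QPC path are assumed stand-ins for the
# export/compile steps not shown here.
from transformers import AutoTokenizer

import QEfficient
from QEfficient.generation.text_generation_inference import get_compilation_batch_size

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # assumed model card
generated_qpc_path = "qpc/gpt2_qpc"                # assumed compiled-QPC directory

batch_size = get_compilation_batch_size(generated_qpc_path)
QEfficient.cloud_ai_100_exec_kv(
    batch_size=batch_size,
    tokenizer=tokenizer,
    qpc_path=generated_qpc_path,
    device_id=[0],
    prompt=["My name is"],
)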
