Adding QEFFAutoModel i.e. model loader for loading any type of model. (quic#31)

* all changes

Signed-off-by: Onkar Chougule <[email protected]>

* only loader changes

Signed-off-by: Onkar Chougule <[email protected]>

* removed unused imports

Signed-off-by: Onkar Chougule <[email protected]>

* allowed to initialize QEFFAutoLMModel

Signed-off-by: Onkar Chougule <[email protected]>

* fixed test bugs

Signed-off-by: Onkar Chougule <[email protected]>

* renamed utils.py to _utils.py

Signed-off-by: Onkar Chougule <[email protected]>

* added more type hinting and docstrings

Signed-off-by: Onkar Chougule <[email protected]>

* addressed review comments, added test file for new interface

Signed-off-by: Onkar Chougule <[email protected]>

* enabled CLI APIs

Signed-off-by: Onkar Chougule <[email protected]>

* *Updated README, notebooks *Removed circular import *Added comments on loader files *Separated cross-compile script *Separated utils funcs

Signed-off-by: Onkar Chougule <[email protected]>

* bug-fix infer

Signed-off-by: Onkar Chougule <[email protected]>

* using QEfficient.export, compile in cloud APIs

Signed-off-by: Onkar Chougule <[email protected]>

* cleaner infer, export APIs

Signed-off-by: Onkar Chougule <[email protected]>

* addressed review comments

Signed-off-by: Onkar Chougule <[email protected]>

* *updated notebooks, readme *moved class desc to base.py *Added Runtime Enum

Signed-off-by: Onkar Chougule <[email protected]>

* updated cloud_ai_100_exec_kv to be callable from QEfficient package

Signed-off-by: Onkar Chougule <[email protected]>

* fixed tests

Signed-off-by: Onkar Chougule <[email protected]>

* cleaned notebook

Signed-off-by: Onkar Chougule <[email protected]>

* *Added transform call within init *renamed cross_compile *updated notebooks *updated README

Signed-off-by: Onkar Chougule <[email protected]>

* addressed review comments

Signed-off-by: Onkar Chougule <[email protected]>

---------

Signed-off-by: Onkar Chougule <[email protected]>
ochougul authored and quic-amitraj committed Jun 7, 2024
1 parent e98180d commit cfb3776
Showing 24 changed files with 1,133 additions and 923 deletions.
25 changes: 8 additions & 17 deletions QEfficient/__init__.py
@@ -5,20 +5,11 @@
#
# -----------------------------------------------------------------------------

import torch.nn as nn

from QEfficient.transformers.modeling_utils import transform as transform_hf


def transform(model: nn.Module, type="Transformers", form_factor="cloud"):
"""
Low level apis in library
---------
:param model: nn.Module. instance of nn.Module.
:type: str. Transformers | Diffusers, default : Transformers.
:form_factor: str.
"""
if type == "Transformers":
return transform_hf(model, form_factor)
else:
raise NotImplementedError
from QEfficient.compile.compile_helper import compile # noqa: F401
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv # noqa: F401
from QEfficient.src import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader # noqa: F401
from QEfficient.transformers.transform import transform # noqa: F401

# Users can use QEfficient.export for exporting models to ONNX
export = qualcomm_efficient_converter
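
As a quick orientation on the re-exported surface, a minimal sketch of calling the export alias directly (the model name is a placeholder; keyword arguments follow the qualcomm_efficient_converter call shown in QEfficient/cloud/export.py later in this diff):

import QEfficient

# QEfficient.export is the same callable as qualcomm_efficient_converter
_, onnx_model_path = QEfficient.export(
    model_name="gpt2",      # placeholder; any supported HF causal-LM id
    kv=True,                # export the KV-cache (past key/value) variant
    form_factor="cloud",
    return_path=True,
)
# onnx_model_path can then be fed to QEfficient.compile and cloud_ai_100_exec_kv (see below).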
96 changes: 4 additions & 92 deletions QEfficient/cloud/compile.py
@@ -6,97 +6,7 @@
# -----------------------------------------------------------------------------

import argparse
import json
import os
from typing import List

from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100
from QEfficient.utils.logging_utils import logger


def create_and_dump_specializations(batch_size: int, prompt_len: int, ctx_len: int, path: str):
# Create
specializations = {
"specializations": [
{
"batch_size": str(batch_size),
"seq_len": str(prompt_len),
"ctx_len": str(ctx_len),
},
{"batch_size": str(batch_size), "seq_len": "1", "ctx_len": str(ctx_len)},
]
}
# Dump
with open(path, "w") as file:
json.dump(specializations, file, indent=4)


def main(
onnx_path: str,
qpc_path: str,
num_cores: int,
device_group: List[int],
aic_enable_depth_first: bool = False,
mos: int = -1,
batch_size: int = 1,
prompt_len: int = 32,
ctx_len: int = 128,
mxfp6: bool = True,
mxint8: bool = False,
) -> str:
"""
API to compile the ONNX model on Cloud AI 100 platform with given config.
---------
:param onnx_path: str. Generated ONNX model path.
:qpc_path: str. Path of store compiled qpc binaries file
:num_cores: int. Number of cores to compile model on. Default: 16, available option: [1 to 16].
:device_group: List[int]. Cloud AI 100 device ids (comma-separated) e.g. [0,1]. if devices > 1, it enable multiple card setup.
:aic_enable_depth_first: bool. If passed, this option will be enabled during compilation. Default=False.
:mos: int. Effort level to reduce the on-chip memory. Default=-1.
:batch_size: int. Batch size for model to compile.
:prompt_len: int. prompt len for the model to compile.
:ctx_len: int. Maximum context length for the model to compile.
:mxfp6: bool. Compress constant MatMul weights to MXFP6 E2M3, default is no compression.
:mxint8: bool. Compress Present/Past KV to MXINT8 using CustomIO config, default is False.
Return:
Path of the QPC files.
"""

os.makedirs(qpc_path, exist_ok=True)
specialization_json_path = os.path.join(qpc_path, "specializations.json")
create_and_dump_specializations(
batch_size=batch_size, prompt_len=prompt_len, ctx_len=ctx_len, path=specialization_json_path
)

# Select the customIO config based on the mx flag.
if mxint8:
custom_io_file_name = "custom_io_int8.yaml"
else:
custom_io_file_name = "custom_io_fp16.yaml"

custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name)

if not os.path.isfile(custom_io_file_path):
raise FileNotFoundError(
f"file {custom_io_file_path} needs to exist in the same directory as onnx model files. Please rerun infer/export Api"
)

_, qpc_path = compile_kv_model_on_cloud_ai_100(
onnx_path=onnx_path,
specializations_json=specialization_json_path,
num_cores=num_cores,
custom_io_path=custom_io_file_path,
base_path=qpc_path,
mxfp6=mxfp6,
aic_enable_depth_first=aic_enable_depth_first,
mos=mos,
device_group=device_group,
)

logger.info(f"Compiled QPC files can be found here: {qpc_path}")
return qpc_path

import QEfficient

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Compilation script.")
@@ -152,5 +62,7 @@ def main(
default=-1,
help=" Effort level to reduce the on-chip memory",
)

# FIXME(ochougul): Allow extra compilation arguments
args = parser.parse_args()
main(**vars(args))
QEfficient.compile(**vars(args))
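
Since the script now forwards the parsed arguments straight to QEfficient.compile, invoking the API directly is equivalent; a hedged sketch using the parameter names documented in the removed main() above (paths are placeholders, and the exact keyword names should be checked against QEfficient.compile's signature):

from QEfficient import compile as qeff_compile  # re-exported in QEfficient/__init__.py

qpc_path = qeff_compile(
    onnx_path="onnx/model.onnx",   # placeholder path produced by the export step
    qpc_path="qpc",                # output directory for the compiled binaries
    num_cores=16,
    device_group=[0],              # more than one device id enables the multi-card setup
    batch_size=1,
    prompt_len=32,
    ctx_len=128,
    mxfp6=True,                    # compress constant MatMul weights to MXFP6
    mxint8=False,                  # keep KV cache in FP16 custom IO
)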
29 changes: 10 additions & 19 deletions QEfficient/cloud/execute.py
@@ -6,26 +6,25 @@
# -----------------------------------------------------------------------------

import argparse
from typing import List
from huggingface_hub import login
from transformers import AutoTokenizer
from typing import List, Optional

from QEfficient.generation.text_generation_inference import (
check_batch_size_and_num_prompts,
cloud_ai_100_exec_kv,
get_compilation_batch_size,
)
from QEfficient.utils import hf_download
)
from QEfficient.utils import load_hf_tokenizer
from QEfficient.utils.constants import Constants


def main(
model_name: str,
qpc_path: str,
device_group: List[int],
prompt: str = None,
prompts_txt_file_path: str = None,
cache_dir: str = Constants.CACHE_DIR,
hf_token: str = None,
prompt: Optional[str] = None, # type: ignore
prompts_txt_file_path: Optional[str] = None,
cache_dir: Optional[str] = Constants.CACHE_DIR,
hf_token: Optional[str] = None,
):
"""
API to run the model on Cloud AI 100 platform.
@@ -35,22 +34,14 @@ def main(
:qpc_path: str. Path to the save generated binary file after compilation.
:cache_dir: str. Cache dir to store the downloaded huggingface files.
:hf_token: Huggingface token to access gated models.
:device_group: List[int]. Device Ids to be used for compilation. If len(device_group) > 1, multiple card setup is enabled.
:prompts_txt_file_path: str. Path to txt file for multiple input prompts
"""

if hf_token is not None:
login(hf_token)

# Download tokenizer along with model if it doesn't exist
model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"])
tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left")
tokenizer = load_hf_tokenizer(model_name, cache_dir, hf_token)

batch_size = get_compilation_batch_size(qpc_path)
prompt = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size)
prompt: List[str] = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size)

# Execute
cloud_ai_100_exec_kv(
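
Condensed, the new execution path in this file resolves the tokenizer and prompts through shared helpers before handing off to cloud_ai_100_exec_kv; a sketch of the same flow (the final call's keyword arguments are cut off in this hunk, so only the helper calls are shown, with illustrative placeholder values):

from QEfficient.generation.text_generation_inference import (
    check_batch_size_and_num_prompts,
    get_compilation_batch_size,
)
from QEfficient.utils import load_hf_tokenizer
from QEfficient.utils.constants import Constants

model_name, qpc_path = "gpt2", "qpcs/gpt2"                             # placeholders
tokenizer = load_hf_tokenizer(model_name, Constants.CACHE_DIR, None)   # same positional call as in main()
batch_size = get_compilation_batch_size(qpc_path)                      # batch size baked into the compiled QPC
prompts = check_batch_size_and_num_prompts("Hello world", None, batch_size)
# cloud_ai_100_exec_kv(...) then runs KV-cache generation on the Cloud AI 100 device(s).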
72 changes: 35 additions & 37 deletions QEfficient/cloud/export.py
@@ -7,24 +7,53 @@

import argparse
import os
from typing import Optional, Union

from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

import QEfficient
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.utils import hf_download, onnx_exists
from QEfficient.utils import onnx_exists
from QEfficient.utils.constants import Constants
from QEfficient.utils.logging_utils import logger

# Specifically for Docker images.
ROOT_DIR = os.path.dirname(os.path.abspath(""))


def get_onnx_model_path(model_name: str, cache_dir: str, tokenizer: Optional[Union[PreTrainedTokenizerFast, PreTrainedTokenizer]]=None, hf_token: Optional[str] = None):
"""
exports the model to onnx if pre-exported file is not found and returns onnx_model_path
"""
onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name)
if onnx_path_exists:
logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation")
else:
###################
# hf model -> export
####################
# Export to the Onnx
logger.info(f"Exporting Pytorch {model_name} model to ONNX...")
_, generated_onnx_model_path = qualcomm_efficient_converter(
model_name=model_name,
tokenizer=tokenizer,
onnx_dir_path=onnx_dir_path,
kv=True,
form_factor="cloud",
return_path=True,
hf_token=hf_token,
cache_dir=cache_dir
) # type: ignore
logger.info(f"Generated Onnx_path {generated_onnx_model_path} \nOnnx_model_path {onnx_model_path} \nand Onnx_dir_path is {onnx_dir_path}")
assert (
generated_onnx_model_path == onnx_model_path
), f"ONNX files were generated at an unusual location, expected {onnx_model_path}, got {generated_onnx_model_path}"
return onnx_model_path


def main(
model_name: str,
cache_dir: str,
hf_token: str = None,
hf_token: Optional[str] = None,
) -> None:
"""
ApI for exporting to ONNX Model.
@@ -33,38 +62,7 @@ def main(
:cache_dir: str. Cache dir to store the downloaded huggingface files.
:hf_token: str. HuggingFace login token to access private repos.
"""
onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name)
if onnx_path_exists:
logger.warning(f"Generated Onnx files found {onnx_model_path}! Please use Infer/Compile Apis()")
return

if hf_token is not None:
login(hf_token)
model_hf_path = hf_download(
repo_id=model_name,
cache_dir=cache_dir,
ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"],
)
tokenizer = AutoTokenizer.from_pretrained(
model_hf_path, use_cache=True, padding_side="left", trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(model_hf_path, use_cache=True)

# Easy and minimal api to update the model to QEff.
QEfficient.transform(model, type="Transformers", form_factor="cloud")
print(f"Model after Optimized transformations {model}")

# Export to the Onnx
print(f"Exporting to Pytorch {model_name} to Onnx")
base_path, onnx_path = qualcomm_efficient_converter(
model_kv=model,
model_name=model_name,
tokenizer=tokenizer,
kv=True,
form_factor="cloud",
return_path=True,
)
print(f"Base Path is {base_path} and Onnx Model Path is : {onnx_path}")
get_onnx_model_path(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token)


if __name__ == "__main__":
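
The new get_onnx_model_path helper gives other entry points (infer, compile) a single call that reuses a pre-exported ONNX file or exports one on demand; a minimal sketch of calling it directly (the model name is a placeholder):

from QEfficient.cloud.export import get_onnx_model_path
from QEfficient.utils.constants import Constants

onnx_model_path = get_onnx_model_path(
    model_name="gpt2",              # placeholder HF model id
    cache_dir=Constants.CACHE_DIR,  # same default cache used by the CLI
)
# Returns the path to the KV ONNX graph, exporting it first if no pre-exported file exists.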