Adding QEFFAutoModel i.e. model loader for loading any type of model. #31

Merged — 20 commits merged on Jun 6, 2024
Changes from all commits
19 changes: 7 additions & 12 deletions QEfficient/__init__.py
@@ -5,16 +5,11 @@
#
# -----------------------------------------------------------------------------

import torch.nn as nn
from QEfficient.transformers.modeling_utils import transform as transform_hf
from QEfficient.compile.compile_helper import compile # noqa: F401
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv # noqa: F401
from QEfficient.src import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader # noqa: F401
from QEfficient.transformers.transform import transform # noqa: F401


def transform(model: nn.Module, type="Transformers", form_factor="cloud"):
"""Low level apis in library
model : instance of nn.Module
type : Transformers | Diffusers, default : Transformers
"""
if type == "Transformers":
return transform_hf(model, form_factor)
else:
raise NotImplementedError
# Users can use QEfficient.export for exporting models to ONNX
export = qualcomm_efficient_converter
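
With this change the package surface exposes the loader classes (QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader) alongside compile, transform, cloud_ai_100_exec_kv, and the export alias. A minimal usage sketch, assuming the loader follows the usual from_pretrained convention; the checkpoint name and keyword arguments below are placeholders, not values taken from this PR:

import QEfficient
from QEfficient import QEFFAutoModelForCausalLM

# Load a causal-LM checkpoint through the new loader class exported above.
# "gpt2" is a placeholder model name; the exact from_pretrained signature is an assumption.
qeff_model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")

# QEfficient.export is the alias for qualcomm_efficient_converter defined above;
# kv and form_factor mirror the arguments used elsewhere in this PR.
QEfficient.export(model_name="gpt2", kv=True, form_factor="cloud")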
89 changes: 4 additions & 85 deletions QEfficient/cloud/compile.py
@@ -6,91 +6,8 @@
# -----------------------------------------------------------------------------

import argparse
import json
import os
from typing import List

from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100
from QEfficient.utils.logging_utils import logger


def create_and_dump_specializations(batch_size: int, prompt_len: int, ctx_len: int, path: str):
# Create
specializations = {
"specializations": [
{
"batch_size": str(batch_size),
"seq_len": str(prompt_len),
"ctx_len": str(ctx_len),
},
{"batch_size": str(batch_size), "seq_len": "1", "ctx_len": str(ctx_len)},
]
}
# Dump
with open(path, "w") as file:
json.dump(specializations, file, indent=4)
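
For reference, this is the file the helper above writes for the default CLI values (batch_size=1, prompt_len=32, ctx_len=128); the second entry keeps seq_len at 1, which corresponds to the decode step of KV-cached generation:

# Illustration derived from the dictionary built above (all values are serialized as strings):
#
# {
#     "specializations": [
#         {"batch_size": "1", "seq_len": "32", "ctx_len": "128"},
#         {"batch_size": "1", "seq_len": "1", "ctx_len": "128"}
#     ]
# }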


def main(
onnx_path: str,
qpc_path: str,
num_cores: int,
device_group: List[int],
aic_enable_depth_first: bool = False,
mos: int = -1,
batch_size: int = 1,
prompt_len: int = 32,
ctx_len: int = 128,
mxfp6: bool = True,
mxint8: bool = False,
) -> str:
# Dynamically create the specializations JSON
"""
API to compile the ONNX model on the Cloud AI 100 platform with the given config.
---------
:param onnx_path: str. Generated Onnx Model Path.
:base_path: str. Base path for the generated models.
:batch_size: int. Batch size to compile the model for.
:prompt_len: int. prompt len for the model to compile.
:ctx_len: int. Maximum context length to compile the model.
:mxfp6: bool. Enable compilation for MXFP6 precision
:num_cores: int. Number of cores to compile model on. default: 16 available option: [1 to 16]
"""

os.makedirs(qpc_path, exist_ok=True)
specialization_json_path = os.path.join(qpc_path, "specializations.json")
create_and_dump_specializations(
batch_size=batch_size, prompt_len=prompt_len, ctx_len=ctx_len, path=specialization_json_path
)

# Select the customIO config based on the mx flag.
if mxint8:
custom_io_file_name = "custom_io_int8.yaml"
else:
custom_io_file_name = "custom_io_fp16.yaml"

custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name)

if not os.path.isfile(custom_io_file_path):
raise FileNotFoundError(
f"file {custom_io_file_path} needs to exist in the same directory as onnx model files. Please rerun infer/export Api"
)

_, qpc_path = compile_kv_model_on_cloud_ai_100(
onnx_path=onnx_path,
specializations_json=specialization_json_path,
num_cores=num_cores,
custom_io_path=custom_io_file_path,
base_path=qpc_path,
mxfp6=mxfp6,
aic_enable_depth_first=aic_enable_depth_first,
mos=mos,
device_group=device_group,
)

logger.info(f"Compiled QPC files can be found here: {qpc_path}")
return qpc_path

import QEfficient

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Compilation script.")
@@ -146,5 +63,7 @@ def main(
default=-1,
help=" Effort level to reduce the on-chip memory",
)

# FIXME(ochougul): Allow extra compilation arguments
args = parser.parse_args()
main(**vars(args))
QEfficient.compile(**vars(args))
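
For reference, a sketch of the equivalent programmatic call that the CLI now delegates to, assuming QEfficient.compile accepts the same keyword arguments as the argparse options defined in this script; the paths below, and the assumption that it returns the QPC directory, are illustrative:

import QEfficient

# Keyword names mirror the CLI flags parsed above; onnx_path and qpc_path are placeholders.
qpc_dir = QEfficient.compile(
    onnx_path="qeff_models/MyModel/onnx/model.onnx",
    qpc_path="qeff_models/MyModel/qpc",
    num_cores=16,
    device_group=[0],
    batch_size=1,
    prompt_len=32,
    ctx_len=128,
    mxfp6=True,
    mxint8=False,
    aic_enable_depth_first=False,
    mos=-1,
)
print(f"Compiled QPC files can be found here: {qpc_dir}")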
24 changes: 8 additions & 16 deletions QEfficient/cloud/execute.py
@@ -6,28 +6,25 @@
# -----------------------------------------------------------------------------

import argparse
from typing import List

from huggingface_hub import login
from transformers import AutoTokenizer
from typing import List, Optional

from QEfficient.generation.text_generation_inference import (
check_batch_size_and_num_prompts,
cloud_ai_100_exec_kv,
get_compilation_batch_size,
)
from QEfficient.utils import hf_download
from QEfficient.utils import load_hf_tokenizer
from QEfficient.utils.constants import Constants


def main(
model_name: str,
qpc_path: str,
device_group: List[int],
prompt: str = None,
prompts_txt_file_path: str = None,
cache_dir: str = Constants.CACHE_DIR,
hf_token: str = None,
prompt: Optional[str] = None, # type: ignore
prompts_txt_file_path: Optional[str] = None,
cache_dir: Optional[str] = Constants.CACHE_DIR,
hf_token: Optional[str] = None,
):
"""
API to run the model on the Cloud AI 100 platform.
@@ -39,15 +36,10 @@ def main(
:prompts_txt_file_path: str. Path to txt file for multiple input prompts
"""

if hf_token is not None:
login(hf_token)

# Download tokenizer along with model if it doesn't exist
model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"])
tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left")
tokenizer = load_hf_tokenizer(model_name, cache_dir, hf_token)

batch_size = get_compilation_batch_size(qpc_path)
prompt = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size)
prompt: List[str] = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size)

# Execute
cloud_ai_100_exec_kv(
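
Pieced together, the execution path in this script now looks roughly like the sketch below: the tokenizer comes from the new load_hf_tokenizer helper and the prompts are validated against the batch size the QPC was compiled for. The keyword names passed to cloud_ai_100_exec_kv are not fully visible in this diff, so they are assumptions:

from QEfficient.generation.text_generation_inference import (
    check_batch_size_and_num_prompts,
    cloud_ai_100_exec_kv,
    get_compilation_batch_size,
)
from QEfficient.utils import load_hf_tokenizer
from QEfficient.utils.constants import Constants

qpc_path = "qeff_models/MyModel/qpc"  # placeholder compiled-QPC directory
tokenizer = load_hf_tokenizer("gpt2", Constants.CACHE_DIR, None)  # model_name, cache_dir, hf_token

# Infer the batch size the QPC was compiled for and fan the prompt(s) out accordingly.
batch_size = get_compilation_batch_size(qpc_path)
prompts = check_batch_size_and_num_prompts("My name is", None, batch_size)

# Keyword names below are assumed from main()'s own parameters; verify against the helper.
cloud_ai_100_exec_kv(
    tokenizer=tokenizer,
    qpc_path=qpc_path,
    device_group=[0],
    prompt=prompts,
)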
72 changes: 35 additions & 37 deletions QEfficient/cloud/export.py
@@ -7,24 +7,53 @@

import argparse
import os
from typing import Optional, Union

from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

import QEfficient
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.utils import hf_download, onnx_exists
from QEfficient.utils import onnx_exists
from QEfficient.utils.constants import Constants
from QEfficient.utils.logging_utils import logger

# Specifically for Docker images.
ROOT_DIR = os.path.dirname(os.path.abspath(""))


def get_onnx_model_path(model_name: str, cache_dir: str, tokenizer: Optional[Union[PreTrainedTokenizerFast, PreTrainedTokenizer]]=None, hf_token: Optional[str] = None):
"""
Exports the model to ONNX if a pre-exported file is not found and returns the onnx_model_path.
"""
onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name)
if onnx_path_exists:
logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation")
else:
###################
# hf model -> export
####################
# Export to the Onnx
logger.info(f"Exporting Pytorch {model_name} model to ONNX...")
_, generated_onnx_model_path = qualcomm_efficient_converter(
model_name=model_name,
tokenizer=tokenizer,
onnx_dir_path=onnx_dir_path,
kv=True,
form_factor="cloud",
return_path=True,
hf_token=hf_token,
cache_dir=cache_dir
) # type: ignore
logger.info(f"Generated Onnx_path {generated_onnx_model_path} \nOnnx_model_path {onnx_model_path} \nand Onnx_dir_path is {onnx_dir_path}")
assert (
generated_onnx_model_path == onnx_model_path
), f"ONNX files were generated at an unusual location, expected {onnx_model_path}, got {generated_onnx_model_path}"
return onnx_model_path


def main(
model_name: str,
cache_dir: str,
hf_token: str = None,
hf_token: Optional[str] = None,
) -> None:
"""
API for exporting the model to ONNX.
@@ -33,38 +62,7 @@ def main(
:cache_dir: str. Cache dir to store the downloaded huggingface files.
:hf_token: str. HuggingFace login token to access private repos.
"""
onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name)
if onnx_path_exists:
logger.warning(f"Generated Onnx files found {onnx_model_path}! Please use Infer/Compile Apis()")
return

if hf_token is not None:
login(hf_token)
model_hf_path = hf_download(
repo_id=model_name,
cache_dir=cache_dir,
ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"],
)
tokenizer = AutoTokenizer.from_pretrained(
model_hf_path, use_cache=True, padding_side="left", trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(model_hf_path, use_cache=True)

# Easy and minimal api to update the model to QEff.
QEfficient.transform(model, type="Transformers", form_factor="cloud")
print(f"Model after Optimized transformations {model}")

# Export to the Onnx
print(f"Exporting to Pytorch {model_name} to Onnx")
base_path, onnx_path = qualcomm_efficient_converter(
model_kv=model,
model_name=model_name,
tokenizer=tokenizer,
kv=True,
form_factor="cloud",
return_path=True,
)
print(f"Base Path is {base_path} and Onnx Model Path is : {onnx_path}")
get_onnx_model_path(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token)


if __name__ == "__main__":
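
A minimal sketch of calling the new helper directly, following the signature introduced in this file; the model name is a placeholder and the optional tokenizer argument is left at its default:

from QEfficient.cloud.export import get_onnx_model_path
from QEfficient.utils.constants import Constants

# Exports the model to ONNX (or reuses a pre-exported one) and returns the .onnx path.
onnx_model_path = get_onnx_model_path(
    model_name="gpt2",              # placeholder checkpoint name
    cache_dir=Constants.CACHE_DIR,
    hf_token=None,                  # only needed for private repositories
)
print(f"ONNX model available at: {onnx_model_path}")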