diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 63ab9da92..ac6c1b629 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -5,20 +5,11 @@ # # ----------------------------------------------------------------------------- -import torch.nn as nn - -from QEfficient.transformers.modeling_utils import transform as transform_hf - - -def transform(model: nn.Module, type="Transformers", form_factor="cloud"): - """ - Low level apis in library - --------- - :param model: nn.Module. instance of nn.Module. - :type: str. Transformers | Diffusers, default : Transformers. - :form_factor: str. - """ - if type == "Transformers": - return transform_hf(model, form_factor) - else: - raise NotImplementedError +from QEfficient.compile.compile_helper import compile # noqa: F401 +from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter +from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv # noqa: F401 +from QEfficient.src import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader # noqa: F401 +from QEfficient.transformers.transform import transform # noqa: F401 + +# Users can use QEfficient.export for exporting models to ONNX +export = qualcomm_efficient_converter diff --git a/QEfficient/cloud/compile.py b/QEfficient/cloud/compile.py index d6003c35c..4e446de05 100644 --- a/QEfficient/cloud/compile.py +++ b/QEfficient/cloud/compile.py @@ -6,97 +6,7 @@ # ----------------------------------------------------------------------------- import argparse -import json -import os -from typing import List - -from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100 -from QEfficient.utils.logging_utils import logger - - -def create_and_dump_specializations(batch_size: int, prompt_len: int, ctx_len: int, path: str): - # Create - specializations = { - "specializations": [ - { - "batch_size": str(batch_size), - "seq_len": str(prompt_len), - "ctx_len": str(ctx_len), - }, - {"batch_size": str(batch_size), "seq_len": "1", "ctx_len": str(ctx_len)}, - ] - } - # Dump - with open(path, "w") as file: - json.dump(specializations, file, indent=4) - - -def main( - onnx_path: str, - qpc_path: str, - num_cores: int, - device_group: List[int], - aic_enable_depth_first: bool = False, - mos: int = -1, - batch_size: int = 1, - prompt_len: int = 32, - ctx_len: int = 128, - mxfp6: bool = True, - mxint8: bool = False, -) -> str: - """ - API to compile the ONNX model on Cloud AI 100 platform with given config. - --------- - :param onnx_path: str. Generated ONNX model path. - :qpc_path: str. Path of store compiled qpc binaries file - :num_cores: int. Number of cores to compile model on. Default: 16, available option: [1 to 16]. - :device_group: List[int]. Cloud AI 100 device ids (comma-separated) e.g. [0,1]. if devices > 1, it enable multiple card setup. - :aic_enable_depth_first: bool. If passed, this option will be enabled during compilation. Default=False. - :mos: int. Effort level to reduce the on-chip memory. Default=-1. - :batch_size: int. Batch size for model to compile. - :prompt_len: int. prompt len for the model to compile. - :ctx_len: int. Maximum context length for the model to compile. - :mxfp6: bool. Compress constant MatMul weights to MXFP6 E2M3, default is no compression. - :mxint8: bool. Compress Present/Past KV to MXINT8 using CustomIO config, default is False. - - Return: - Path of the QPC files. 
- """ - - os.makedirs(qpc_path, exist_ok=True) - specialization_json_path = os.path.join(qpc_path, "specializations.json") - create_and_dump_specializations( - batch_size=batch_size, prompt_len=prompt_len, ctx_len=ctx_len, path=specialization_json_path - ) - - # Select the customIO config based on the mx flag. - if mxint8: - custom_io_file_name = "custom_io_int8.yaml" - else: - custom_io_file_name = "custom_io_fp16.yaml" - - custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name) - - if not os.path.isfile(custom_io_file_path): - raise FileNotFoundError( - f"file {custom_io_file_path} needs to exist in the same directory as onnx model files. Please rerun infer/export Api" - ) - - _, qpc_path = compile_kv_model_on_cloud_ai_100( - onnx_path=onnx_path, - specializations_json=specialization_json_path, - num_cores=num_cores, - custom_io_path=custom_io_file_path, - base_path=qpc_path, - mxfp6=mxfp6, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - ) - - logger.info(f"Compiled QPC files can be found here: {qpc_path}") - return qpc_path - +import QEfficient if __name__ == "__main__": parser = argparse.ArgumentParser(description="Compilation script.") @@ -152,5 +62,7 @@ def main( default=-1, help=" Effort level to reduce the on-chip memory", ) + + # FIXME(ochougul): Allow extra compilation arguments args = parser.parse_args() - main(**vars(args)) + QEfficient.compile(**vars(args)) diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 2eb8386d3..4660a951a 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -6,15 +6,14 @@ # ----------------------------------------------------------------------------- import argparse -from typing import List -from huggingface_hub import login -from transformers import AutoTokenizer +from typing import List, Optional + from QEfficient.generation.text_generation_inference import ( check_batch_size_and_num_prompts, cloud_ai_100_exec_kv, get_compilation_batch_size, - ) -from QEfficient.utils import hf_download +) +from QEfficient.utils import load_hf_tokenizer from QEfficient.utils.constants import Constants @@ -22,10 +21,10 @@ def main( model_name: str, qpc_path: str, device_group: List[int], - prompt: str = None, - prompts_txt_file_path: str = None, - cache_dir: str = Constants.CACHE_DIR, - hf_token: str = None, + prompt: Optional[str] = None, # type: ignore + prompts_txt_file_path: Optional[str] = None, + cache_dir: Optional[str] = Constants.CACHE_DIR, + hf_token: Optional[str] = None, ): """ API to run the model on Cloud AI 100 platform. @@ -35,22 +34,14 @@ def main( :qpc_path: str. Path to the save generated binary file after compilation. :cache_dir: str. Cache dir to store the downloaded huggingface files. :hf_token: Huggingface token to access gated models. -<<<<<<< HEAD :device_group: List[int]. Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled. :prompts_txt_file_path: str. 
Path to txt file for multiple input prompts -======= ->>>>>>> bb46e21 (Added sphinx files in docs) """ - if hf_token is not None: - login(hf_token) - - # Download tokenizer along with model if it doesn't exist - model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"]) - tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left") + tokenizer = load_hf_tokenizer(model_name, cache_dir, hf_token) batch_size = get_compilation_batch_size(qpc_path) - prompt = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) + prompt: List[str] = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) # Execute cloud_ai_100_exec_kv( diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index ce49f49d6..25cf2700f 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -7,13 +7,12 @@ import argparse import os +from typing import Optional, Union -from huggingface_hub import login -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast -import QEfficient from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.utils import hf_download, onnx_exists +from QEfficient.utils import onnx_exists from QEfficient.utils.constants import Constants from QEfficient.utils.logging_utils import logger @@ -21,10 +20,40 @@ ROOT_DIR = os.path.dirname(os.path.abspath("")) +def get_onnx_model_path(model_name: str, cache_dir: str, tokenizer: Optional[Union[PreTrainedTokenizerFast, PreTrainedTokenizer]]=None, hf_token: Optional[str] = None): + """ + exports the model to onnx if pre-exported file is not found and returns onnx_model_path + """ + onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name) + if onnx_path_exists: + logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation") + else: + ################### + # hf model -> export + #################### + # Export to the Onnx + logger.info(f"Exporting Pytorch {model_name} model to ONNX...") + _, generated_onnx_model_path = qualcomm_efficient_converter( + model_name=model_name, + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + kv=True, + form_factor="cloud", + return_path=True, + hf_token=hf_token, + cache_dir=cache_dir + ) # type: ignore + logger.info(f"Generated Onnx_path {generated_onnx_model_path} \nOnnx_model_path {onnx_model_path} \nand Onnx_dir_path is {onnx_dir_path}") + assert ( + generated_onnx_model_path == onnx_model_path + ), f"ONNX files were generated at an unusual location, expected {onnx_model_path}, got {generated_onnx_model_path}" + return onnx_model_path + + def main( model_name: str, cache_dir: str, - hf_token: str = None, + hf_token: Optional[str] = None, ) -> None: """ ApI for exporting to ONNX Model. @@ -33,38 +62,7 @@ def main( :cache_dir: str. Cache dir to store the downloaded huggingface files. :hf_token: str. HuggingFace login token to access private repos. """ - onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name) - if onnx_path_exists: - logger.warning(f"Generated Onnx files found {onnx_model_path}! 
Please use Infer/Compile Apis()") - return - - if hf_token is not None: - login(hf_token) - model_hf_path = hf_download( - repo_id=model_name, - cache_dir=cache_dir, - ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], - ) - tokenizer = AutoTokenizer.from_pretrained( - model_hf_path, use_cache=True, padding_side="left", trust_remote_code=True - ) - model = AutoModelForCausalLM.from_pretrained(model_hf_path, use_cache=True) - - # Easy and minimal api to update the model to QEff. - QEfficient.transform(model, type="Transformers", form_factor="cloud") - print(f"Model after Optimized transformations {model}") - - # Export to the Onnx - print(f"Exporting to Pytorch {model_name} to Onnx") - base_path, onnx_path = qualcomm_efficient_converter( - model_kv=model, - model_name=model_name, - tokenizer=tokenizer, - kv=True, - form_factor="cloud", - return_path=True, - ) - print(f"Base Path is {base_path} and Onnx Model Path is : {onnx_path}") + get_onnx_model_path(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token) if __name__ == "__main__": diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 09c6ff3be..457678a7a 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -6,20 +6,17 @@ # ----------------------------------------------------------------------------- import argparse +import logging import os -from typing import List - -from huggingface_hub import login -from transformers import AutoModelForCausalLM, AutoTokenizer +from typing import List, Optional import QEfficient -from QEfficient.cloud.compile import main as compile -from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter +from QEfficient.cloud.export import get_onnx_model_path from QEfficient.generation.text_generation_inference import ( check_batch_size_and_num_prompts, cloud_ai_100_exec_kv, ) -from QEfficient.utils import hf_download, onnx_exists, qpc_exists +from QEfficient.utils import get_qpc_dir_name_infer, load_hf_tokenizer, qpc_exists from QEfficient.utils.constants import Constants from QEfficient.utils.logging_utils import logger @@ -34,19 +31,19 @@ def main( model_name: str, num_cores: int, - prompt: str = None, - prompts_txt_file_path: str = None, + prompt: Optional[str] = None, # type: ignore + prompts_txt_file_path: Optional[str] = None, aic_enable_depth_first: bool = False, mos: int = -1, cache_dir: str = Constants.CACHE_DIR, - hf_token: str = None, + hf_token: Optional[str] = None, batch_size: int = 1, prompt_len: int = 32, ctx_len: int = 128, mxfp6: bool = False, mxint8: bool = False, device_group: List[int] = [0], -): +) -> None: """ Inference command, the model will be downloaded from HF, optimized, compiled, executed on AIC. --------- @@ -63,120 +60,41 @@ def main( :mxfp6: bool. Enable compilation for MXFP6 precision. :device_group: List[int]. Cloud AI 100 device ids (comma-separated) e.g. [0,1]. if devices > 1, it enable multiple card setup. 
""" - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - os.makedirs(model_card_dir, exist_ok=True) - - qpc_base_dir_name = ( - f"qpc_{num_cores}cores_{batch_size}BS_{prompt_len}PL_{ctx_len}CL_{mos}MOS_" - + f"{len(device_group)}" - + "devices" - + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else "_fp16_mxint8" if mxint8 else "_fp16") - ) - - prompt = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) - - # Get tokenizer - if hf_token is not None: - login(hf_token) - model_hf_path = hf_download( - repo_id=model_name, - cache_dir=cache_dir, - ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], - ) - tokenizer = AutoTokenizer.from_pretrained( - model_hf_path, use_cache=True, padding_side="left", trust_remote_code=True - ) + qpc_base_dir_name = get_qpc_dir_name_infer(num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group) + prompt: List[str] = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) + tokenizer = load_hf_tokenizer(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token) qpc_path_exists, qpc_dir_path = qpc_exists(model_name, qpc_base_dir_name) + # Handle qpc generation if qpc_path_exists: - # execute - logger.info("Pre-compiled qpc found! Trying to execute with given prompt") - cloud_ai_100_exec_kv( - batch_size, - tokenizer=tokenizer, - qpc_path=qpc_dir_path, - device_id=device_group, - prompt=prompt, - ) - return - - onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name) - if onnx_path_exists: - # Compile -> execute - # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation - generated_qpc_path = compile( - onnx_path=onnx_model_path, - qpc_path=os.path.dirname(qpc_dir_path), - num_cores=num_cores, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, - mxfp6=mxfp6, - mxint8=mxint8, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - ) + logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! Executing with given prompt") + else: + # Handle onnx model generation + onnx_model_path = get_onnx_model_path(model_name, cache_dir, tokenizer, hf_token) + + ######### + # Compile + ######### + generated_qpc_path = QEfficient.compile( + onnx_path=onnx_model_path, + qpc_path=os.path.dirname(qpc_dir_path), # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation + num_cores=num_cores, + batch_size=batch_size, + prompt_len=prompt_len, + ctx_len=ctx_len, + mxfp6=mxfp6, + mxint8=mxint8, + aic_enable_depth_first=aic_enable_depth_first, + mos=mos, + device_group=device_group, + ) assert ( - generated_qpc_path == qpc_dir_path - ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" - cloud_ai_100_exec_kv( - batch_size, - tokenizer=tokenizer, - qpc_path=qpc_dir_path, - device_id=device_group, - prompt=prompt, - ) - return - - ############################################# - # hf model -> export -> compile -> execute - ############################################# - model_hf = AutoModelForCausalLM.from_pretrained(model_hf_path, use_cache=True) - # Easy and minimal api to update the model to QEff. 
- model_transformed = QEfficient.transform(model_hf, type="Transformers", form_factor="cloud") - logger.info(f"Model after Optimized transformations {model_transformed}") - - # Export to the Onnx - logger.info(f"Exporting to Pytorch {model_name} to ONNX...") - base_path, generated_onnx_path = qualcomm_efficient_converter( - model_kv=model_transformed, - onnx_dir_path=onnx_dir_path, - model_name=model_name, - kv=True, - form_factor="cloud", - return_path=True, - tokenizer=tokenizer, - ) - print( - f"Generated Onnx_path {generated_onnx_path} and Onnx_model_path {onnx_model_path} and Onnx_dir_path is {onnx_dir_path}" - ) - assert ( - generated_onnx_path == onnx_model_path - ), f"ONNX files were generated at an unusual location, expected {onnx_model_path}, got {generated_onnx_path}" - logger.info(f"Base Path is {base_path} and Onnx Model Path is : {generated_onnx_path}") - - # Compile - # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation - generated_qpc_path = compile( - onnx_path=onnx_model_path, - qpc_path=os.path.dirname(qpc_dir_path), - num_cores=num_cores, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, - mxfp6=mxfp6, - mxint8=mxint8, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - ) - assert ( - qpc_dir_path == generated_qpc_path - ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" - logger.info(f"Compiled qpc files can be found at : {generated_qpc_path}") - + generated_qpc_path == qpc_dir_path + ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" + + ######### # Execute + ######### cloud_ai_100_exec_kv( batch_size, tokenizer=tokenizer, @@ -247,6 +165,15 @@ def main( default=-1, help="Effort level to reduce the on-chip memory", ) + #FIXME: Add verbose feature + parser.add_argument( + "--verbose","-v", + action="store_true", + help="pass to print info logs", + ) args = parser.parse_args() + if args.verbose: + logger.setLevel(logging.INFO) + del args.verbose # type: ignore main(**args.__dict__) diff --git a/QEfficient/compile/compile_helper.py b/QEfficient/compile/compile_helper.py new file mode 100644 index 000000000..8b5272e8d --- /dev/null +++ b/QEfficient/compile/compile_helper.py @@ -0,0 +1,163 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import json +import os +import shutil +import subprocess +from typing import List, Tuple + +from QEfficient.utils.logging_utils import logger + + +def create_and_dump_specializations(batch_size: int, prompt_len: int, ctx_len: int, path: str): + # Create + specializations = { + "specializations": [ + { + "batch_size": str(batch_size), + "seq_len": str(prompt_len), + "ctx_len": str(ctx_len), + }, + {"batch_size": str(batch_size), "seq_len": "1", "ctx_len": str(ctx_len)}, + ] + } + # Dump + with open(path, "w") as file: + json.dump(specializations, file, indent=4) + + +def compile_kv_model_on_cloud_ai_100( + onnx_path: str, + specializations_json: str, + num_cores: int, + base_path: str, + mxfp6: bool, + custom_io_path: str, + aic_enable_depth_first: bool, + mos: int = -1, + device_group: List[int] = [0], + **kwargs, +) -> Tuple[bool, str]: + if kwargs: + # FIXME + raise NotImplementedError("Can't handle extra compilation args now!") + aic_binary_dir = os.path.join(base_path, "qpcs") + + if os.path.isdir(aic_binary_dir): + shutil.rmtree(aic_binary_dir) + + assert os.path.isfile( + specializations_json + ), f"Please use 'QEfficient.compile', as {specializations_json} file was not found" + assert os.path.isfile(custom_io_path), f"{custom_io_path} file was not found!" + command = [ + "/opt/qti-aic/exec/qaic-exec", + f"-m={onnx_path}", + "-aic-hw", + "-aic-hw-version=2.0", + f"-network-specialization-config={specializations_json}", + "-convert-to-fp16", + "-retained-state", + f"-aic-num-cores={num_cores}", + f"-custom-IO-list-file={custom_io_path}", + "-compile-only", + f"-aic-binary-dir={aic_binary_dir}", + ] + if mxfp6: + command.append("-mxfp6-matmul") + if mos > 0: + command.append(f"-mos={mos}") + if aic_enable_depth_first: + command.append("-aic-enable-depth-first") + if len(device_group) > 1: + mdp_ts_config = { + "connections": [{"devices": list(range(len(device_group))), "type": "p2p"}], + "partitions": [ + { + "name": "Partition0", + "devices": [{"deviceId": device, "numCores": num_cores} for device in range(len(device_group))], + } + ], + } + mdp_ts_config_path = os.path.join(base_path, "mdp_ts_config.json") + with open(mdp_ts_config_path, "w") as file: + json.dump(mdp_ts_config, file, indent=4) + command.append(f"-mdp-load-partition-config={mdp_ts_config_path}") + print("Running AI 100 compiler:", " ".join(command)) + result = subprocess.run(command, capture_output=True, text=True) + if result.returncode != 0: + raise RuntimeError(f"Compilation Failed!!\n\nSTDOUT\n{result.stdout}\n\nSTDERR\n{result.stderr}") + + print("\n===================== Compilation Done! =====================\n") + return result.returncode == 0, aic_binary_dir + + +def compile( + onnx_path: str, + qpc_path: str, + num_cores: int, + device_group: List[int], # FIXME: use num_devices instead + aic_enable_depth_first: bool = False, + mos: int = -1, + batch_size: int = 1, + prompt_len: int = 32, + ctx_len: int = 128, + mxfp6: bool = True, + mxint8: bool = False, + **kwargs +) -> str: + # Dynamically create the specializations JSON + """ + Api() to compile the Onnx Model on Cloud AI 100 Platform with give config. + --------- + :param onnx_path: str. Generated Onnx Model Path. + :param qpc_path: str. Path for saving compiled qpc binaries. + :num_cores: int. Number of cores to compile model on. + :device_group: List[int]. Used for finding number of devices to compile for. 
+ :aic_enable_depth_first: bool. Enables DFS with default memory size, disabled by default. + :mos: int. Effort level to reduce the on-chip memory. + :batch_size: int. Batch size to compile the model for. + :prompt_len: int. prompt len for the model to compile. + :ctx_len: int. Maximum context length to compile the model. + :mxfp6: bool. Enable compilation for MXFP6 precision + :mxint8: Compress Present/Past KV to MXINT8 using CustomIO config, default is False. + """ + + os.makedirs(qpc_path, exist_ok=True) + specialization_json_path = os.path.join(qpc_path, "specializations.json") + create_and_dump_specializations( + batch_size=batch_size, prompt_len=prompt_len, ctx_len=ctx_len, path=specialization_json_path + ) + + # Select the customIO config based on the mx flag. + if mxint8: + custom_io_file_name = "custom_io_int8.yaml" + else: + custom_io_file_name = "custom_io_fp16.yaml" + + custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name) + + if not os.path.isfile(custom_io_file_path): + raise FileNotFoundError( + f"file {custom_io_file_path} needs to exist in the same directory as onnx model files. Please rerun infer/export Api" + ) + + _, qpc_path = compile_kv_model_on_cloud_ai_100( + onnx_path=onnx_path, + specializations_json=specialization_json_path, + num_cores=num_cores, + custom_io_path=custom_io_file_path, + base_path=qpc_path, + mxfp6=mxfp6, + aic_enable_depth_first=aic_enable_depth_first, + mos=mos, + device_group=device_group, + ) + + logger.info(f"Compiled QPC files can be found here: {qpc_path}") + return qpc_path diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index b598b3108..535fa8095 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -7,31 +7,31 @@ import os import shutil +from typing import Optional, Tuple, Union import torch -from huggingface_hub import login -from transformers import AutoTokenizer +from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast +import QEfficient from QEfficient.exporter.export_utils import export_onnx, fix_onnx_fp16, generate_input_files, run_model_on_ort -from QEfficient.transformers.modeling_utils import transform -from QEfficient.utils import hf_download +from QEfficient.src._transformers.auto import QEFFAutoModelForCausalLM +from QEfficient.src.base import QEFFBaseModel +from QEfficient.src.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFCommonLoader +from QEfficient.utils._utils import load_hf_tokenizer from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants from QEfficient.utils.logging_utils import logger def convert_to_cloud_bertstyle( model_name: str, - model_class: type = None, - tokenizer=None, - onnx_dir_path=None, - hf_token: str = None, - seq_len: int = Constants.seq_length, - input_str: str = Constants.input_str, - return_path: bool = False, - save_fp32_onnx: bool = False, - save_fp16_onnx: bool = True, -) -> str: - + qeff_model: QEFFAutoModelForCausalLM, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + onnx_dir_path: str, + seq_len: int, + return_path: bool, + save_fp32_onnx: bool, + save_fp16_onnx: bool, +): """ API to convert model to Bertstyle approach. Bertstyle Approach: @@ -56,12 +56,6 @@ def convert_to_cloud_bertstyle( Return: Path of exported ONNX file. 
""" - # todo (amitraj) Optimize the onnx export - if onnx_dir_path is None: - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - onnx_dir_path = os.path.join(model_card_dir, "onnx_bertstyle") - - # Check if ONNX already exist if os.path.exists(onnx_dir_path): logger.warning(f"Overriding {onnx_dir_path}") shutil.rmtree(onnx_dir_path) @@ -69,37 +63,29 @@ def convert_to_cloud_bertstyle( if not (save_fp32_onnx or save_fp16_onnx): raise AttributeError("save_fp32_onnx and save_fp16_onnx can't be false") - seq_len = Constants.seq_length - input_str = Constants.input_str - - # Load tokenizer - if tokenizer is None: - tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", trust_remote_code=True) - else: - if tokenizer.padding_side != "left": - logger.warning("Please use padding_side='left' while initializing the tokenizer") - tokenizer.padding_side = "left" + if tokenizer.padding_side != "left": + logger.warning("Please use padding_side='left' while initializing the tokenizer") + tokenizer.padding_side = "left" - if tokenizer.pad_token_id is None: + if tokenizer.pad_token_id is None: tokenizer.pad_token_id = tokenizer.eos_token_id - try: - if hf_token: - login(hf_token) - model_hf_path = hf_download( - repo_id=model_name, - cache_dir=Constants.CACHE_DIR, - ignore_pattrens=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"], - ) - model = model_class.from_pretrained(model_hf_path, cache_dir=Constants.CACHE_DIR, use_cache=True) - except Exception as e: - print(f"Failed to download the {model_name} model from Huggingface:%s", e) - model.eval() - # Decide path for saving exported ONNX files. + fp32_model_name, fp16_model_name = export_bertstyle_model_to_onnx(model_name, qeff_model.model, tokenizer, onnx_dir_path, seq_len, save_fp32_onnx, save_fp16_onnx) # type: ignore + + # return the model path for automation. + if return_path: + if save_fp16_onnx: + return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp16_model_name}.onnx") + else: + return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp32_model_name}.onnx") + + +def export_bertstyle_model_to_onnx(model_name, model, tokenizer, onnx_dir_path, seq_len, save_fp32_onnx, save_fp16_onnx): model_base_name = model_name.replace("/", "_") + "_bertstyle" os.makedirs(onnx_dir_path, exist_ok=True) + input_str = Constants.input_str # Preprocess inputs if seq_len > 0: if tokenizer.pad_token_id is None: @@ -178,30 +164,20 @@ def convert_to_cloud_bertstyle( inputs=inputs, input_list_file=input_list_file, ) - - # return the model path for automation. - if return_path: - if save_fp16_onnx: - return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp16_model_name}.onnx") - else: - return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp32_model_name}.onnx") - else: - return + + return fp32_model_name,fp16_model_name def convert_to_cloud_kvstyle( model_name: str, - model_class: type = None, - model_kv: torch.nn.Module = None, - tokenizer=None, - onnx_dir_path=None, - hf_token: str = None, - seq_len: int = Constants.seq_length, - input_str: str = Constants.input_str, - return_path: bool = False, - save_fp32_onnx: bool = False, - save_fp16_onnx: bool = True, -) -> str: + qeff_model: QEFFAutoModelForCausalLM, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + onnx_dir_path: str, + seq_len: int, + return_path: bool, + save_fp32_onnx: bool, + save_fp16_onnx: bool, +): """ API change model for kv retention and export to ONNX. 
KV Style Approach- @@ -226,58 +202,45 @@ def convert_to_cloud_kvstyle( Returns: Path of exported ONNX file. """ - if onnx_dir_path is None: - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - onnx_dir_path = os.path.join(model_card_dir, "onnx") - if os.path.exists(onnx_dir_path): logger.warning(f"Overriding {onnx_dir_path}") shutil.rmtree(onnx_dir_path) if not (save_fp32_onnx or save_fp16_onnx): raise AttributeError("save_fp32_onnx and save_fp16_onnx can't be false") + - if model_class is None and model_kv is None: - raise AttributeError("model_class and model_kv both can't be None") + if tokenizer.padding_side != "left": + logger.warning("Please use padding_side='left' while initializing the tokenizer") + tokenizer.padding_side = "left" - if model_kv is not None: - if not getattr(model_kv, "qeff_transformed", False): - raise AttributeError( - "Model is not transformed, Please first use QEfficient.transform to transform the model." - ) - model = model_kv - else: - try: - if hf_token: - login(hf_token) - model_hf_path = hf_download( - repo_id=model_name, - cache_dir=Constants.CACHE_DIR, - ignore_pattrens=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"], - ) - model = model_class.from_pretrained(model_hf_path, cache_dir=Constants.CACHE_DIR, use_cache=True) - except Exception as e: - print(f"Failed to download the {model_name} model from Huggingface:%s", e) - transform(model, form_factor="cloud") + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id + + assert qeff_model.is_transformed, f"please pass the {qeff_model.__class__.__name__} after transform API" # Decide path for saving exported ONNX files. - model_base_name = model_name.replace("/", "_") + "_kv" - os.makedirs(onnx_dir_path, exist_ok=True) + fp32_model_name, fp16_model_name = export_kvstyle_transformed_model_to_onnx(model_name, qeff_model.model, tokenizer, onnx_dir_path, seq_len, save_fp32_onnx, save_fp16_onnx) # type: ignore - # Load tokenizer - if tokenizer is None: - # todo(ochougul): use cache dir from snapshot download - tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left") - else: - if tokenizer.padding_side != "left": - logger.warning("Please use padding_side='left' while initializing the tokenizer") - tokenizer.padding_side = "left" + # return the model path for automation. 
+ if return_path: + if save_fp16_onnx: + return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp16_model_name}.onnx") + else: + return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp32_model_name}.onnx") - if tokenizer.pad_token_id is None: - tokenizer.pad_token_id = tokenizer.eos_token_id + +def export_kvstyle_transformed_model_to_onnx(model_name: str, transformed_model: torch.nn.Module, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + onnx_dir_path: str, seq_len: int, save_fp32_onnx: Optional[bool] = False, save_fp16_onnx: Optional[bool] = True): + + if tokenizer.padding_side != "left": + logger.warning("Please use padding_side='left' while initializing the tokenizer") + tokenizer.padding_side = "left" + + tokenizer.pad_token_id = tokenizer.eos_token_id if tokenizer.pad_token_id is None else tokenizer.pad_token_id # Disabling requires_grad on all parameters - for j, p in enumerate(model.parameters()): + for j, p in enumerate(transformed_model.parameters()): p.requires_grad_(False) # Preprocess inputs @@ -303,11 +266,10 @@ def convert_to_cloud_kvstyle( else: inputs = tokenizer(input_str, return_tensors="pt") - try: - pt_outputs = model(**inputs) - output_names = list(pt_outputs.keys()) - except Exception as e: - print(f"Model {model_name} Execution failed in pytorch:%s", e) + + pt_outputs = transformed_model(**inputs) + output_names = list(pt_outputs.keys()) + # Raise error if expected outputs are not present assert "logits" in output_names, "logits not found in output" @@ -325,11 +287,9 @@ def convert_to_cloud_kvstyle( inputs["past_key_values"] = tuple([(key.detach(), value.detach()) for key, value in pt_outputs.past_key_values]) # Run PyTorch inference with past - try: - pt_outputs = model(**inputs) - output_names = list(pt_outputs.keys()) - except Exception as e: - print(f"Model {model_name} Execution failed in pytorch:%s", e) + pt_outputs = transformed_model(**inputs) + output_names = list(pt_outputs.keys()) + # Add pkv into output_names pkv = tuple([(key.detach(), value.detach()) for key, value in pt_outputs.past_key_values]) @@ -344,9 +304,12 @@ def convert_to_cloud_kvstyle( pt_outputs[f"past_key.{i}_RetainedState"] = key pt_outputs[f"past_value.{i}_RetainedState"] = value + + model_base_name = model_name.replace("/", "_") + "_kv" + os.makedirs(onnx_dir_path, exist_ok=True) # Export and simplify ONNX model fp32_model_name = export_onnx( - pt_model=model, + pt_model=transformed_model, inputs=inputs, output_names=output_names, gen_models_path=onnx_dir_path, @@ -405,39 +368,93 @@ def convert_to_cloud_kvstyle( inputs=inputs, input_list_file=input_list_file, ) + + return fp32_model_name, fp16_model_name + + +def export_for_cloud(model_name: str, qeff_model: QEFFBaseModel, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + onnx_dir_path: str, seq_length: int = Constants.seq_length, + return_path: bool = True, + save_fp32_onnx: bool = False, + save_fp16_onnx: bool = True)-> Tuple[str, str]: + # FIXME: move all this to class instead of here, and just call qeff_model.export here. 
+ if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: # type: ignore + return export_lm_model_for_cloud(model_name=model_name, + qeff_model=qeff_model, # type: ignore + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + seq_length=seq_length, + return_path=return_path, + save_fp16_onnx=save_fp16_onnx, + save_fp32_onnx=save_fp32_onnx) + else: + raise NotImplementedError(f"Only model type {QEFFAutoModelForCausalLM.__class__.__name__} is supported for export, got {type(qeff_model)}") + + +def export_lm_model_for_cloud(model_name:str, qeff_model: QEFFAutoModelForCausalLM, + tokenizer:Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + onnx_dir_path: str, seq_length: int, return_path:bool, + save_fp32_onnx:bool, save_fp16_onnx: bool): + if os.path.exists(onnx_dir_path): + logger.warning(f"Overriding {onnx_dir_path}") + shutil.rmtree(onnx_dir_path) + if not (save_fp32_onnx or save_fp16_onnx): + raise AttributeError("save_fp32_onnx and save_fp16_onnx can't be false") + + if tokenizer.padding_side != "left": + logger.warning("Please use padding_side='left' while initializing the tokenizer") + tokenizer.padding_side = "left" + + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id + + + if qeff_model.is_transformed: + fp32_model_name, fp16_model_name = export_kvstyle_transformed_model_to_onnx( + model_name=model_name, + transformed_model=qeff_model.model, + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + seq_len=seq_length, + save_fp32_onnx=save_fp32_onnx, + save_fp16_onnx=save_fp16_onnx) # type: ignore + + else: + fp32_model_name, fp16_model_name = export_bertstyle_model_to_onnx( + model_name=model_name, + model=qeff_model.model, + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + seq_len=seq_length, + save_fp32_onnx=save_fp32_onnx, + save_fp16_onnx=save_fp16_onnx) # type: ignore + + # return the model path for automation. if return_path: if save_fp16_onnx: return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp16_model_name}.onnx") else: return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp32_model_name}.onnx") - else: - return - - -def convert_to_edge(self) -> None: - # [TODO]: Apply the class transformation to make changes for the KV models in edge use cases - # model = QEfficient.transform(model_hf, type="Transformers", form_factor="edge") - # model.eval() - raise NotImplementedError("Oops...reached too far!!") def qualcomm_efficient_converter( model_name: str, - model_class: type = None, - model_kv: torch.nn.Module = None, - tokenizer=None, - onnx_dir_path=None, - hf_token: str = "", + model_kv: QEFFBaseModel = None, # type: ignore + tokenizer: Optional[Union[PreTrainedTokenizer, PreTrainedTokenizerFast]]=None, + cache_dir: Optional[str] = None, + onnx_dir_path: Optional[str]=None, + hf_token: Optional[str] = None, seq_length: int = Constants.seq_length, - input_str: str = Constants.input_str, kv: bool = True, - return_path: bool = False, - form_factor="cloud", + return_path: bool = True, + form_factor: str="cloud", save_fp32_onnx: bool = False, save_fp16_onnx: bool = True, -) -> str: +) -> Tuple[str, str]: + """ API to convert torch Bert style and KV style model to ONNX. --------- @@ -458,36 +475,34 @@ def qualcomm_efficient_converter( Returns: Path of exported ONNX file. 
""" - if model_kv is not None and not kv: - raise AttributeError("For transformed model kv must be True") + # Get model_kv first + model_kv = model_kv if model_kv else QEFFCommonLoader.from_pretrained(pretrained_model_name_or_path=model_name, hf_token=hf_token, cache_dir=cache_dir) + + # Transform if required + if model_kv.is_transformed and not kv: + raise AttributeError("Transformed model is passed while requsting to convert non-transformed model") + + model_kv = model_kv if model_kv.is_transformed else QEfficient.transform(model_kv) if kv else model_kv + if onnx_dir_path is None: + model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) + onnx_dir_path = os.path.join(model_card_dir, "onnx") + + # Load tokenizer if not passed + tokenizer = tokenizer if tokenizer else load_hf_tokenizer(model_name=model_name, hf_token=hf_token, cache_dir=cache_dir) + if form_factor == "cloud": - if kv: - return convert_to_cloud_kvstyle( - model_name=model_name, - model_class=model_class, - model_kv=model_kv, - onnx_dir_path=onnx_dir_path, - tokenizer=tokenizer, - hf_token=hf_token, - seq_len=seq_length, - input_str=input_str, - return_path=return_path, - save_fp32_onnx=save_fp32_onnx, - save_fp16_onnx=save_fp16_onnx, - ) - else: - return convert_to_cloud_bertstyle( - model_name=model_name, - model_class=model_class, - tokenizer=tokenizer, - onnx_dir_path=onnx_dir_path, - hf_token=hf_token, - seq_len=seq_length, - input_str=input_str, - return_path=return_path, - save_fp32_onnx=save_fp32_onnx, - save_fp16_onnx=save_fp16_onnx, - ) + return export_for_cloud( + model_name=model_name, + qeff_model=model_kv, + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + seq_length=seq_length, + return_path=return_path, + save_fp16_onnx=save_fp16_onnx, + save_fp32_onnx=save_fp32_onnx) else: - return convert_to_edge() + # [TODO]: Apply the class transformation to make changes for the KV models in edge use cases + # model = QEfficient.transform(model_hf, type="Transformers", form_factor="edge") + # model.eval() + raise NotImplementedError("Oops! Reached too far!!") diff --git a/QEfficient/exporter/export_utils.py b/QEfficient/exporter/export_utils.py index 324bcc092..fc71c9747 100644 --- a/QEfficient/exporter/export_utils.py +++ b/QEfficient/exporter/export_utils.py @@ -5,12 +5,10 @@ # # ----------------------------------------------------------------------------- -import json import os import shutil -import subprocess import sys -from logging import error, info +from logging import info from typing import Dict, List, Tuple, Union import numpy as np @@ -95,8 +93,8 @@ def export_onnx( custom_opsets={"com.qti.aisw.onnx": 1}, ) except Exception as e: - error("Exporting to ONNX failed. {}".format(e)) - return + raise RuntimeError("Exporting to ONNX failed. {}".format(e)) + onnx.checker.check_model(f"{gen_models_path}_tmp/{model_base_name}.onnx") loaded_model = onnx.load(f"{gen_models_path}_tmp/{model_base_name}.onnx") @@ -337,7 +335,7 @@ def generate_input_files( fp.write(",".join(filenames)) fp.write("\n") - +# FIXME(ochougul/quic-mamta): Remove duplication with APIRunner def run_model_on_ort( onnx_path: str, inputs: Dict[str, torch.Tensor], @@ -396,121 +394,3 @@ def run_model_on_ort( print(f"Failed to run the onnx {onnx_path} model in onnx runtime:%s", e) print("\n=============================================================\n") return input_names, None - - -def run_model_on_cloud_ai_100( - onnx_path: str, - onnx_symbol_defs: Dict[str, int] = {}, - **kwargs, -) -> bool: - - """ - API to run model on Cloud AI 100. 
- --------- - :param onnx_path: str. Path of the ONNX file. - :ONNX_symbol_defs: Dict[str, int] = {}. Custom ONNX symbols definition. - - Return: - True if model run successfully on Cloud AI 100. - """ - - args = [ - "/opt/qti-aic/exec/qaic-exec", - f"-m={onnx_path}", - "-aic-hw", - "-aic-hw-version=2.0", - ] - for onnx_symbol, onnx_def in onnx_symbol_defs.items(): - args.append(f"-onnx-define-symbol={onnx_symbol},{onnx_def}") - for k, v in kwargs.items(): - k = k.replace("_", "-") - if isinstance(v, bool): - if v: - args.append(f"-{k}") - continue - args.append(f"-{k}={v}") - - info("Running compiler:", " ".join(args)) - result = subprocess.run(args) - return result.returncode == 0 - - -def compile_kv_model_on_cloud_ai_100( - onnx_path: str, - specializations_json: str, - num_cores: int, - base_path: str, - mxfp6: bool, - custom_io_path: str, - aic_enable_depth_first: bool, - mos: int = -1, - device_group: List[int] = [0], - **kwargs, -) -> bool: - - """ - API to compile model Cloud AI 100. - --------- - :param onnx_path: str. Path of the ONNX file. - :specializations_json: str. Path of specializations.json file. - :num_cores: int. Number of cores to use during compilation. - :base_path: str. Path where intermediate files and compiled artifacts will be stored. - :mxfp6: bool. If true, it enables MXFP6 (Mixed Precision Floating Point 6) mode during compilation. - :custom_io_path: Path to a custom I/O configuration file. - :aic_enable_depth_first: bool. If true, it enables during compilation. - :mos: int. Effort level to reduce the on-chip memory. - :device_group: List[int]. List of device group IDs. - - Return: - True if model compiled successfully. - """ - - aic_binary_dir = os.path.join(base_path, "qpcs") - - if os.path.isdir(aic_binary_dir): - shutil.rmtree(aic_binary_dir) - - assert os.path.isfile( - specializations_json - ), f"Please use 'from QEfficient.cloud.compile import main as compile', as {specializations_json} file was not found" - assert os.path.isfile(custom_io_path), f"{custom_io_path} file was not found!" - command = [ - "/opt/qti-aic/exec/qaic-exec", - f"-m={onnx_path}", - "-aic-hw", - "-aic-hw-version=2.0", - f"-network-specialization-config={specializations_json}", - "-convert-to-fp16", - "-retained-state", - f"-aic-num-cores={num_cores}", - f"-custom-IO-list-file={custom_io_path}", - "-compile-only", - f"-aic-binary-dir={aic_binary_dir}", - ] - if mxfp6: - command.append("-mxfp6-matmul") - if mos > 0: - command.append(f"-mos={mos}") - if aic_enable_depth_first: - command.append("-aic-enable-depth-first") - if len(device_group) > 1: - mdp_ts_config = { - "connections": [{"devices": list(range(len(device_group))), "type": "p2p"}], - "partitions": [ - { - "name": "Partition0", - "devices": [{"deviceId": device, "numCores": num_cores} for device in range(len(device_group))], - } - ], - } - mdp_ts_config_path = os.path.join(base_path, "mdp_ts_config.json") - with open(mdp_ts_config_path, "w") as file: - json.dump(mdp_ts_config, file, indent=4) - command.append(f"-mdp-load-partition-config={mdp_ts_config_path}") - print("Running AI 100 compiler:", " ".join(command)) - result = subprocess.run(command, capture_output=True, text=True) - if result.returncode != 0: - raise RuntimeError(f"Compilation Failed!!\n\nSTDOUT\n{result.stdout}\n\nSTDERR\n{result.stderr}") - - print("\n===================== Compilation Done! 
=====================\n") - return result.returncode == 0, aic_binary_dir diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 97200dfca..2225750a3 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -117,7 +117,7 @@ def get_compilation_batch_size(qpc_path: str): return compilation_batch_size -def check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size): +def check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) -> List[str]: assert ( prompt is not None or prompts_txt_file_path is not None ), "Please pass atleast one argument either using --prompt or --prompts_txt_file_path" diff --git a/QEfficient/src/__init__.py b/QEfficient/src/__init__.py new file mode 100644 index 000000000..854686567 --- /dev/null +++ b/QEfficient/src/__init__.py @@ -0,0 +1,9 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +from QEfficient.src._transformers.auto import QEffAutoModel, QEFFAutoModelForCausalLM # noqa: F401 +from QEfficient.src.common import QEFFCommonLoader # noqa: F401 diff --git a/QEfficient/src/_transformers/__init__.py b/QEfficient/src/_transformers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/QEfficient/src/_transformers/auto.py b/QEfficient/src/_transformers/auto.py new file mode 100644 index 000000000..de01a0840 --- /dev/null +++ b/QEfficient/src/_transformers/auto.py @@ -0,0 +1,92 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- + +from typing import Any + +import torch.nn as nn +from transformers.models.auto import AutoModel, AutoModelForCausalLM +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING + +import QEfficient +from QEfficient.src.base import QEFFBaseModel +from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict + +# Dictionary that defines the interface from transformers to be used underneath the QEFF interface +QEFFAutoModelToTransformersAutoModelMap = { + "QEFFAutoModelForCausalLM": AutoModelForCausalLM, + "QEFFAutoModel": AutoModel, +} + + +class QEFFTransformersBase(QEFFBaseModel): + """ + Parent class for models QEFF provides from transformers i.e. (AutoModel, AutoModelForCausalLM, AutoModelForAudioClassification etc.) from src/transformers/models/auto/modeling_auto.py file. + """ + def __init__(self, model: nn.Module, transform:bool = True) -> None: + assert (model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values() or + # FIXME: Use model architectures here instead of complete dictionary TransformersToQEffModulesDict + model.__class__ in TransformersToQEffModulesDict.values()), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. 
{MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore + self.model: nn.Module = model + if transform: + self.transform() + + def __repr__(self) -> str: + return self.model.__repr__() + + @property + def is_transformed(self) -> bool: + return getattr(self.model, "qeff_transformed", False) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): + """ + This method accepts All the parameters that are acceptable by transformers.AutoModelForCausalLM. + There are few additional parameters that this method can take: + :param transform:bool. Whether to optimize model for KV retention; default is True. Pass False to get BertStyle model. + """ + transform: bool = kwargs.get("transform", True) + kwargs.update({"use_cache": True}) # Always pass use_cache = True, to get KV values as output during ONNX export + + model = QEFFAutoModelToTransformersAutoModelMap[cls.__name__].from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + return cls(model, transform=transform) + + + def transform_export(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") + + def transform_export_compile(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") + + def transform(self): + # FIXME: break down transform into optmization passes i.e. HW specific optimization(RMSNorm), KV retention pass etc. + QEfficient.transform(self) + return self + + +class QEFFAutoModelForCausalLM(QEFFTransformersBase): + """ + QEFF class for manipulating any causal language model from HuggingFace hub. + """ + def execute(self, *args, **kwargs): # type: ignore + raise NotImplementedError("Reached too far!!") + + def export(self): + raise NotImplementedError("Reached too far!!") + + def compile(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") + + +class QEffAutoModel(QEFFTransformersBase): + def execute(self, *args, **kwargs): # type: ignore + raise NotImplementedError("Reached too far!!") + + def export(self): + raise NotImplementedError("Reached too far!!") + + def compile(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") diff --git a/QEfficient/src/base.py b/QEfficient/src/base.py new file mode 100644 index 000000000..ddc23fc87 --- /dev/null +++ b/QEfficient/src/base.py @@ -0,0 +1,99 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- + +""" +** This file for holds the classes that handle main functions +1.load i.e. from_pretrained +2.execute +3.transform +4.export +5.compile +For different varities of Transformer Models + +Representation of class inheritence followed keeping in line with transformers/diffusers repos -> + + QEFFBaseModel + ________________________________________________|________________________________________________________________ + | | + QEFFTransformersBase QEFFDiffusersBase + | | + ____________|________________________________________________________ ________________ _________________|______________ + _____ | | | | | | + | QEFFAutoModel QEFFAutoModelForCausalLM QEFFAWQModelForCausalLM ... ... ... 
+QEFFCommonLoader -| [Provides way to       [Provides way to do 1-5 on           [Supports 1-5 for
+[Provides        |  do steps 1-5 on         transformers.AutoModelForCausalLM]   AWQ Models]
+interface to     |_____ transformers.AutoModel]
+Load any of
+These models
+by automatically
+detecting the type
+of the model]
+
+** QEFFBaseModel is an abstract base class that defines the basic structure of these classes.
+** QEFFPipeline classes will stay at the same level as QEFFAutoModel in this hierarchy in the future.
+"""
+
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import Any
+
+
+# Defining placeholder ENUM for execute function
+class Runtime(Enum):
+    CPU_ORT = "CPU ONNX Runtime"
+    CPU_PT = "CPU PyTorch Runtime"
+    AI_100 = "AI_100"
+
+
+class QEFFBaseModel(ABC):
+    """
+    This class acts as the parent class for all the varieties of model classes (i.e. LLMs, SD, quantized etc.).
+    Enforces certain methods to be implemented by child classes.
+
+    All the child classes must provide capabilities to load, transform (optimize), export to ONNX, etc.
+    """
+    def __init__(self) -> None:
+        super().__init__()
+        # Users can call generate or execute
+        self.generate = self.execute
+        self._runtime = Runtime.CPU_PT
+
+    @property
+    def runtime(self) -> Runtime:
+        return self._runtime
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs):
+        raise NotImplementedError("Must implement for child classes")
+
+    @property
+    def is_transformed(self) -> bool:
+        raise NotImplementedError("Must implement for child classes")
+
+    @abstractmethod
+    def transform_export(self, *args, **kwargs) -> Any:
+        pass
+
+    @abstractmethod
+    def transform_export_compile(self, *args, **kwargs) -> Any:
+        pass
+
+    @abstractmethod
+    def execute(self, *args, **kwargs) -> Any:
+        pass
+
+    @abstractmethod
+    def transform(self, *args, **kwargs) -> Any:
+        pass
+
+    @abstractmethod
+    def export(self, *args, **kwargs) -> Any:
+        pass
+
+    @abstractmethod
+    def compile(self, *args, **kwargs) -> Any:
+        pass
\ No newline at end of file
diff --git a/QEfficient/src/common.py b/QEfficient/src/common.py
new file mode 100644
index 000000000..bca391097
--- /dev/null
+++ b/QEfficient/src/common.py
@@ -0,0 +1,86 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+"""
+MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP dictionary defines the mapping between the names of the varieties of Transformer models defined in
+QEFF_MODEL_TYPE and the classes that implement the methods (i.e. compile, export etc.) for those types.
+
+QEFFAutoModel provides a common interface for loading the HuggingFace models using either the HF card name or the local path of a downloaded model.
+"""
+import os
+from enum import Enum
+from typing import Any, Dict, Type
+
+from transformers import AutoConfig
+from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING
+
+from QEfficient.src._transformers.auto import QEFFAutoModelForCausalLM
+from QEfficient.src.base import QEFFBaseModel
+from QEfficient.utils._utils import login_and_download_hf_lm
+
+
+class QEFF_MODEL_TYPE(Enum):
+    """
+    Defines the names of the different varieties of transformer models.
+    """
+    CAUSALLM = "LLM"
+    DIFFUSION = "STABLE_DIFFUSION"
+    AWQ = "AWQ"
+
+
+MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP: Dict[QEFF_MODEL_TYPE, Type[QEFFBaseModel]] = {
+    QEFF_MODEL_TYPE.CAUSALLM: QEFFAutoModelForCausalLM
+}
+
+AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP: Dict[Type[QEFFBaseModel], QEFF_MODEL_TYPE] = {v:k for k,v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items()}
+
+def get_hf_model_type(hf_model_path: str) -> QEFF_MODEL_TYPE:
+    """
+    Loads the model config file and returns the type of the model (i.e. LLM, SD, quantized etc.) as supported by the library.
+    """
+    assert os.path.isdir(hf_model_path), "Please pass a local dir path where the model is downloaded; use `QEfficient.utils.login_and_download_hf_lm` for downloading the hf model"
+    config, kwargs = AutoConfig.from_pretrained(
+        hf_model_path,
+        return_unused_kwargs=True,
+    )
+
+    if config.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING:
+        # FIXME: Add logic to handle if quantization config is stored in separate quant_config.json outside of config, also create a separate function for this and below lines
+        quant_config = getattr(config, "quantization_config", getattr(config, "quant_config", None))
+        if quant_config is not None:
+            if quant_config.get("quant_method", None) == "awq":
+                return QEFF_MODEL_TYPE.AWQ
+            else:
+                raise NotImplementedError(f"current model type is not yet supported {type(config)}")
+        else:
+            return QEFF_MODEL_TYPE.CAUSALLM
+    else:
+        raise NotImplementedError(f"model type {type(config)} is not yet supported")
+
+
+class QEFFCommonLoader:
+    """
+    Provides the same model loading interface as the HuggingFace transformers APIs.
+    Supports loading any model on HuggingFace.
+    """
+    def __init__(self, *args: Any, **kwds: Any) -> None:
+        raise EnvironmentError(
+            f"{self.__class__.__name__} is designed to be instantiated "
+            f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)`")
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> QEFFBaseModel:
+        """
+        Downloads the HuggingFace model if it doesn't already exist locally and returns a QEffAutoModel object based on the type of the model.
+ """ + pretrained_model_name_or_path = pretrained_model_name_or_path if os.path.isdir(pretrained_model_name_or_path) \ + else login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs) + model_type = get_hf_model_type(hf_model_path=pretrained_model_name_or_path) + qeff_auto_model_class = MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP[model_type] + assert issubclass(qeff_auto_model_class, QEFFBaseModel), f"Expected class that inherits {QEFFBaseModel}, got {type(qeff_auto_model_class)}" + + return qeff_auto_model_class.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path) diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index 5ad29ef3d..753d08204 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -5,11 +5,10 @@ # # ----------------------------------------------------------------------------- -import hashlib from collections import namedtuple +from typing import Dict, Type import torch.nn as nn -import transformers from transformers.models.codegen.modeling_codegen import ( CodeGenAttention, CodeGenBlock, @@ -34,32 +33,18 @@ ) from transformers.models.mixtral.modeling_mixtral import ( MixtralAttention, + MixtralBLockSparseTop2MLP, + MixtralDecoderLayer, MixtralForCausalLM, MixtralModel, - MixtralDecoderLayer, - MixtralSparseMoeBlock, - MixtralBLockSparseTop2MLP, - MixtralRotaryEmbedding, MixtralRMSNorm, + MixtralRotaryEmbedding, + MixtralSparseMoeBlock, ) from transformers.models.mpt.modeling_mpt import MptAttention, MptBlock, MptForCausalLM, MptModel from QEfficient.customop import CustomRMSNormAIC -from QEfficient.utils.logging_utils import logger -from .modeling_attn_mask_utils import ( - QEffAttentionMaskConverter, - _qeff_prepare_4d_attention_mask, - _qeff_prepare_4d_causal_attention_mask, -) -from .modeling_outputs import ( - QEffBaseModelOutputWithPast, - QEffBaseModelOutputWithPastAndCrossAttentions, - QEffCausalLMOutputWithCrossAttentions, - QEffCausalLMOutputWithPast, - QEffMoeCausalLMOutputWithPast, - QEffMoeModelOutputWithPast, -) from .models.codegen.modeling_codegen import ( QEffCodeGenAttention, QEffCodeGenBlock, @@ -81,13 +66,13 @@ QEffMistralRotaryEmbedding, ) from .models.mixtral_moe.modeling_mixtral import ( - QEffMixtralModel, - QEffMixtralRotaryEmbedding, QEffMixtralAttention, - QEffMixtralForCausalLM, + QEffMixtralBLockSparseTop2MLP, QEffMixtralDecoderLayer, + QEffMixtralForCausalLM, + QEffMixtralModel, + QEffMixtralRotaryEmbedding, QEffMixtralSparseMoeBlock, - QEffMixtralBLockSparseTop2MLP, ) from .models.mpt.modeling_mpt import QEffMptAttention, QEffMptBlock, QEffMptForCausalLM, QEFfMptModel @@ -108,7 +93,7 @@ # Define a transformers layers to QEff layers dictionary # While onboarding new models make sure to add the new layer maps to this dictionary. -TransformersToQEffModulesDict = { +TransformersToQEffModulesDict: Dict[Type[nn.Module], Type[nn.Module]] = { # GPT model layers GPT2Model: QEffGPT2Model, GPT2Block: QEffGPT2Block, @@ -147,87 +132,3 @@ MixtralSparseMoeBlock: QEffMixtralSparseMoeBlock, MixtralBLockSparseTop2MLP:QEffMixtralBLockSparseTop2MLP, } - - -def get_params_hash(model: nn.Module) -> str: - """ - Creates a Hash of all the parameters values i.e. weights using SHA256 algo. - -------- - :param model: torch.nn.Module. Base PyTorch model. - :returns: str. 
Hash string - """ - hasher = hashlib.sha256() - for _, params in model.named_parameters(): - hasher.update(params.data.numpy().tobytes()) - - return hasher.hexdigest() - - -def replace_module_with_qeff_layers(model: nn.Module) -> None: - """ - Replaces the transformers nn.Module classes with optmized QEff classes in place. - ---------- - :param model: torch.nn.Module. Base PyTorch model. - """ - # Replace if module class is registed in TransformersToQEffModulesDict - target_module = TransformersToQEffModulesDict.get(model.__class__) - if target_module is not None: - model.__class__ = target_module - - # Iterate over child modules - for _, module in model.named_children(): - replace_module_with_qeff_layers(module) - - -def transform(model: nn.Module, form_factor: str = "cloud") -> nn.Module: - """ - Replaces some Transformers' methods for equivalent methods optimized for AI 100. - --------- - Args: - param model (torch.nn.Module): PyTorch model. - form_factor(str): form factor configuration for optmizing the model, available options=["cloud", "edge"]. - - Returns: - torch.nn.Module: PyTorch Module with replaced QEff layers. - """ - - # Introducnig qeff_transformed attribue in model to check status of transform - if getattr(model, "qeff_transformed", False): - print("Model is already transformed") - return model - - - if form_factor == "cloud": - # Get Hash of all params for checking later - prior_params_hash = get_params_hash(model) - logger.warning(f"The model {model.__class__} layers has been upadted to QEff layers in-place") - # Replace with QEff layers - replace_module_with_qeff_layers(model) - - # Check with new params hash - later_params_hash = get_params_hash(model) - assert ( - prior_params_hash == later_params_hash - ), "Weights were changed in the transform process, please report an issue" - - # Replace the modeling output classes - transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions = ( - QEffBaseModelOutputWithPastAndCrossAttentions - ) - transformers.modeling_outputs.CausalLMOutputWithCrossAttentions = QEffCausalLMOutputWithCrossAttentions - transformers.modeling_outputs.BaseModelOutputWithPast = QEffBaseModelOutputWithPast - transformers.modeling_outputs.CausalLMOutputWithPast = QEffCausalLMOutputWithPast - transformers.modeling_outputs.MoeCausalLMOutputWithPast = QEffMoeCausalLMOutputWithPast - transformers.modeling_outputs.MoeModelOutputWithPast = QEffMoeModelOutputWithPast - - # Replace the modeling attn util classes and functions - transformers.modeling_attn_mask_utils.AttentionMaskConverter = QEffAttentionMaskConverter - transformers.modeling_attn_mask_utils._prepare_4d_attention_mask = _qeff_prepare_4d_attention_mask - transformers.modeling_attn_mask_utils._prepare_4d_causal_attention_mask = _qeff_prepare_4d_causal_attention_mask - - setattr(model,'qeff_transformed',True) - return model.eval() - - elif form_factor == "edge": - # Add changes for the edge usecase - raise NotImplementedError("We currently only support cloud form factor!") diff --git a/QEfficient/transformers/transform.py b/QEfficient/transformers/transform.py new file mode 100644 index 000000000..dfd0de5c7 --- /dev/null +++ b/QEfficient/transformers/transform.py @@ -0,0 +1,123 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import hashlib + +import torch.nn as nn +import transformers + +from QEfficient.src.base import QEFFBaseModel +from QEfficient.src.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE +from QEfficient.transformers.modeling_attn_mask_utils import ( + QEffAttentionMaskConverter, + _qeff_prepare_4d_attention_mask, + _qeff_prepare_4d_causal_attention_mask, +) +from QEfficient.transformers.modeling_outputs import ( + QEffBaseModelOutputWithPast, + QEffBaseModelOutputWithPastAndCrossAttentions, + QEffCausalLMOutputWithCrossAttentions, + QEffCausalLMOutputWithPast, + QEffMoeCausalLMOutputWithPast, + QEffMoeModelOutputWithPast, +) +from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict +from QEfficient.utils.logging_utils import logger + + +def replace_module_with_qeff_layers(model: nn.Module) -> None: + """ + Replaces the transformers nn.Module classes with optmized QEff classes in place. + ---------- + :param model: torch.nn.Module. Base PyTorch model. + """ + # Replace if module class is registed in TransformersToQEffModulesDict + target_module = TransformersToQEffModulesDict.get(model.__class__) + if target_module is not None: + model.__class__ = target_module + + # Iterate over child modules + for _, module in model.named_children(): + replace_module_with_qeff_layers(module) + + +def get_params_hash(model: nn.Module) -> str: + """ + Creates a Hash of all the parameters values i.e. weights using SHA256 algo. + -------- + :param model: torch.nn.Module. Base PyTorch model. + :returns: str. Hash string + """ + hasher = hashlib.sha256() + for _, params in model.named_parameters(): + hasher.update(params.data.numpy().tobytes()) + + return hasher.hexdigest() + + +def transform_lm(model: nn.Module) -> nn.Module: + """ + Replaces some Transformers torch.nn.Module layers for equivalent optimized modules for cloud AI 100. + --------- + Args: + param model (torch.nn.Module): PyTorch model. + + Returns: + torch.nn.Module: PyTorch Module with replaced QEff layers. 
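The in-place module swap above works by reassigning `__class__`, which changes an instance's behaviour without touching its registered parameters. A self-contained toy illustration of that mechanism (toy classes, not QEfficient code):

import torch
import torch.nn as nn


class Base(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(4, 4)

    def forward(self, x):
        return self.fc(x)


class Optimized(Base):  # same attributes, different forward
    def forward(self, x):
        return self.fc(x) * 2.0


m = Base()
before = [p.clone() for p in m.parameters()]
m.__class__ = Optimized  # same trick replace_module_with_qeff_layers relies on
assert all(torch.equal(a, b) for a, b in zip(before, m.parameters()))  # weights untouched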
+ """ + + # Introducnig qeff_transformed attribue in model to check status of transform + if getattr(model, "qeff_transformed", False): + print("Model is already transformed") + return model + + # Get Hash of all params for checking later + prior_params_hash = get_params_hash(model) + logger.warning(f"The model {model.__class__} layers has been upadted to QEff layers in-place") + # Replace with QEff layers + replace_module_with_qeff_layers(model) + + # Check with new params hash + later_params_hash = get_params_hash(model) + assert ( + prior_params_hash == later_params_hash + ), "Weights were changed in the transform process, please report an issue" + + # Replace the modeling output classes + transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions = ( + QEffBaseModelOutputWithPastAndCrossAttentions + ) + transformers.modeling_outputs.CausalLMOutputWithCrossAttentions = QEffCausalLMOutputWithCrossAttentions + transformers.modeling_outputs.BaseModelOutputWithPast = QEffBaseModelOutputWithPast + transformers.modeling_outputs.CausalLMOutputWithPast = QEffCausalLMOutputWithPast + transformers.modeling_outputs.MoeCausalLMOutputWithPast = QEffMoeCausalLMOutputWithPast + transformers.modeling_outputs.MoeModelOutputWithPast = QEffMoeModelOutputWithPast + + # Replace the modeling attn util classes and functions + transformers.modeling_attn_mask_utils.AttentionMaskConverter = QEffAttentionMaskConverter + transformers.modeling_attn_mask_utils._prepare_4d_attention_mask = _qeff_prepare_4d_attention_mask + transformers.modeling_attn_mask_utils._prepare_4d_causal_attention_mask = _qeff_prepare_4d_causal_attention_mask + + setattr(model,'qeff_transformed',True) + return model.eval() + + +def transform(model: QEFFBaseModel, form_factor="cloud"): + """ + This function serves for optimizing any kind of model (i.e. LLM, SD, AWQ etc.) for cloud AI 100. + Will replace the torch.nn.Module layers of passed QEffModel with optimized implementation of the same. + + model: object of any instance of class that is child of `QEFFBaseAutoModelFactory` + form_factor(str): form factor configuration for optmizing the model, available options=["cloud", "edge"]. + """ + assert form_factor == "cloud", "Only form_factor='cloud' is supported as of now!" 
+ #FIXME: move this to class and use model.transform() + if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: + transform_lm(model.model) # type: ignore + return model + else: + raise NotImplementedError(f"Recieved unsupported class of type {type(model)}") \ No newline at end of file diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 486bae664..bd6b59120 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -5,96 +5,11 @@ # # ----------------------------------------------------------------------------- -import os -from typing import List, Optional, Tuple, Union - -import requests -from huggingface_hub import snapshot_download -from requests.exceptions import HTTPError - -from QEfficient.utils.constants import QEFF_MODELS_DIR - - -def hf_download( - repo_id: Optional[str] = None, - cache_dir: Optional[str] = None, - hf_token: Optional[str] = None, - allow_patterns: Optional[List[str]] = None, - ignore_patterns: Optional[List[str]] = None, -): - # Setup cache and local dir - local_dir = None - if cache_dir is not None: - cache_dir = f"{cache_dir}" - local_dir = f"{cache_dir}/{repo_id}" - - os.makedirs(f"{cache_dir}/{repo_id}", exist_ok=True) - max_retries = 5 - retry_count = 0 - while retry_count < max_retries: - try: - model_path = snapshot_download( - repo_id, - cache_dir=cache_dir, - local_dir=local_dir, - local_dir_use_symlinks=True, - revision="main", - resume_download=True, - token=hf_token, - allow_patterns=allow_patterns, - ignore_patterns=ignore_patterns, - ) - break - except requests.ReadTimeout as e: - print(f"Read timeout: {e}") - retry_count += 1 - - except HTTPError as e: - retry_count = max_retries - if e.response.status_code == 401: - print("You need to pass a valid `--hf_token=...` to download private checkpoints.") - else: - raise e - - return model_path - - -def qpc_exists(model_name: str, qpc_base_dir_name: str) -> Union[Tuple[bool, str], None]: - """ - Checks if qpc files already exists, removes the directory if files have been manipulated. - --------- - :param model_name: str. HF Model card name. - :param dir_path: str. Path of qpc directory. - :return: Union[Tuple[bool, str]]: qpc_exists and path to qpc directory - """ - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - os.makedirs(model_card_dir, exist_ok=True) - - qpc_dir_path = os.path.join(model_card_dir, qpc_base_dir_name, "qpcs") - - # Compute the boolean indicating if the QPC exists - qpc_exists_bool = os.path.isdir(qpc_dir_path) and os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin")) - - return qpc_exists_bool, qpc_dir_path - - -def onnx_exists(model_name: str) -> Union[Tuple[bool, str, str], None]: - """ - Checks if qpc files already exists, removes the directory if files have been manipulated. - --------- - :param model_name: str. HF Model card name. 
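Tying the dispatch back to the public entry point, a short sketch based on the test added later in this diff.

import QEfficient
from QEfficient import QEFFCommonLoader

qeff_model = QEFFCommonLoader.from_pretrained("gpt2")
qeff_model = QEfficient.transform(qeff_model)  # CAUSALLM models are routed through transform_lm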
- :return: Union[Tuple[bool, str, str]]: onnx_exists and path to onnx file and directory - """ - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - os.makedirs(model_card_dir, exist_ok=True) - - onnx_dir_path = os.path.join(model_card_dir, "onnx") - onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") - - # Compute the boolean indicating if the ONNX model exists - onnx_exists_bool = os.path.isfile(onnx_model_path) and os.path.isfile( - os.path.join(os.path.dirname(onnx_model_path), "custom_io_fp16.yaml") - ) - - # Return the boolean, onnx_dir_path, and onnx_model_path - return onnx_exists_bool, onnx_dir_path, onnx_model_path +from QEfficient.utils._utils import ( # noqa: F401 + get_qpc_dir_name_infer, + hf_download, + load_hf_tokenizer, + login_and_download_hf_lm, + onnx_exists, + qpc_exists, +) diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py new file mode 100644 index 000000000..7a0d85828 --- /dev/null +++ b/QEfficient/utils/_utils.py @@ -0,0 +1,142 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import os +from typing import List, Optional, Tuple, Union + +import requests +from huggingface_hub import login, snapshot_download +from requests.exceptions import HTTPError +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast + +from QEfficient.utils.constants import QEFF_MODELS_DIR +from QEfficient.utils.logging_utils import logger + + +def login_and_download_hf_lm(model_name, *args, **kwargs): + logger.info(f"loading HuggingFace model for {model_name}") + hf_token = kwargs.pop("hf_token", None) + cache_dir = kwargs.pop("cache_dir", None) + if hf_token is not None: + login(hf_token) + model_name = hf_download( + repo_id=model_name, + cache_dir=cache_dir, + ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], + ) + return model_name + + +def hf_download( + repo_id: Optional[str] = None, + cache_dir: Optional[str] = None, + hf_token: Optional[str] = None, + allow_patterns: Optional[List[str]] = None, + ignore_patterns: Optional[List[str]] = None, +): + # Setup cache and local dir + local_dir = None + if cache_dir is not None: + cache_dir = f"{cache_dir}" + local_dir = f"{cache_dir}/{repo_id}" + + os.makedirs(f"{cache_dir}/{repo_id}", exist_ok=True) + max_retries = 5 + retry_count = 0 + while retry_count < max_retries: + try: + model_path = snapshot_download( + repo_id, + cache_dir=cache_dir, + local_dir=local_dir, + local_dir_use_symlinks=True, + revision="main", + resume_download=True, + token=hf_token, + allow_patterns=allow_patterns, + ignore_patterns=ignore_patterns, + ) + break + except requests.ReadTimeout as e: + print(f"Read timeout: {e}") + retry_count += 1 + + except HTTPError as e: + retry_count = max_retries + if e.response.status_code == 401: + print("You need to pass a valid `--hf_token=...` to download private checkpoints.") + else: + raise e + + return model_path + + +def qpc_exists(model_name: str, qpc_base_dir_name: str) -> Tuple[bool, str]: + """ + Checks if qpc dir exists. + Returns + 1. Boolean variable indicating if qpc files exist + 2. Path of the qpc dir if found. + --------- + :param model_name: str. HF Model card name. + :param dir_path: str. Path of qpc directory. 
+ :return: Union[Tuple[bool, str]]: qpc_exists and path to qpc directory + """ + model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) + os.makedirs(model_card_dir, exist_ok=True) + + qpc_dir_path = os.path.join(model_card_dir, qpc_base_dir_name, "qpcs") + + # Compute the boolean indicating if the QPC exists + qpc_exists_bool = os.path.isdir(qpc_dir_path) and os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin")) + + return qpc_exists_bool, qpc_dir_path + + +def onnx_exists(model_name: str) -> Tuple[bool, str, str]: + """ + Checks if qpc files already exists, removes the directory if files have been manipulated. + --------- + :param model_name: str. HF Model card name. + :return: Union[Tuple[bool, str, str]]: onnx_exists and path to onnx file and directory + """ + model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) + os.makedirs(model_card_dir, exist_ok=True) + + onnx_dir_path = os.path.join(model_card_dir, "onnx") + onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") + + # Compute the boolean indicating if the ONNX model exists + onnx_exists_bool = os.path.isfile(onnx_model_path) and os.path.isfile( + os.path.join(os.path.dirname(onnx_model_path), "custom_io_fp16.yaml") + ) + + # Return the boolean, onnx_dir_path, and onnx_model_path + return onnx_exists_bool, onnx_dir_path, onnx_model_path + + +def load_hf_tokenizer(model_name: str, cache_dir: Optional[str] = None, hf_token: Optional[str] = None, padding_side:str = "left", **kwargs) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: + logger.info(f"Loading Tokenizer for {model_name}") + if hf_token is not None: + login(hf_token) + + # Download tokenizer along with model if it doesn't exist + model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"]) + #FIXME(ochougul): should this always return left padded tokenizer? + tokenizer = AutoTokenizer.from_pretrained(model_hf_path, padding_side=padding_side, trust_remote_code=True, **kwargs) + return tokenizer + + +def get_qpc_dir_name_infer(num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group): + qpc_base_dir_name = ( + f"qpc_{num_cores}cores_{batch_size}BS_{prompt_len}PL_{ctx_len}CL_{mos}MOS_" + + f"{len(device_group)}" + + "devices" + + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else "_fp16_mxint8" if mxint8 else "_fp16") + ) + + return qpc_base_dir_name diff --git a/QEfficient/utils/logging_utils.py b/QEfficient/utils/logging_utils.py index fe42d5ed9..044e6e83f 100644 --- a/QEfficient/utils/logging_utils.py +++ b/QEfficient/utils/logging_utils.py @@ -13,19 +13,20 @@ class QEffFormatter(logging.Formatter): Formatter class used to set colors for printing different logging levels of messages on console. 
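For reference, a quick sketch of the two helpers added above; the compile settings are illustrative values, not defaults of this function.

from QEfficient.utils import get_qpc_dir_name_infer, load_hf_tokenizer

tokenizer = load_hf_tokenizer("gpt2")  # left-padded by default, see the FIXME above
qpc_dir_name = get_qpc_dir_name_infer(
    num_cores=14, mos=-1, batch_size=1, prompt_len=32,
    ctx_len=128, mxfp6=True, mxint8=False, device_group=[0],
)
print(qpc_dir_name)  # qpc_14cores_1BS_32PL_128CL_-1MOS_1devices_mxfp6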
""" - grey = "\x1b[38;20m" - yellow = "\x1b[33;20m" - red = "\x1b[31;20m" - bold_red = "\x1b[31;1m" - reset = "\x1b[0m" - format = "%(levelname)s - %(name)s - %(message)s (%(filename)s:%(lineno)d)" + cyan: str = "\x1b[38;5;14m" + yellow: str = "\x1b[33;20m" + red: str = "\x1b[31;20m" + bold_red: str = "\x1b[31;1m" + reset: str = "\x1b[0m" + common_format: str = "%(levelname)s - %(name)s - %(message)s" # type: ignore + format_with_line_info = "%(levelname)s - %(name)s - %(message)s (%(filename)s:%(lineno)d)" # type: ignore FORMATS = { - logging.DEBUG: grey + format + reset, - logging.INFO: grey + format + reset, - logging.WARNING: yellow + format + reset, - logging.ERROR: red + format + reset, - logging.CRITICAL: bold_red + format + reset, + logging.DEBUG: cyan + format_with_line_info + reset, + logging.INFO: cyan + common_format + reset, + logging.WARNING: yellow + common_format + reset, + logging.ERROR: red + format_with_line_info + reset, + logging.CRITICAL: bold_red + format_with_line_info + reset, } def format(self, record): @@ -45,7 +46,7 @@ def create_logger() -> logging.Logger: # create console handler and set level to debug ch = logging.StreamHandler() - ch.setLevel(logging.WARNING) + ch.setLevel(logging.INFO) # define formatter ch.setFormatter(QEffFormatter()) diff --git a/QEfficient/utils/run_utils.py b/QEfficient/utils/run_utils.py index 598f24553..f30016a5a 100644 --- a/QEfficient/utils/run_utils.py +++ b/QEfficient/utils/run_utils.py @@ -58,7 +58,6 @@ def run_hf_model_on_pytorch(self, model_hf): Return: generated_ids: numpy.ndarray - output tokens """ - input_ids = self.tokenizer.encode(self.prompt[0], return_tensors="pt") input_ids_len = len(input_ids[0]) @@ -77,6 +76,7 @@ def run_hf_model_on_pytorch(self, model_hf): print("Completion:", repr(generated_text)) return generated_ids + def run_kv_model_on_pytorch(self, model, n_layer, padding_shape): """ Function responsible for running KV PyTorch model and return the output tokens diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 668a3b473..3095c7044 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -13,7 +13,16 @@ "id": "88eef7ea-3488-414c-9e36-e960abba30c9", "metadata": {}, "source": [ - "##### Download the OpenSource GPT2 based HuggingFace Model and Save in local *Cache* directory" + "##### Download the OpenSource GPT2 based HuggingFace Model and Save in local *Cache* directory\n", + "###### We Modify the GPT2 Classes using the Optimized Software Library to generate model for Cloud AI 100.\n", + "###### User can disable this optmization by passing `transfrom=False` in the `from_pretrained` call\n", + "###### Here we generate models with below Optimizations:\n", + "\n", + "* RMS Norm Fixes for FP16 Overflows and Underflow\n", + "* Causal Mask Fix\n", + "* Handling FP16 Overflows.\n", + "* KV Cache (Retention Changes).\n", + "* Triu/Tril Ops support." 
] }, { @@ -26,59 +35,19 @@ "# Initiate the Orignal Transformer model\n", "import os\n", "\n", - "from transformers import AutoTokenizer\n", - "from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel\n", - "\n", - "from QEfficient.utils import hf_download\n", - "from QEfficient.utils.constants import Constants\n", + "from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n", "\n", "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n", "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n", "\n", - "ROOT_DIR = os.path.dirname(os.path.abspath(\"\"))\n", + "# ROOT_DIR = os.path.dirname(os.path.abspath(\"\"))\n", + "# CACHE_DIR = os.path.join(ROOT_DIR, \"tmp\") #, you can use a different location for just one model by passing this param as cache_dir in below API.\n", "\n", "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n", "model_name = \"gpt2\" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n", "\n", - "model_hf_path = hf_download(\n", - " repo_id=model_name,\n", - " cache_dir=Constants.CACHE_DIR,\n", - " ignore_patterns=[\"*.txt\", \"*.onnx\", \"*.ot\", \"*.md\", \"*.tflite\", \"*.pdf\"],\n", - ")\n", - "model_hf = GPT2LMHeadModel.from_pretrained(model_hf_path, use_cache=True)\n", - "model_hf.eval()\n", - "print(f\"{model_name} from hugging-face \\n\", model_hf)" - ] - }, - { - "cell_type": "markdown", - "id": "a89dfa0a-d8fe-4472-bf00-55e563ae9058", - "metadata": {}, - "source": [ - "##### Now we Modify the GPT2 Classes using the Optimized Software Library to generate model for Cloud AI 100.\n", - "##### Here we generate models with below Optimizations:\n", - "\n", - "* RMS Norm Fixes for FP16 Overflows and Underflow\n", - "* Causal Mask Fix\n", - "* Handling FP16 Overflows.\n", - "* KV Cache (Retention Changes).\n", - "* Triu/Tril Ops support." 
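Condensed into one script, the notebook flow in this diff reads roughly as follows; this is a sketch of the cells shown here, not additional API surface.

import os

import QEfficient
from QEfficient import QEFFAutoModelForCausalLM
from QEfficient.generation.text_generation_inference import get_compilation_batch_size
from QEfficient.utils import load_hf_tokenizer

model_name = "gpt2"
qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)  # transformed for AI 100 by default
tokenizer = load_hf_tokenizer(model_name, use_cache=True)

base_path, onnx_path = QEfficient.export(
    model_name=model_name, model_kv=qeff_model, tokenizer=tokenizer,
    kv=True, form_factor="cloud", return_path=True,
)
generated_qpc_path = QEfficient.compile(
    onnx_path=onnx_path, qpc_path=os.path.dirname(base_path),
    num_cores=14, mxfp6=False, device_group=[0],
)
batch_size = get_compilation_batch_size(generated_qpc_path)
QEfficient.cloud_ai_100_exec_kv(
    batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path,
    device_id=[0], prompt=["My name is"],
)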
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a4543b94-9b50-4bcc-90c6-484ab694c9a6", - "metadata": {}, - "outputs": [], - "source": [ - "import QEfficient\n", - "\n", - "# Easy and minimal api to update the model\n", - "model_transformed = QEfficient.transform(model_hf, type=\"Transformers\", form_factor=\"cloud\")\n", - "\n", - "model_transformed.eval()\n", - "print(\"Model after Optimized transformations \\n\", model_transformed)" + "qeff_model = AutoModelForCausalLM.from_pretrained(model_name)\n", + "print(f\"{model_name} optmized for AI 100 \\n\", qeff_model)" ] }, { @@ -96,8 +65,8 @@ "metadata": {}, "outputs": [], "source": [ - "from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter\n", - "\n", + "import QEfficient\n", + "from QEfficient.utils import load_hf_tokenizer\n", "# We can now export the modified models to Onnx framework\n", "# This will generate single Onnx Model for both Prefill and Decode Variations which are optimized for\n", "# Cloud AI 100 Platform.\n", @@ -109,14 +78,14 @@ "# We can generate the KV Style models with the flag \"kv\"\n", "# Bertstyle models do not have any optimization w.r.t KV cache changes and are unoptimized version.\n", "# It is recommended to use kv=True for better performance.\n", - "tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side=\"left\")\n", - "base_path, onnx_path = qualcomm_efficient_converter(\n", - " model_kv=model_transformed,\n", + "tokenizer = load_hf_tokenizer(model_name, use_cache=True)\n", + "base_path, onnx_path = QEfficient.export(\n", " model_name=model_name,\n", + " model_kv=qeff_model,\n", + " tokenizer=tokenizer,\n", " kv=True,\n", " form_factor=\"cloud\",\n", " return_path=True,\n", - " tokenizer=tokenizer,\n", ")" ] }, @@ -136,13 +105,12 @@ "outputs": [], "source": [ "# Please use platform SDk to Check num_cores for your card.\n", - "from QEfficient.cloud.compile import main as compile\n", "\n", - "generated_qpc_path = compile(\n", + "generated_qpc_path = QEfficient.compile(\n", " onnx_path=onnx_path,\n", " num_cores=14,\n", - " qpc_path=base_path,\n", - " mxfp6=True,\n", + " qpc_path=os.path.dirname(base_path),\n", + " mxfp6=False,\n", " device_group=[0],\n", ")" ] @@ -162,12 +130,12 @@ "metadata": {}, "outputs": [], "source": [ - "from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size\n", + "from QEfficient.generation.text_generation_inference import get_compilation_batch_size\n", "\n", "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", - "batch_size = get_compilation_batch_size(generated_qpc_path)\n" - "cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")" + "batch_size = get_compilation_batch_size(generated_qpc_path)\n", + "QEfficient.cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])" ] } ], diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index 8533eedcc..15e84399a 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -13,7 +13,15 @@ "id": "88eef7ea-3488-414c-9e36-e960abba30c9", "metadata": {}, "source": [ - "##### Download the OpenSource MPT based HuggingFace 
Model and Save in local *Cache* directory" + "##### Download the OpenSource MPT based HuggingFace Model and Save in local *Cache* directory\n", + "###### Now we Modify the MPT Classes using the Optimized Software Library to generate model for Cloud AI 100.\n", + "###### Here we generate models with below Optimizations:\n", + "\n", + "* RMS Norm Fixes for FP16 Overflows and Underflow\n", + "* Causal Mask Fix\n", + "* Handling FP16 Overflows.\n", + "* KV Cache (Retention Changes).\n", + "* Triu/Tril Ops support." ] }, { @@ -26,58 +34,18 @@ "# Initiate the Orignal Transformer model\n", "import os\n", "\n", - "from transformers import AutoTokenizer\n", - "from transformers.models.mpt.modeling_mpt import MptForCausalLM\n", - "\n", - "from QEfficient.utils import hf_download\n", - "from QEfficient.utils.constants import Constants\n", + "from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n", "\n", "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n", "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n", "\n", - "ROOT_DIR = os.path.dirname(os.path.abspath(\"\"))\n", + "#ROOT_DIR = os.path.dirname(os.path.abspath(\"\"))\n", + "#CACHE_DIR = os.path.join(ROOT_DIR, \"tmp\"), you can use a different location for just one model by passing this param as cache_dir in below API.\n", "\n", "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n", "model_name = \"mosaicml/mpt-7b\" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n", - "model_hf_path = hf_download(\n", - " repo_id=model_name,\n", - " cache_dir=Constants.CACHE_DIR,\n", - " ignore_patterns=[\"*.txt\", \"*.onnx\", \"*.ot\", \"*.md\", \"*.tflite\", \"*.pdf\"],\n", - ")\n", - "model_hf = MptForCausalLM.from_pretrained(model_hf_path, use_cache=True)\n", - "model_hf.eval()\n", - "print(f\"{model_name} from hugging-face \\n\", model_hf)" - ] - }, - { - "cell_type": "markdown", - "id": "a89dfa0a-d8fe-4472-bf00-55e563ae9058", - "metadata": {}, - "source": [ - "##### Now we Modify the MPT Classes using the Optimized Software Library to generate model for Cloud AI 100.\n", - "##### Here we generate models with below Optimizations:\n", - "\n", - "* RMS Norm Fixes for FP16 Overflows and Underflow\n", - "* Causal Mask Fix\n", - "* Handling FP16 Overflows.\n", - "* KV Cache (Retention Changes).\n", - "* Triu/Tril Ops support." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a4543b94-9b50-4bcc-90c6-484ab694c9a6", - "metadata": {}, - "outputs": [], - "source": [ - "import QEfficient\n", - "\n", - "# Easy and minimal api to update the model\n", - "model_transformed = QEfficient.transform(model_hf, type=\"Transformers\", form_factor=\"cloud\")\n", - "\n", - "model_transformed.eval()\n", - "print(\"Model after Optimized transformations \\n\", model_transformed)" + "qeff_model = AutoModelForCausalLM.from_pretrained(model_name)\n", + "print(f\"{model_name} optmized for AI 100 \\n\", qeff_model)" ] }, { @@ -95,7 +63,8 @@ "metadata": {}, "outputs": [], "source": [ - "from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter\n", + "import QEfficient\n", + "from QEfficient.utils import load_hf_tokenizer\n", "\n", "# We have the utils to export the modified models to Onnx framework\n", "# This will generate single Onnx Model for both Prefill and Decode Variations which are optimized for\n", @@ -107,14 +76,14 @@ "\n", "# We can generate both bertstyle and KV Style models with the flag \"kv\"\n", "# Bertstyle models do not have any optimization w.r.t KV cache changes and are unoptimized version.\n", - "tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side=\"left\")\n", - "base_path, onnx_path = qualcomm_efficient_converter(\n", - " model_kv=model_transformed,\n", + "tokenizer = load_hf_tokenizer(model_name, use_cache=True, padding_side=\"left\")\n", + "base_path, onnx_path = QEfficient.export(\n", " model_name=model_name,\n", + " model_kv=qeff_model,\n", + " tokenizer=tokenizer,\n", " kv=True,\n", " form_factor=\"cloud\",\n", " return_path=True,\n", - " tokenizer=tokenizer,\n", ")" ] }, @@ -134,12 +103,11 @@ "outputs": [], "source": [ "# Please use platform SDk to Check num_cores for your card.\n", - "from QEfficient.cloud.compile import main as compile\n", "\n", - "generated_qpc_path = compile(\n", + "generated_qpc_path = QEfficient.compile(\n", " onnx_path=onnx_path,\n", " num_cores=14,\n", - " qpc_path=base_path,\n", + " qpc_path=os.path.dirname(base_path),\n", " mxfp6=True,\n", " device_group=[0],\n", ")" @@ -160,13 +128,13 @@ "metadata": {}, "outputs": [], "source": [ - "from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size\n", + "from QEfficient.generation.text_generation_inference import get_compilation_batch_size\n", "\n", "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "\n", - "batch_size = get_compilation_batch_size(generated_qpc_path)" - "cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")" + "batch_size = get_compilation_batch_size(generated_qpc_path)\n", + "QEfficient.cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])" ] } ], diff --git a/tests/test_loader.py b/tests/test_loader.py new file mode 100644 index 000000000..5c626361b --- /dev/null +++ b/tests/test_loader.py @@ -0,0 +1,35 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
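The parametrised loader test below is driven by a dict of model cards; extending coverage to another supported architecture would be one more entry, for example (hypothetical addition inside tests/test_loader.py, not part of this diff):

# Hypothetical extra entry for model_name_to_params_dict in tests/test_loader.py
from transformers.models.mpt.modeling_mpt import MptForCausalLM

from QEfficient import QEFFAutoModelForCausalLM

model_name_to_params_dict["mosaicml/mpt-7b"] = {
    "qeff_class": QEFFAutoModelForCausalLM,
    "hf_class": MptForCausalLM,
    "prompt": "Equator is",
}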
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +from typing import Any, Dict + +import pytest +from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel + +import QEfficient +from QEfficient import QEFFAutoModelForCausalLM, QEFFCommonLoader + +model_name_to_params_dict : Dict[str, Dict[str, Any]] = { + "gpt2": { + "qeff_class": QEFFAutoModelForCausalLM, + "hf_class": GPT2LMHeadModel, + "prompt": "Equator is" + }, + +} +model_names = model_name_to_params_dict.keys() + +#FIXME: Add test cases for passing cache_dir, pretrained_model_path instead of card name, etc., Passing other kwargs +@pytest.mark.parametrize("model_name", model_names) +def test_qeff_auto_model_for_causal_lm(model_name: str): + model = QEFFCommonLoader.from_pretrained(model_name) + assert isinstance(model, model_name_to_params_dict[model_name]['qeff_class']) + assert isinstance(model.model, model_name_to_params_dict[model_name]['hf_class']) # type: ignore + + # Run transform + QEfficient.transform(model) + print(model) diff --git a/tests/utils.py b/tests/utils.py index f8fd7566e..ace803f8f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -10,12 +10,11 @@ import shutil import unittest -import transformers - -import QEfficient +from QEfficient import QEFFAutoModelForCausalLM +from QEfficient.compile.compile_helper import compile_kv_model_on_cloud_ai_100 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100 -from QEfficient.utils import hf_download +from QEfficient.transformers.transform import transform_lm +from QEfficient.utils import hf_download, load_hf_tokenizer from QEfficient.utils.constants import QEFF_MODELS_DIR, ROOT_DIR, Constants from QEfficient.utils.device_utils import get_available_device_id, is_multi_qranium_setup_available, is_qpc_size_gt_32gb from QEfficient.utils.run_utils import ApiRunner @@ -67,10 +66,7 @@ def get_tokenizer(model_name): :param model_name: str :return tokenizer """ - model_hf_path = hf_download(repo_id=model_name, allow_patterns=["*.json"]) - tokenizer = transformers.AutoTokenizer.from_pretrained(model_hf_path, padding_side="left") - if tokenizer.pad_token_id is None: - tokenizer.pad_token_id = tokenizer.eos_token_id + tokenizer = load_hf_tokenizer(model_name=model_name) return tokenizer @@ -98,7 +94,7 @@ def transform_pt_model_with_qeff(model_hf): :param model_hf: pytorch model :return model_kv """ - model_kv = QEfficient.transform(model_hf, type="Transformers", form_factor="cloud") + model_kv = transform_lm(model_hf) model_kv.eval() return model_kv @@ -113,8 +109,7 @@ def export_onnx(model_kv, tokenizer, model_name, model_class): onnx_dir_path = os.path.join(QEFF_MODELS_DIR, model_name) base_path, onnx_model_path = qualcomm_efficient_converter( model_name=model_name, - model_class=model_class, - model_kv=model_kv, + model_kv=QEFFAutoModelForCausalLM(model=model_kv), # type: ignore tokenizer=tokenizer, onnx_dir_path=onnx_dir_path, kv=True, @@ -159,14 +154,13 @@ def set_up(model_config, device_group=[0]): model_config["model_name"], model_config["model_class"], ) - try: - ort_tokens = api_runner.run_kv_model_on_ort( - onnx_model_path, - model_config["n_layer"], - model_config["padding_shape"], - ) - except Exception as e: - print(f"ONNX Model run on onnxrt failed due to : {e}") + + ort_tokens = api_runner.run_kv_model_on_ort( + onnx_model_path, + model_config["n_layer"], + 
model_config["padding_shape"], + ) + setup_info = {} setup_info["model_config"] = model_config