diff --git a/LICENSE b/LICENSE index ffde93d75..03e0408a6 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved. +Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted (subject to the limitations in the diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 8ab26734a..742163e5e 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/base/__init__.py b/QEfficient/base/__init__.py index 5fb59abd0..257051d97 100644 --- a/QEfficient/base/__init__.py +++ b/QEfficient/base/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/cloud/compile.py b/QEfficient/cloud/compile.py index adfafbb30..450e5618e 100644 --- a/QEfficient/cloud/compile.py +++ b/QEfficient/cloud/compile.py @@ -55,7 +55,7 @@ "--device-group", required=True, type=lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")], - help="Cloud AI 100 device ids (comma-separated) e.g. [0] ", + help="Cloud AI 100 device ids (comma-separated) e.g. [0,1] ", ) parser.add_argument( "--aic_enable_depth_first", diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index bd22830c0..c6145dfd5 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -25,18 +25,23 @@ def main( hf_token: Optional[str] = None, ) -> None: """ - Helper function used by execute CLI app to run the Model on Cloud AI 100 Platform. - --------- + Helper function used by execute CLI app to run the model on ``Cloud AI 100`` Platform. - :model_name: str. Hugging Face Model Card name, Example: "gpt2" - :qpc_path: str. Path to the generated binary after compilation. - :device_group: List[int]. Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled. - :local_model_dir: str. Path to custom model weights and config files. - :prompt: str. Sample prompt for the model text generation - :prompts_txt_file_path: str. Path to txt file for multiple input prompts - :generation_len: int. Number of tokens to be generated. - :cache_dir: str. Cache dir where downloaded HuggingFace files are stored. - :hf_token: str. HuggingFace login token to access private repos. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2``. + :qpc_path (str): Path to the generated binary after compilation. + :device_group (List[int]): Device Ids to be used for compilation. If ``len(device_group) > 1``, multiple card setup is enabled. + ``Optional`` Args: + :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` + :prompt (str): Sample prompt for the model text generation.
``Defaults to None.`` + :prompts_txt_file_path (str): Path to txt file for multiple input prompts. ``Defaults to None.`` + :generation_len (int): Number of tokens to be generated. ``Defaults to None.`` + :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to Constants.CACHE_DIR.`` + :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` + + .. code-block:: bash + + python -m QEfficient.cloud.execute OPTIONS """ tokenizer = load_hf_tokenizer( pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index c7dee0e2b..47b62b92a 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -28,13 +28,14 @@ def get_onnx_model_path( ): """ exports the model to onnx if pre-exported file is not found and returns onnx_model_path - --------- - :model_name: str. Hugging Face Model Card name, Example: "gpt2" - :cache_dir: str. Cache dir where downloaded HuggingFace files are stored. - :tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]. Pass model tokenizer. - :hf_token: str. HuggingFace login token to access private repos. - :local_model_dir: str. Path to custom model weights and config files. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2``. + ``Optional`` Args: + :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` + :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Pass model tokenizer. ``Defaults to None.`` + :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` + :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` """ onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name) if onnx_path_exists: @@ -67,12 +68,19 @@ def main( ) -> None: """ Helper function used by export CLI app for exporting to ONNX Model. - --------- - :model_name: str. Hugging Face Model Card name, Example: gpt2 - :cache_dir: str. Cache dir to store the downloaded HuggingFace files. - :hf_token: str. HuggingFace login token to access private repos. - :local_model_dir: str. Path to custom model weights and config files. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2``. + + ``Optional`` Args: + :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` + :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` + :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` + + .. code-block:: bash + + python -m QEfficient.cloud.export OPTIONS + """ cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir) get_onnx_model_path(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token, local_model_dir=local_model_dir) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index f33dcc285..44d93933f 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -36,29 +36,29 @@ def main( hf_token: Optional[str] = None, ) -> None: """ - Helper function used by infer CLI app; to export, compile and execute the model on Cloud AI 100 Platform. - 1. Check if compiled qpc for given config already exists, if it does jump to execute, else - 2. Check if exported ONNX file already exists, if true, jump to compilation -> execution, else - 3. 
Check if HF model exists in cache, if true, start transform -> export -> compilation -> execution, else, - 4. Download HF model -> transform -> export -> compile -> execute - --------- + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + :num_cores (int): Number of cores to compile model on. + :device_group (List[int]): Device Ids to be used for compilation. If ``len(device_group) > 1``, multiple card setup is enabled. + ``Optional`` Args: + :prompt (str): Sample prompt for the model text generation. ``Defaults to None.`` + :prompts_txt_file_path (str): Path to txt file for multiple input prompts. ``Defaults to None.`` + :aic_enable_depth_first (bool): Enables ``DFS`` with default memory size. ``Defaults to False.`` + :mos (int): Effort level to reduce the on-chip memory. ``Defaults to -1.`` + :batch_size (int): Batch size to compile the model for. ``Defaults to 1.`` + :prompt_len (int): Prompt length for the model to compile. ``Defaults to 32.`` + :ctx_len (int): Maximum context length to compile the model. ``Defaults to 128.`` + :generation_len (int): Number of tokens to be generated. ``Defaults to None.`` + :mxfp6 (bool): Enable compilation for MXFP6 precision. ``Defaults to False.`` + :mxint8 (bool): Compress Present/Past KV to ``MXINT8`` using ``CustomIO`` config. ``Defaults to False.`` + :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` + :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` + :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` + + .. code-block:: bash + + python -m QEfficient.cloud.infer OPTIONS - :model_name: str. Hugging Face Model Card name, Example: "gpt2" - :num_cores: int. :num_cores: int. Number of cores to compile model on. - :device_group: List[int]. Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled. - :prompt: str. Sample prompt for the model text generation - :prompts_txt_file_path: str. Path to txt file for multiple input prompts - :aic_enable_depth_first: bool. Enables DFS with default memory size, disabled by default. - :mos: int. Effort level to reduce the on-chip memory. - :batch_size: int. Batch size to compile the model for. - :prompt_len: int. prompt length for the model to compile. - :ctx_len: int. Maximum context length to compile the model. - :generation_len: int. Number of tokens to be generated. - :mxfp6: bool. Enable compilation for MXFP6 precision - :mxint8: Compress Present/Past KV to MXINT8 using CustomIO config, default is False. - :local_model_dir: str. Path to custom model weights and config files. - :cache_dir: str. Cache dir where downloaded HuggingFace files are stored. - :hf_token: str. HuggingFace login token to access private repos. """ cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir) tokenizer = load_hf_tokenizer( diff --git a/QEfficient/compile/compile_helper.py b/QEfficient/compile/compile_helper.py index c2ed0106f..bcd4c7feb 100644 --- a/QEfficient/compile/compile_helper.py +++ b/QEfficient/compile/compile_helper.py @@ -113,21 +113,28 @@ def compile( **kwargs, ) -> str: """ - Helper function used by compile CLI app for compiling the Onnx Model on Cloud AI 100 Platform with given config. - --------- - - :onnx_path: str. Generated Onnx Model Path. - :qpc_path: str. Path for saving compiled qpc binaries. - :num_cores: int. Number of cores to compile model on. - :device_group: List[int].
Used for finding number of devices to compile for. - :aic_enable_depth_first: bool. Enables DFS with default memory size, disabled by default. - :mos: int. Effort level to reduce the on-chip memory. - :batch_size: int. Batch size to compile the model for. - :prompt_len: int. prompt len for the model to compile. - :ctx_len: int. Maximum context length to compile the model. - :mxfp6: bool. Enable compilation for MXFP6 precision - :mxint8: Compress Present/Past KV to MXINT8 using CustomIO config, default is False. - :custom_io_file_path: str. Path to custom IO file. + Compiles the given ``ONNX`` model using Cloud AI 100 platform SDK compiler and saves the compiled ``qpc`` package at ``qpc_path``. + Generates tensor-slicing configuration if multiple devices are passed in ``device_group``. + + This function will be deprecated soon and will be replaced by ``QEFFAutoModelForCausalLM.compile``. + + ``Mandatory`` Args: + :onnx_path (str): Generated ``ONNX`` Model Path. + :qpc_path (str): Path for saving compiled qpc binaries. + :num_cores (int): Number of cores to compile the model on. + :device_group (List[int]): Used for finding the number of devices to compile for. + ``Optional`` Args: + :aic_enable_depth_first (bool): Enables ``DFS`` with default memory size. ``Defaults to False.`` + :mos (int): Effort level to reduce the on-chip memory. ``Defaults to -1.`` + :batch_size (int): Batch size to compile the model for. ``Defaults to 1.`` + :prompt_len (int): Prompt length for the model to compile. ``Defaults to 32.`` + :ctx_len (int): Maximum context length to compile the model. ``Defaults to 128.`` + :mxfp6 (bool): Enable compilation for ``MXFP6`` precision. ``Defaults to True.`` + :mxint8 (bool): Compress Present/Past KV to ``MXINT8`` using ``CustomIO`` config. ``Defaults to False.`` + :custom_io_file_path (str): Path to ``customIO`` file (formatted as a string). ``Defaults to None.`` + + Returns: + :str: Path to compiled ``qpc`` package. """ os.makedirs(qpc_path, exist_ok=True) specialization_json_path = os.path.join(qpc_path, "specializations.json") diff --git a/QEfficient/customop/__init__.py b/QEfficient/customop/__init__.py index 9bfd08998..5b0e1ff02 100644 --- a/QEfficient/customop/__init__.py +++ b/QEfficient/customop/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/customop/rms_norm.py b/QEfficient/customop/rms_norm.py index 210cca686..4cb1df71a 100644 --- a/QEfficient/customop/rms_norm.py +++ b/QEfficient/customop/rms_norm.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
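A minimal sketch of driving the ``compile`` helper documented above, assuming a Cloud AI 100 SDK installation and one available device; the paths come from ``QEfficient.export``, mirroring the usage example that appears later in this patch:

.. code-block:: python

    import os

    import QEfficient

    # Export first: returns the base directory and the generated ONNX model path.
    base_path, onnx_model_path = QEfficient.export(model_name="gpt2")

    # Compile for a single device; passing more ids in device_group generates
    # a tensor-slicing configuration across cards.
    qpc_path = QEfficient.compile(
        onnx_path=onnx_model_path,
        qpc_path=os.path.join(base_path, "qpc"),
        num_cores=14,
        device_group=[0],
    )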
# SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/exporter/__init__.py b/QEfficient/exporter/__init__.py index 91fee0a49..da26921c5 100644 --- a/QEfficient/exporter/__init__.py +++ b/QEfficient/exporter/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index d775881a8..dda7487a3 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -32,18 +32,21 @@ def convert_to_cloud_bertstyle( seq_len: int, ) -> str: """ - Function to convert the model to Bertstyle approach. + API to convert a model to the Bertstyle approach. Bertstyle Approach: - 1. No Prefill/Decode separably compiled - 2. No KV retention logic. - 3. KV is every time computed for all the tokens until EOS/max_length - - Args: - model_name (str): The name of the model to be used. - qeff_model (QEFFBaseModel): Transformed KV torch model to be used - tokenizer (HF AutoTokenizer): Tokenizer to prepare inputs. - onnx_dir_path (str, optional): The path where the model is stored. If None, the model is loaded from the default location. - seq_len (int, optional): The length of the sequence. Default is 128. + 1. No Prefill/Decode compiled separately. + 2. No KV retention logic. + 3. KV is computed every time for all the tokens until EOS/max_length. + + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2``. + :qeff_model (QEFFAutoModelForCausalLM): Transformed KV torch model to be used. + :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Model tokenizer. + :onnx_dir_path (str): Path to save exported ONNX file. + :seq_len (int): The length of the sequence. + + Returns: + :str: Path of exported ``ONNX`` file. """ if os.path.exists(onnx_dir_path): logger.warning(f"Overriding {onnx_dir_path}") @@ -147,19 +150,22 @@ def convert_to_cloud_kvstyle( seq_len: int, ) -> str: """ - Function Modeling changes for kv retention and export to Onnx. - KV Style Approach: - 1. This architecture is particularly suitable for autoregressive tasks - 2. where sequence generation involves processing one token at a time + API to convert a model with KV retention and export it to ONNX. + KV Style Approach: + 1. This architecture is particularly suitable for auto-regressive tasks, + 2. where sequence generation involves processing one token at a time, 3. And contextual information from earlier tokens is crucial for predicting the next token. 4. The inclusion of a kV cache enhances the efficiency of the decoding process, making it more computationally efficient. - Args: - model_name (str): The name of the model to be used. - qeff_model (QEFFBaseModel): Transformed KV torch model to be used - tokenizer (HF AutoTokenizer): Tokenzier to prepare inputs. - onnx_dir_path (str, optional): The path where the model is stored. If None, the model is loaded from the default location. - seq_len (int, optional): The length of the sequence. Default is 128. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2``.
+ :qeff_model (QEFFAutoModelForCausalLM): Transformed KV torch model to be used. + :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Model tokenizer. + :onnx_dir_path (str): Path to save exported ONNX file. + :seq_len (int): The length of the sequence. + + Returns: + :str: Path of exported ``ONNX`` file. """ warnings.warn( "\033[93mThis function will be deprecated soon, use QEfficient.export instead\033[0m", @@ -369,22 +375,35 @@ def qualcomm_efficient_converter( form_factor: str = "cloud", ) -> Tuple[str, str]: """ - Function to convert the input string using the specified model and returns the result. - - Args: - model_name (str): The name of the model to be used. - model_kv (torch.nn.Module): Transformed KV torch model to be used - local_model_dir(str): Path to custom model weights and config files - tokenizer (HF AutoTokenizer): Tokenzier to prepare inputs. - cache_dir (str): Path to cache dir if not specified, default HF cache_dir will be used. - onnx_dir_path (str, optional): The path where the model is stored. If None, the model is loaded from the default location. - hf_token (bool): If True, an authentication token will be used. Default is False. - seq_len (int, optional): The length of the sequence. Default is 128. - kv (bool): If True, key-value pairs will be used. Default is True. - form_factor (str): form_factor of the hardware, currently only accepts "cloud". + This method is an alias for ``QEfficient.export``. + + Usage 1: This method can be used by passing ``model_name`` and ``local_model_dir`` or ``cache_dir`` if required for loading from a local dir. + This will download the model from ``HuggingFace``, export it to an ``ONNX`` graph, and return the generated file paths; see below. + + Usage 2: You can pass ``model_name`` and ``model_kv`` as an object of ``QEfficient.QEFFAutoModelForCausalLM``; in this case, it will directly export the ``model_kv.model`` to ``ONNX``. + + This function will be deprecated soon and replaced by ``QEffAutoModelForCausalLM.export``. + + ``Mandatory`` Args: + :model_name (str): The name of the model to be used. + ``Optional`` Args: + :model_kv (torch.nn.Module): Transformed ``KV torch model`` to be used. ``Defaults to None``. + :local_model_dir (str): Path of local model. ``Defaults to None``. + :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Model tokenizer. ``Defaults to None``. + :cache_dir (str): Path of the ``cache`` directory. ``Defaults to None``. + :onnx_dir_path (str): Path to store ``ONNX`` file. ``Defaults to None``. + :hf_token (str): HuggingFace token to access gated models. ``Defaults to None``. + :seq_len (int): The length of the sequence. ``Defaults to 128``. + :kv (bool): If False, it will export in Bert style. ``Defaults to True``. + :form_factor (str): Form factor of the hardware, currently only ``cloud`` is accepted. ``Defaults to cloud``. Returns: - None, if automation is False, else path to exported Onnx file + :Tuple[str, str]: Path to Base ``ONNX`` dir and path to generated ``ONNX`` model. + + ..
code-block:: python + + import QEfficient + base_path, onnx_model_path = QEfficient.export(model_name="gpt2") """ warnings.warn( @@ -392,6 +411,7 @@ def qualcomm_efficient_converter( DeprecationWarning, stacklevel=2, ) + # Get model_kv first model_kv = ( model_kv diff --git a/QEfficient/exporter/export_utils.py b/QEfficient/exporter/export_utils.py index 8c33bc6ca..75ee08a89 100644 --- a/QEfficient/exporter/export_utils.py +++ b/QEfficient/exporter/export_utils.py @@ -27,6 +27,19 @@ def export_onnx( gen_models_path: str, model_base_name: str, ) -> str: + """ + API to export a PyTorch model to ONNX. + + Args: + :pt_model (torch.nn.Module): PyTorch model that will be exported to ``ONNX`` format. + :inputs (Dict[str, torch.Tensor]): Processed torch input for the model. + :output_names (List[str]): Output names of PyTorch model inference. + :gen_models_path (str): Path of generated ``ONNX`` model. + :model_base_name (str): Base name for the exported ``ONNX`` file. + + Return: + :str: Updated base name of exported ``ONNX`` model. + """ # Inspect the model's forward method arguments pt_model_code = pt_model.forward.__code__ pt_input_names = pt_model_code.co_varnames[1 : pt_model_code.co_argcount] @@ -124,6 +137,17 @@ def export_onnx( def save_onnx(model: Union[onnx.ModelProto, str], gen_models_path: str, model_base_name: str) -> str: + """ + API to save an ONNX model and its data separately if the size of the ``ONNX`` model is greater than 2GB. + + Args: + :model (Union[onnx.ModelProto, str]): Pass ``ONNX`` model or path of the model. + :gen_models_path (str): Path of generated ``ONNX`` model. + :model_base_name (str): Base name of the HuggingFace model. + + Return: + :str: Base name of ``ONNX`` exported model. + """ if isinstance(model, str): model = onnx.load(f"{gen_models_path}/{model}.onnx") @@ -149,20 +173,21 @@ def save_onnx(model: Union[onnx.ModelProto, str], gen_models_path: str, model_ba return model_base_name -def remove_temp_file(file_path_model, file_path_weights): +def remove_temp_file(model_file_path: str, weights_file_path: str): """ - Function to remove a temporary file + API to remove temporary files. - :param str file_path: Path to the file to be deleted - :file_path_weights: Path to the weights file + Args: + :model_file_path (str): Path to the file to be deleted. + :weights_file_path (str): Path to the weights file. """ try: - os.remove(file_path_model) - os.remove(file_path_weights) + os.remove(model_file_path) + os.remove(weights_file_path) except FileNotFoundError: - print(f"File '{file_path_model}' does not exist.") + print(f"File '{model_file_path}' does not exist.") except Exception as e: - print(f"Error deleting file '{file_path_model}': {e}") + print(f"Error deleting file '{model_file_path}': {e}") def fix_onnx_fp16( @@ -173,6 +198,20 @@ def fix_onnx_fp16( model_base_name: str, pt_outputs: Dict[str, torch.Tensor], ) -> str: + """ + API to clip model weights to the fp16 range and save the updated clipped ``ONNX`` model. + + Args: + :inputs (Dict[str, torch.Tensor]): Processed torch input for the model. + :output_names (List[str]): Output names of pytorch model inference. + :ort_outputs (List[np.ndarray]): Output of onnxruntime. + :gen_models_path (str): Path of generated ``ONNX`` model. + :model_base_name (str): Base name for the exported ONNX model. + :pt_outputs (Dict[str, torch.Tensor]): Output of PyTorch model inference. + + Return: + :str: Updated base name of exported ONNX model.
+ """ model = onnx.load(os.path.join(gen_models_path, f"{model_base_name}.onnx")) # TODO: Remove this `fix_onnx_fp16` function and replace with this transform # as we're not utilizing the validations done in this function @@ -227,6 +266,15 @@ def generate_input_files( inputs: Dict[str, torch.Tensor], input_list_file: str, ): + """ + API to generate input files, required for Cloud AI 100 execution. + + Args: + :input_files_path (str): Path to save input files. + :input_names (List[str]): Names of inputs to be saved. + :inputs (dict[str, torch.tensor]): Input tensors to be saved in raw format. + :input_list_file (str): File name to save the names of inputs in order. Example - "input_list.txt" + """ # inputFiles os.makedirs(input_files_path, exist_ok=True) filenames = [] @@ -252,6 +300,19 @@ def run_model_on_ort( pt_outputs: Dict[str, torch.Tensor], dtype: bool = True, ) -> Tuple[List[str], List[np.ndarray]]: + """ + API to run ONNX model on ONNX runtime + + Args: + :onnx_path (str): Path of ONNX model. + :inputs (Dict[str, torch.Tensor]): Processed torch input for the model. + :output_names (List[str]): Output from pytorch inference. + :pt_outputs (Dict[str, torch.Tensor]): Output of PyTorch model inference. + :dtype (bool): If False it will consider you are passing clipped version of ``ONNX`` model. + + Return: + :Tuple[List[str], List[np.ndarray]]: input_names + """ try: if dtype: info_string = "fp32" diff --git a/QEfficient/generation/__init__.py b/QEfficient/generation/__init__.py index 91fee0a49..da26921c5 100644 --- a/QEfficient/generation/__init__.py +++ b/QEfficient/generation/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/generation/cloud_infer.py b/QEfficient/generation/cloud_infer.py index 558f9164a..aac3d60d7 100644 --- a/QEfficient/generation/cloud_infer.py +++ b/QEfficient/generation/cloud_infer.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- @@ -48,6 +48,15 @@ def __init__( activate: bool = True, enable_debug_logs: bool = False, ): + """ + Initialise for QAIC inference Session + --------- + + :qpc_path: str. Path to the save generated binary file after compilation. + :device_ids: List[int]. Device Ids to be used for compilation. if devices > 1, it enables multiple card setup. + :activate: bool. If false, activation will be disabled. Default=True. + :enable_debug_logs: bool. If True, It will enable debug logs. Default=False. 
+ """ # Load QPC devices = qaicrt.QIDList(device_ids) self.context = qaicrt.Context(devices) @@ -92,14 +101,25 @@ def output_names(self) -> List[str]: return [binding.name for binding in self.bindings if binding.dir == aicapi.BUFFER_IO_TYPE_OUTPUT] def activate(self): + """Activate qpc""" + self.program.activate() self.execObj = qaicrt.ExecObj(self.context, self.program) def deactivate(self): + """Deactivate qpc""" + del self.execObj self.program.deactivate() def set_buffers(self, buffers: Dict[str, np.ndarray]): + """ + Provide buffer mapping for input and output + + Args: + :buffer (Dict[str, np.ndarray]): Parameter for buffer mapping. + """ + for buffer_name, buffer in buffers.items(): if buffer_name not in self.binding_index_map: warn(f'Buffer: "{buffer_name}" not found') @@ -112,9 +132,25 @@ def set_buffers(self, buffers: Dict[str, np.ndarray]): ) def skip_buffers(self, skipped_buffer_names: List[str]): + """ + skip buffer mapping for given list of buffer names + + Args: + :skipped_buffer_name: List[str]. List of buffer name to be skipped. + """ + self.set_buffers({k: np.array([]) for k in skipped_buffer_names}) def run(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: + """ + Execute on cloud AI 100 + + Args: + :inputs (Dict[str, np.ndarray]): Processed numpy inputs for the model. + + Return: + :Dict[str, np.ndarray]: + """ # Set inputs self.set_buffers(inputs) assert self.execObj.setData(self.qbuffers, self.buf_dims) == qaicrt.QStatus.QS_SUCCESS, "Failed to setData" diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 845f4fb87..d3cd87247 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -23,14 +23,16 @@ @dataclass class CloudAI100ExecInfo: """ - holds all the information about Cloud AI 100 execution - :batch_size: int - :generated_texts: Union[List[List[str]], List[str]] - :generated_ids: Union[List[np.ndarray], np.ndarray] - :prefill_time: float - :decode_perf: float - :total_perf: float - :total_time: float + Holds all the information about Cloud AI 100 execution + + Args: + :batch_size (int): Batch size of the QPC compilation. + :generated_texts (Union[List[List[str]], List[str]]): Generated text(s). + :generated_ids (Union[List[np.ndarray], np.ndarray]): Generated IDs. + :prefill_time (float): Time for prefilling. + :decode_perf (float): Decoding performance. + :total_perf (float): Total performance. + :total_time (float): Total time. """ batch_size: int @@ -99,6 +101,16 @@ def latency_stats_bertstyle( prompt: str, device_id: List[int] = [0], ): + """ + Function to execute Bertstyle ONNX model on Cloud AI 100. + + Args: + :model_name (str): Hugging Face Model Card name, Example: gpt2. + :qpc_path (str): Path to save generated binary file after compilation. + :seq_len (int): Sequence length. + :prompt (str): Sample prompt for the model text generation. + :device_id (List[int]): Device Ids to be used for compilation. If devices > 1, it enables multiple card setup. + """ session = QAICInferenceSession(qpc_path, device_id) tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, padding_side="left") padding_check_and_fix(tokenizer) # Check and fix tokenizer viability @@ -189,6 +201,22 @@ def cloud_ai_100_exec_kv_helper( stream: bool = True, write_io_dir: Optional[str] = None, ): + """ + Helper function to execute QEfficient transformed ONNX model on ``Cloud AI 100`` using compiled QPC file. 
+ + ``Mandatory`` Args: + :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Model tokenizer. + :qpc_path (str): Path to the saved generated binary file after compilation. + :prompt (str): Sample prompt for the model text generation. + :ctx_len (int): Input length of the prompt to determine the number of chunks to execute on ``Cloud AI 100``. + ``Optional`` Args: + :generation_len (int): Number of tokens to be generated. ``Defaults to None``. + :device_id (List[int]): Device IDs to be used for compilation. If ``len(device_id) > 1``, it enables multiple card setup. ``Defaults to [0]``. + :enable_debug_logs (bool): If True, it enables debugging logs. ``Defaults to False``. + :stream (bool): If True, enable streamer, which returns tokens one by one as the model generates them. ``Defaults to True``. + :write_io_dir (str): Path to write the input and output files. ``Defaults to None``. + """ + if tokenizer.padding_side != "right": logger.warning("Please use padding_side='right' while initializing the tokenizer") tokenizer.padding_side = "right" @@ -321,6 +349,39 @@ def cloud_ai_100_exec_kv( write_io_dir: Optional[str] = None, automation=False, ): + """ + This method generates output until ``eos`` or ``generation_len`` by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. + This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed. + If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped. + + ``Mandatory`` Args: + :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Model tokenizer. + :qpc_path (str): Path to the saved generated binary file after compilation. + + ``Optional`` Args: + :prompt (str): Sample prompt for the model text generation. ``Defaults to None``. + :prompts_txt_file_path (str): Path of the prompt text file. ``Defaults to None``. + :generation_len (int): Number of tokens to be generated. ``Defaults to None``. + :device_id (List[int]): Device IDs to be used for compilation. If ``len(device_id) > 1``, it enables multiple card setup. ``Defaults to [0]``. + :enable_debug_logs (bool): If True, it enables debugging logs. ``Defaults to False``. + :stream (bool): If True, enable streamer, which returns tokens one by one as the model generates them. ``Defaults to True``. + :write_io_dir (str): Path to write the input and output files. ``Defaults to None``. + :automation (bool): If True, it prints input, output, and performance stats. ``Defaults to False``. + + Returns: + :CloudAI100ExecInfo: Object holding execution output and performance details. + + ..
code-block:: python + + import os + import transformers + import QEfficient + base_path, onnx_model_path = QEfficient.export(model_name="gpt2") + qpc_path = QEfficient.compile(onnx_path=onnx_model_path, qpc_path=os.path.join(base_path, "qpc"), num_cores=14, device_group=[0]) + tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2") + execinfo = QEfficient.cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=qpc_path, prompt="Hi there!!", device_id=[0]) + + """ batch_size, ctx_len = get_compilation_dims(qpc_path) prompt: List[str] = get_input_prompts(prompt, prompts_txt_file_path) prompt = fix_prompts(prompt, batch_size) diff --git a/QEfficient/transformers/__init__.py b/QEfficient/transformers/__init__.py index 91fee0a49..da26921c5 100644 --- a/QEfficient/transformers/__init__.py +++ b/QEfficient/transformers/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py index fe70bbab0..0d007e4e6 100644 --- a/QEfficient/transformers/cache_utils.py +++ b/QEfficient/transformers/cache_utils.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/codegen/__init__.py b/QEfficient/transformers/models/codegen/__init__.py index 91fee0a49..da26921c5 100644 --- a/QEfficient/transformers/models/codegen/__init__.py +++ b/QEfficient/transformers/models/codegen/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/falcon/__init__.py b/QEfficient/transformers/models/falcon/__init__.py index cb1682a79..d259e435a 100644 --- a/QEfficient/transformers/models/falcon/__init__.py +++ b/QEfficient/transformers/models/falcon/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/gpt2/__init__.py b/QEfficient/transformers/models/gpt2/__init__.py index 91fee0a49..da26921c5 100644 --- a/QEfficient/transformers/models/gpt2/__init__.py +++ b/QEfficient/transformers/models/gpt2/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved.
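Following the ``cloud_ai_100_exec_kv`` example above, a short sketch of inspecting the returned ``CloudAI100ExecInfo`` object; the field names are taken from the dataclass documented earlier in this patch:

.. code-block:: python

    # execinfo comes from the cloud_ai_100_exec_kv call shown above.
    print(execinfo.generated_texts)  # decoded generations per prompt
    print(execinfo.prefill_time)     # time spent in the prefill stage
    print(execinfo.decode_perf)      # decode-stage performance
    print(execinfo.total_perf)       # overall performance
    print(execinfo.total_time)       # end-to-end execution time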
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/gptj/__init__.py b/QEfficient/transformers/models/gptj/__init__.py index 91fee0a49..da26921c5 100644 --- a/QEfficient/transformers/models/gptj/__init__.py +++ b/QEfficient/transformers/models/gptj/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/llama/__init__.py b/QEfficient/transformers/models/llama/__init__.py index 91fee0a49..da26921c5 100644 --- a/QEfficient/transformers/models/llama/__init__.py +++ b/QEfficient/transformers/models/llama/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/llama/modeling_llama.py b/QEfficient/transformers/models/llama/modeling_llama.py index f3e068b38..bce922f40 100644 --- a/QEfficient/transformers/models/llama/modeling_llama.py +++ b/QEfficient/transformers/models/llama/modeling_llama.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/mistral/__init__.py b/QEfficient/transformers/models/mistral/__init__.py index 91fee0a49..da26921c5 100644 --- a/QEfficient/transformers/models/mistral/__init__.py +++ b/QEfficient/transformers/models/mistral/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/mistral/modeling_mistral.py b/QEfficient/transformers/models/mistral/modeling_mistral.py index d703ea3f2..d2a778500 100644 --- a/QEfficient/transformers/models/mistral/modeling_mistral.py +++ b/QEfficient/transformers/models/mistral/modeling_mistral.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
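Stepping back to the ``QAICInferenceSession`` API documented in ``cloud_infer.py`` above, a minimal sketch of one raw inference round trip; the QPC path, input name, and shape are assumptions that depend on the compiled model:

.. code-block:: python

    import numpy as np

    from QEfficient.generation.cloud_infer import QAICInferenceSession

    # Open a session on device 0 for a previously compiled QPC (illustrative
    # path); the program is activated by default.
    session = QAICInferenceSession("qeff_models/gpt2/qpc", device_ids=[0])

    # Run one step; keys must match the compiled model's input bindings.
    outputs = session.run({"input_ids": np.zeros((1, 32), dtype=np.int64)})

    session.deactivate()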
# SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/mixtral_moe/__init__.py b/QEfficient/transformers/models/mixtral_moe/__init__.py index 8694aa938..d259e435a 100644 --- a/QEfficient/transformers/models/mixtral_moe/__init__.py +++ b/QEfficient/transformers/models/mixtral_moe/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py index da8e5cf8c..2b7261aa3 100644 --- a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py +++ b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 87fb67847..21c893b41 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -26,7 +26,7 @@ class QEFFTransformersBase(QEFFBaseModel): """ - Parent class for models QEFF provides from transformers i.e. (AutoModel, AutoModelForCausalLM, AutoModelForAudioClassification etc.) from src/transformers/models/auto/modeling_auto.py file. + Parent class for models QEFF provides from transformers, i.e. (AutoModel, AutoModelForCausalLM, AutoModelForAudioClassification, etc.) from the transformers/models/modeling_auto.py file. """ def __init__(self, model: nn.Module, pretrained_model_name_or_path: str, **kwargs) -> None: @@ -52,20 +52,34 @@ def __init__(self, model: nn.Module, pretrained_model_name_or_path: str, **kwarg def __repr__(self) -> str: return f"{self.__class__.__name__}\n" + self.model.__repr__() - @property - def tokenizer(self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: - if self._tokenizer is None: - self._tokenizer = self.get_tokenizer() - return self._tokenizer - @classmethod def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): """ - This method accepts All the parameters that are acceptable by transformers.AutoModelForCausalLM. + This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModelForCausalLM. + Once the model is initialized, you can use other methods such as export, compile, and generate on the same object. + + Accepts all the parameters that are acceptable by ``transformers.AutoModelForCausalLM``. There are few additional parameters that this method can take. - --------- - :transform: bool. Whether to optimize model for KV retention; default is True. Pass False to get BertStyle model. - :model_card_name: str. HuggingFace model card name or name of the model if custom, used for deciding folder name while saving ONNX/qpc files.
+ + ``Optional`` Args: + :transform (bool): Whether to optimize model for KV retention; default is ``True``. Pass ``False`` to get BertStyle model. + :model_card_name (str): ``HuggingFace`` model card name or name of the model if custom, used for deciding directory name while saving ``ONNX/qpc`` files. + + Example usage: + + .. code-block:: python + + from QEfficient import QEFFAutoModelForCausalLM + + # Initialize the model using from_pretrained similar to transformers.AutoModelForCausalLM + model = QEFFAutoModelForCausalLM.from_pretrained("gpt2") + + # Now you can directly compile the model for Cloud AI 100 + model.compile(num_cores=14, device_group=[0]) # Considering you have a Cloud AI 100 Standard SKU + + # You can now execute the model + model.generate(prompts=["Hi there!!"]) + """ model_card_name = kwargs.pop( "model_card_name", None ) @@ -85,6 +99,18 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): **kwargs, ) + @property + def tokenizer(self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + """Returns the tokenizer for given model based on ``self.pretrained_model_name_or_path``. + Loads the tokenizer if required. + + Returns: + :Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: Tokenizer from ``transformers`` for the given model. + """ + if self._tokenizer is None: + self._tokenizer = self.get_tokenizer() + return self._tokenizer + def get_tokenizer(self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=self.pretrained_model_name_or_path, **self.kwargs) return tokenizer @@ -92,12 +118,30 @@ class QEFFAutoModelForCausalLM(QEFFTransformersBase): """ - QEFF class for manipulating any causal language model from HuggingFace hub. + The QEFF class is designed for manipulating any causal language model from the HuggingFace hub. + Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization. + Please note that the QEFF class is also a part of the ``QEfficient`` module. + + ``Mandatory`` Args: + :model (nn.Module): PyTorch model. + :pretrained_model_name_or_path (str): We recommend passing the name of the model here, as you are not using the ``from_pretrained`` method. This name will be used for deciding the path of the ``ONNX/qpc`` files generated during the ``export`` and ``compilation`` stages. + + .. code-block:: python + + from QEfficient import QEFFAutoModelForCausalLM + """ _pytorch_transforms = [CustomOpsTransform, KVCacheTransform] def transform(self): + """ + This method applies all relevant optimization transforms on the model and toggles the ``self.is_transformed`` attribute to True. If the model is already transformed, the method will simply return. + Please note that this method does not require any input arguments. + + Returns: + :obj: Same object with transformed ``self.model``. + """ if self.is_transformed: return for transform in self._pytorch_transforms: @@ -108,6 +152,21 @@ def execute(self, *args, **kwargs): # type: ignore raise NotImplementedError("Reached too far!!") def export(self, model_card_name: Optional[str] = None) -> str: + """ + Exports the model to ``ONNX`` format using ``torch.onnx.export``. + The model should already be transformed i.e. ``self.is_transformed`` should be ``True``. + Otherwise, this will raise an ``AssertionError``. + We currently don't support exporting non-transformed models.
Please refer to the ``convert_to_cloud_bertstyle`` function in the **Low-Level API** for a legacy function that supports this. + + ``Optional`` Args: + :model_card_name (Optional[str]): Name of the model card. Mandatory when the model is initialized with a path for the ``pretrained_model_name_or_path`` argument. ``Defaults to None.`` + + Raises: + :AttributeError: If ``pretrained_model_name_or_path`` is a path, this function needs the model card name so that it can distinguish between directories while saving the generated ``ONNX`` files. In that case, the user needs to pass ``model_card_name`` as a valid ``string``; otherwise, this error will be raised. + + Returns: + :str: Path of the generated ``ONNX`` graph. + """ assert self.is_transformed, "Please first run transform on the QEFFAutoModelForCausalLM object" # Make sure model_card_name is available for export @@ -135,6 +194,27 @@ def compile( mos: int = -1, aic_enable_depth_first: bool = False, ) -> str: + """ + This method compiles the exported ``ONNX`` model using the Cloud AI 100 Platform SDK compiler binary found at ``/opt/qti-aic/exec/qaic-exec`` and generates a ``qpc`` package. + If the model has not been exported yet, this method will handle the export process. + The generated ``qpc`` can be found under the directory ``efficient-transformers/qeff_models/{self.model_card_name}/qpc``. + + ``Mandatory`` Args: + :num_cores (int): Number of cores used to compile the model. + :device_group (List[int]): If this is a list of more than one integer, tensor-slicing is invoked. + ``Optional`` Args: + :model_card_name (Optional[str], optional): Name of the model; mandatory if ``self.pretrained_model_name_or_path`` is a path. ``Defaults to None``. + :batch_size (int, optional): Batch size. ``Defaults to 1``. + :prompt_len (int, optional): The length of the prefill prompt should be less than ``prompt_len``. ``Defaults to 32``. + :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``. + :mxfp6 (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to True``. + :mxint8 (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``. + :mos (int, optional): Effort level to reduce on-chip memory. ``Defaults to -1``, meaning no effort. + :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. + + Returns: + :str: Path of the compiled ``qpc`` package. + """ # Export first if self.ort_runtime_args are not populated if self.onnx_path is None: logger.info(f"Exporting the {self.model.__class__.__name__} model to ONNX for compilation!") @@ -173,6 +253,16 @@ def compile( return self.qpc_path def generate(self, prompts: List[str], runtime: str = "AI_100", **kwargs): + """ + This method generates output until ``eos`` or ``generation_len`` by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. + This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed. + If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped. + + ``Mandatory`` Args: + :prompts (List[str]): List of prompts to run the execution. + ``Optional`` Args: + :runtime (str, optional): Only ``AI_100`` runtime is supported as of now; ``ONNXRT`` and ``PyTorch`` coming soon. Defaults to "AI_100".
+ """ assert Runtime(runtime) == Runtime.AI_100, "Only AI_100 runtime is supported right now via generate API" self.run_cloud_ai_100(prompts=prompts, **kwargs) diff --git a/QEfficient/transformers/models/mpt/__init__.py b/QEfficient/transformers/models/mpt/__init__.py index 91fee0a49..da26921c5 100644 --- a/QEfficient/transformers/models/mpt/__init__.py +++ b/QEfficient/transformers/models/mpt/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/phi3/__init__.py b/QEfficient/transformers/models/phi3/__init__.py index 9b442c4ae..da26921c5 100644 --- a/QEfficient/transformers/models/phi3/__init__.py +++ b/QEfficient/transformers/models/phi3/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/phi3/modeling_phi3.py b/QEfficient/transformers/models/phi3/modeling_phi3.py index aebc92bf0..17f1a8bfc 100644 --- a/QEfficient/transformers/models/phi3/modeling_phi3.py +++ b/QEfficient/transformers/models/phi3/modeling_phi3.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/qwen2/__init__.py b/QEfficient/transformers/models/qwen2/__init__.py index cb1682a79..d259e435a 100644 --- a/QEfficient/transformers/models/qwen2/__init__.py +++ b/QEfficient/transformers/models/qwen2/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/qwen2/modeling_qwen2.py b/QEfficient/transformers/models/qwen2/modeling_qwen2.py index 0d5cd19f2..d5dc1b001 100644 --- a/QEfficient/transformers/models/qwen2/modeling_qwen2.py +++ b/QEfficient/transformers/models/qwen2/modeling_qwen2.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/starcoder2/__init__.py b/QEfficient/transformers/models/starcoder2/__init__.py index cb1682a79..d259e435a 100644 --- a/QEfficient/transformers/models/starcoder2/__init__.py +++ b/QEfficient/transformers/models/starcoder2/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py b/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py index f63b19327..b6a4d4d23 100644 --- a/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py +++ b/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/transform.py b/QEfficient/transformers/transform.py index eccdba286..e2c65c6c3 100644 --- a/QEfficient/transformers/transform.py +++ b/QEfficient/transformers/transform.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- @@ -19,9 +19,10 @@ def replace_module_with_qeff_layers(model: nn.Module) -> None: """ - Replaces the transformers nn.Module classes with optmized QEff classes in place. - ---------- - :param model: torch.nn.Module. Base PyTorch model. + Replaces the transformers nn.Module classes with optimized QEff classes in place. + + Args: + :model (torch.nn.Module): Base PyTorch model. """ # Replace if module class is registed in TransformersToQEffModulesDict target_module = TransformersToQEffModulesDict.get(model.__class__) @@ -36,9 +37,12 @@ def replace_module_with_qeff_layers(model: nn.Module) -> None: def get_params_hash(model: nn.Module) -> str: """ Creates a Hash of all the parameters values i.e. weights using SHA256 algo. + + Args: + model (torch.nn.Module): Base PyTorch model. - -------- - :param model: torch.nn.Module. Base PyTorch model. - :returns: str. Hash string + Returns: + :str: Hash string. """ hasher = hashlib.sha256() for _, params in model.named_parameters(): @@ -49,13 +53,13 @@ def get_params_hash(model: nn.Module) -> str: def transform_lm(model: nn.Module) -> nn.Module: """ - Replaces some Transformers torch.nn.Module layers for equivalent optimized modules for cloud AI 100. - --------- + Replaces some Transformers torch.nn.Module layers with equivalent optimized modules for Cloud AI 100. + Args: - param model (torch.nn.Module): PyTorch model. + model (torch.nn.Module): PyTorch model. Returns: - torch.nn.Module: PyTorch Module with replaced QEff layers.
+        :torch.nn.Module: PyTorch Module with replaced QEff layers.
     """
 
     # Introducing qeff_transformed attribute in model to check status of transform
@@ -84,11 +88,11 @@ def transform_lm(model: nn.Module) -> nn.Module:
 
 def transform(model: QEFFBaseModel, form_factor="cloud"):
     """
-    This function serves for optimizing any kind of model (i.e. LLM, SD, AWQ etc.) for cloud AI 100.
+    This function optimizes any kind of model (i.e. LLM, SD, AWQ etc.) for Cloud AI 100.
     Will replace the torch.nn.Module layers of passed QEffModel with optimized implementation of the same.
 
-    model: object of any instance of class that is child of `QEFFBaseAutoModelFactory`
-    form_factor(str): form factor configuration for optmizing the model, available options=["cloud", "edge"].
+    model (torch.nn.Module): instance of any class that is a child of `QEFFBaseAutoModelFactory`
+    form_factor (str): form factor configuration for optimizing the model, available options=["cloud", "edge"].
     """
     assert form_factor == "cloud", "Only form_factor='cloud' is supported as of now!"
     # FIXME: move this to class and use model.transform()
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index d3283ece1..61efc4f92 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -82,9 +82,12 @@ def qpc_exists(qpc_dir_path: str) -> bool:
     1. Boolean variable indicating if qpc files exist
     2. Path of the qpc dir if found.
     ---------
-    :param model_name: str. HF Model card name.
-    :param dir_path: str. Path of qpc directory.
-    :return: Union[Tuple[bool, str]]: qpc_exists and path to qpc directory
+
+    :model_name: `str` - HF Model card name.
+    :dir_path: `str` - Path of qpc directory.
+
+    Return:
+        qpc_exists and path to qpc directory
     """
     # Compute the boolean indicating if the QPC exists
@@ -97,8 +100,11 @@ def onnx_exists(model_name: str) -> Tuple[bool, str, str]:
     """
     Checks if onnx files already exist, removes the directory if files have been manipulated.
     ---------
+
-    :param model_name: str. HF Model card name.
-    :return: Union[Tuple[bool, str, str]]: onnx_exists and path to onnx file and directory
+
+    :model_name: `str` - HF Model card name.
+
+    Return:
+        onnx_exists and path to onnx file and directory
     """
     model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name))
     os.makedirs(model_card_dir, exist_ok=True)
@@ -182,7 +188,7 @@ def padding_check_and_fix(tokenizer: Union[PreTrainedTokenizer, PreTrainedTokeni
     Checks and fixes tokenizer padding side and pad_token_id viability.
     --------
-    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]. Pass model tokenizer to check and fix.
+    tokenizer: `Union[PreTrainedTokenizer, PreTrainedTokenizerFast]` - Pass model tokenizer to check and fix.
     """
     if tokenizer.padding_side != "right":
         logger.warning(f"Setting tokenizer padding_side to 'right', got {tokenizer.padding_side}")
@@ -208,7 +214,8 @@ def get_padding_shape_from_config(config, batch_size, seq_len):
     :batch_size: int. number of input prompts used to create inputs
     :seq_len: int. sequence length to run the model for.
 
-    :return: List[int, int, int, int]
+    Return:
+        List[int, int, int, int]
     """
     if hasattr(config, "n_head"):  # Assuming n_head is a key in the config (GPTs/CodeGen)
@@ -242,7 +249,8 @@ def get_num_layers_from_config(config):
 
     :config: AutoConfig from pretrained model.
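The `get_padding_shape_from_config` hunk above documents a helper that derives the past-KV padding shape from a model config. A hedged sketch of the idea; only the `n_head`/GPT-style branch is visible in the diff, so the fallback attribute names below are assumptions:

```python
# Sketch only: derive a [batch, heads, seq, head_dim] KV padding shape.
from typing import List

from transformers import AutoConfig


def padding_shape(config, batch_size: int, seq_len: int) -> List[int]:
    if hasattr(config, "n_head"):  # GPT/CodeGen-style configs, as in the diff
        n_heads, d_model = config.n_head, config.n_embd
    else:  # assumed fallback for LLaMA-style configs
        n_heads, d_model = config.num_attention_heads, config.hidden_size
    head_dim = d_model // n_heads
    # One buffer of this shape per KV-cache entry
    return [batch_size, n_heads, seq_len, head_dim]


print(padding_shape(AutoConfig.from_pretrained("gpt2"), batch_size=1, seq_len=128))  # [1, 12, 128, 64]
```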
-    :return: int: number of layers
+    Return:
+        number of layers
     """
     if hasattr(config, "n_layer"):  # Assuming n_layer is a key in the config (GPTs/CodeGen)
diff --git a/QEfficient/utils/device_utils.py b/QEfficient/utils/device_utils.py
index 8faaf5f10..fded5d8e4 100644
--- a/QEfficient/utils/device_utils.py
+++ b/QEfficient/utils/device_utils.py
@@ -1,6 +1,6 @@
 # -----------------------------------------------------------------------------
 #
-# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
@@ -13,6 +13,13 @@
 def get_available_device_id():
+    """
+    API to check available device id.
+
+    Return:
+        :int: Available device id.
+    """
+
     device_id = 0
     result = None
     while 1:
diff --git a/QEfficient/utils/generate_inputs.py b/QEfficient/utils/generate_inputs.py
index 657b836ba..79e8653d8 100644
--- a/QEfficient/utils/generate_inputs.py
+++ b/QEfficient/utils/generate_inputs.py
@@ -15,14 +15,14 @@
 class InputHandler:
     def __init__(self, batch_size, tokenizer, config, prompt, prompt_len, ctx_len):
         """
         Initialization
-        --------
-
-        :batch_size: int. Number of prompts to run in one batch.
-        :tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]. Pass model tokenizer.
-        :config: AutoConfig from pretrained model.
-        :prompt: List[str]. String to used as input prompt for the model.
-        :prompt_len: int. prompt length for the model to compile.
-        :ctx_len: int. Maximum context length to compile the model.
+
+        ``Mandatory`` Args:
+            :batch_size (int): Number of prompts to run in one batch.
+            :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Pass model tokenizer.
+            :config (AutoConfig): Config from pretrained model.
+            :prompt (List[str]): String to be used as input prompt for the model.
+            :prompt_len (int): Prompt length for the model to compile.
+            :ctx_len (int): Maximum context length to compile the model.
         """
         # check and fix tokenizer viability
         padding_check_and_fix(tokenizer)
@@ -36,9 +36,9 @@ def __init__(self, batch_size, tokenizer, config, prompt, prompt_len, ctx_len):
     def prepare_pytorch_inputs(self):
         """
         Function responsible for creating Prefill stage tensor inputs for PyTorch model.
-        --------
 
-        :return inputs: Dict. input_ids, position_ids, past_key_values
+        Return:
+            :Dict: input_ids, position_ids, past_key_values
         """
 
         inputs = self.tokenizer(
@@ -79,12 +79,13 @@ def prepare_pytorch_inputs(self):
     def update_pytorch_inputs(self, inputs, pt_outputs):
         """
         Function responsible for updating Prefill stage inputs to create decode stage inputs for PyTorch model.
-        --------
-        :inputs: Dict. Pytorch inputs from previous iteration
-        :pt_outputs: Dict. Pytorch outputs from previous iteration
+        ``Mandatory`` Args:
+            :inputs (Dict): PyTorch inputs from previous iteration
+            :pt_outputs (Dict): PyTorch outputs from previous iteration
 
-        :return updated_inputs: Dict. Updated input_ids, position_ids and past_key_values
+        Return:
+            :Dict: Updated input_ids, position_ids and past_key_values
         """
         updated_inputs = {}
         updated_inputs["input_ids"] = pt_outputs["logits"].argmax(-1).reshape(-1, 1)
@@ -97,9 +98,9 @@ def update_pytorch_inputs(self, inputs, pt_outputs):
     def prepare_ort_inputs(self):
         """
         Function responsible for creating Prefill stage numpy inputs for ONNX model to be run on ONNXRT.
-        --------
 
-        :return inputs: Dict. input_ids, position_ids, past_key_values
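The `update_pytorch_inputs` hunk above turns one iteration's outputs into the next decode step's inputs: greedy-pick the next token from the logits and carry the retained KV cache forward. A minimal sketch; only the `input_ids` line is visible in the diff, so the position-id and KV-cache updates below are assumptions:

```python
# Sketch only: build decode-step inputs from the previous step's outputs.
from typing import Dict

import torch


def update_inputs(inputs: Dict, pt_outputs: Dict) -> Dict:
    updated_inputs = {}
    # Greedy next token, reshaped to (batch, 1) for the decode step -- as in the diff
    updated_inputs["input_ids"] = pt_outputs["logits"].argmax(-1).reshape(-1, 1)
    # Advance each sequence one position past its previous maximum (assumption)
    updated_inputs["position_ids"] = inputs["position_ids"].max(1, keepdim=True).values + 1
    # Reuse the retained KV cache from the previous forward pass (assumption)
    updated_inputs["past_key_values"] = pt_outputs["past_key_values"]
    return updated_inputs
```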
+        Return:
+            :Dict: input_ids, position_ids, past_key_values
         """
 
         inputs = self.tokenizer(
@@ -129,12 +130,13 @@ def prepare_ort_inputs(self):
     def update_ort_inputs(self, inputs, ort_outputs):
         """
         Function responsible for updating Prefill stage inputs to create decode stage inputs for ONNX model to be run on ONNXRT.
-        --------
-        :inputs: Dict. NumPy inputs of Onnx model from previous iteration
-        :ort_outputs: Dict. Numpy outputs of Onnx model from previous iteration
+        ``Mandatory`` Args:
+            :inputs (Dict): NumPy inputs of Onnx model from previous iteration
+            :ort_outputs (Dict): NumPy outputs of Onnx model from previous iteration
 
-        :return updated_inputs: Dict. Updated input_ids, position_ids and past_key_values
+        Return:
+            :Dict: Updated input_ids, position_ids and past_key_values
         """
 
         updated_inputs = {}
@@ -149,11 +151,12 @@ def update_ort_inputs(self, inputs, ort_outputs):
     def update_ort_outputs(self, ort_outputs):
         """
         Function responsible for updating ONNXRT session outputs.
-        --------
-        :ort_outputs: Dict. Numpy outputs of Onnx model from current iteration
+        ``Mandatory`` Args:
+            :ort_outputs (Dict): NumPy outputs of Onnx model from current iteration
 
-        :return updated_outputs: Dict. Updated past_key_values, logits
+        Return:
+            :Dict: Updated past_key_values, logits
         """
 
         present_key_values = []
diff --git a/QEfficient/utils/logging_utils.py b/QEfficient/utils/logging_utils.py
index 8dbfd378d..c17fde29c 100644
--- a/QEfficient/utils/logging_utils.py
+++ b/QEfficient/utils/logging_utils.py
@@ -1,6 +1,6 @@
 # -----------------------------------------------------------------------------
 #
-# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
diff --git a/QEfficient/utils/run_utils.py b/QEfficient/utils/run_utils.py
index ee5471cb1..8acd36f46 100644
--- a/QEfficient/utils/run_utils.py
+++ b/QEfficient/utils/run_utils.py
@@ -20,23 +20,23 @@ class ApiRunner:
     ApiRunner class is responsible for running:
     ---------
-    1. HuggingFace PyTorch model
+    1. HuggingFace ``PyTorch`` model
     2. Transformed KV PyTorch Model
-    3. ONNX model on ONNXRT
-    4. ONNX model on Cloud AI 100
+    3. ``ONNX`` model on ONNXRT
+    4. ``ONNX`` model on Cloud AI 100
     """
 
     def __init__(self, batch_size, tokenizer, config, prompt, prompt_len, ctx_len):
         """
         Initialization
-        --------
-
-        :batch_size: int. Number of prompts to run in one batch.
-        :tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]. Pass model tokenizer.
-        :config: AutoConfig from pretrained model.
-        :prompt: List[str]. input prompt for running the model.
-        :prompt_len: int. prompt length to compile the model.
-        :ctx_len: int. Maximum context length to compile the model.
+
+        Args:
+            :batch_size (int): Number of prompts to run in one batch.
+            :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Pass model tokenizer.
+            :config (AutoConfig): Config from pretrained model.
+            :prompt (List[str]): Input prompt for running the model.
+            :prompt_len (int): Prompt length to compile the model.
+            :ctx_len (int): Maximum context length to compile the model.
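The `update_ort_outputs` hunk above repacks one ONNXRT step's outputs, collecting the retained KV-state outputs into the `past_key_values` the next step consumes alongside the logits. The output-naming convention in this sketch is an assumption for illustration:

```python
# Sketch only: regroup ONNXRT outputs for the next decode iteration.
from typing import Dict


def update_outputs(ort_outputs: Dict) -> Dict:
    # Gather retained-state KV outputs (the name prefix is assumed) in a stable order.
    present_key_values = [
        ort_outputs[name] for name in sorted(ort_outputs) if name.startswith("past_")
    ]
    return {"past_key_values": present_key_values, "logits": ort_outputs["logits"]}
```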
""" self.input_handler = InputHandler( batch_size=batch_size, @@ -52,12 +52,13 @@ def __init__(self, batch_size, tokenizer, config, prompt, prompt_len, ctx_len): @torch.no_grad() def run_hf_model_on_pytorch(self, model_hf): """ - Function responsible for running HuggingFace PyTorch model and return the output tokens - -------- + Function responsible for running HuggingFace ``PyTorch`` model and return the output tokens - :model_hf: torch.nn.module. Original PyTorch model + ``Mandatory`` Args: + :model_hf (torch.nn.module): Original ``PyTorch`` model - :return generated_ids: numpy.ndarray. Generated output tokens + Return: + :numpy.ndarray: Generated output tokens """ input_ids = self.input_handler.tokenizer.encode(self.input_handler.prompt[0], return_tensors="pt") @@ -78,12 +79,13 @@ def run_hf_model_on_pytorch(self, model_hf): def run_kv_model_on_pytorch(self, model): """ - Function responsible for running KV PyTorch model and return the output tokens - -------- + Function responsible for running KV ``PyTorch`` model and return the output tokens - :model: torch.nn.module. Transformed PyTorch model + ``Mandatory`` Args: + :model (torch.nn.module): Transformed ``PyTorch`` model - :return generated_ids: numpy.ndarray. Generated output tokens + Return: + :numpy.ndarray: Generated output tokens """ generated_ids = [] @@ -103,16 +105,16 @@ def run_kv_model_on_pytorch(self, model): print("Completion:", repr(predicted_string)) return generated_ids - def run_ort_session(self, inputs, session): + def run_ort_session(self, inputs, session) -> dict: """ - Function responsible for running onnxrt session with given inputs and - passing retained state outputs to be used for next iteration inputs - -------- + Function responsible for running onnxrt session with given inputs and passing retained state outputs to be used for next iteration inputs - :inputs: Dict. Numpy inputs of Onnx model - :session: 'onnxruntime.capi.onnxruntime_inference_collection.InferenceSession'. + ``Mandatory`` Args: + :inputs (Dict): + :session (onnxruntime.capi.onnxruntime_inference_collection.InferenceSession): - :return outputs: Dict. Numpy outputs of Onnx model + Return: + :Dict: Numpy outputs of Onnx model """ output_names = [x.name for x in session.get_outputs()] session_input_names = [x.name for x in session.get_inputs()] @@ -126,12 +128,13 @@ def run_ort_session(self, inputs, session): def run_kv_model_on_ort(self, model_path): """ - Function responsible for running ONNX model on onnxruntime and return the output tokens - -------- + Function responsible for running ``ONNX`` model on onnxruntime and return the output tokens - :model_path: str. Path to the Onnx model. + ``Mandatory`` Args: + :model_path (str): Path to the Onnx model. - :return generated_ids: numpy.ndarray. Generated output tokens + Return: + :numpy.ndarray: Generated output tokens """ # Replace invalid index value for INT32 max to 0 using add_initializer @@ -172,13 +175,14 @@ def run_kv_model_on_ort(self, model_path): def run_kv_model_on_cloud_ai_100(self, qpc_path, device_group): """ - Function responsible for running ONNX model on Cloud AI 100 and return the output tokens - -------- + Function responsible for running ``ONNX`` model on Cloud AI 100 and return the output tokens - :qpc_path: str. path to qpc generated after compilation - :device_group: List[int]. Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled. 
+        ``Mandatory`` Args:
+            :qpc_path (str): Path to qpc generated after compilation
+            :device_group (List[int]): Device Ids to be used for compilation. If len(device_group) > 1, multiple card setup is enabled.
 
-        :return generated_ids: numpy.ndarray. Generated output tokens
+        Return:
+            :numpy.ndarray: Generated output tokens
         """
         execinfo = cloud_ai_100_exec_kv_helper(
             tokenizer=self.input_handler.tokenizer,
diff --git a/README.md b/README.md
index bd1daab65..883a718af 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,8 @@
-
- -
+![Qualcomm Cloud AI 100](docs/image/Cloud_AI_100.png)
 
----
-# Qualcomm Transformers Library
+---
+# Efficient Transformers Library
 ---
 
 *Latest news* :fire: