diff --git a/LICENSE b/LICENSE index ffde93d75..03e0408a6 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved. +Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted (subject to the limitations in the diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 8ab26734a..742163e5e 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/base/__init__.py b/QEfficient/base/__init__.py index 5fb59abd0..257051d97 100644 --- a/QEfficient/base/__init__.py +++ b/QEfficient/base/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/cloud/compile.py b/QEfficient/cloud/compile.py index adfafbb30..450e5618e 100644 --- a/QEfficient/cloud/compile.py +++ b/QEfficient/cloud/compile.py @@ -55,7 +55,7 @@ "--device-group", required=True, type=lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")], - help="Cloud AI 100 device ids (comma-separated) e.g. [0] ", + help="Cloud AI 100 device ids (comma-separated) e.g. [0,1] ", ) parser.add_argument( "--aic_enable_depth_first", diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index bd22830c0..c6145dfd5 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -25,18 +25,23 @@ def main( hf_token: Optional[str] = None, ) -> None: """ - Helper function used by execute CLI app to run the Model on Cloud AI 100 Platform. - --------- + Helper function used by execute CLI app to run the Model on ``Cloud AI 100`` Platform. - :model_name: str. Hugging Face Model Card name, Example: "gpt2" - :qpc_path: str. Path to the generated binary after compilation. - :device_group: List[int]. Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled. - :local_model_dir: str. Path to custom model weights and config files. - :prompt: str. Sample prompt for the model text generation - :prompts_txt_file_path: str. Path to txt file for multiple input prompts - :generation_len: int. Number of tokens to be generated. - :cache_dir: str. Cache dir where downloaded HuggingFace files are stored. - :hf_token: str. HuggingFace login token to access private repos. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2``. + :qpc_path (str): Path to the generated binary after compilation. + :device_group (List[int]): Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled. + ``Optional`` Args: + :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` + :prompt (str): Sample prompt for the model text generation. 
``Defaults to None.`` + :prompts_txt_file_path (str): Path to txt file for multiple input prompts. ``Defaults to None.`` + :generation_len (int): Number of tokens to be generated. ``Defaults to None.`` + :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to Constants.CACHE_DIR.`` + :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` + + .. code-block:: bash + + python -m QEfficient.cloud.execute OPTIONS """ tokenizer = load_hf_tokenizer( pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index c7dee0e2b..47b62b92a 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -28,13 +28,14 @@ def get_onnx_model_path( ): """ exports the model to onnx if pre-exported file is not found and returns onnx_model_path - --------- - :model_name: str. Hugging Face Model Card name, Example: "gpt2" - :cache_dir: str. Cache dir where downloaded HuggingFace files are stored. - :tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]. Pass model tokenizer. - :hf_token: str. HuggingFace login token to access private repos. - :local_model_dir: str. Path to custom model weights and config files. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2``. + ``Optional`` Args: + :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` + :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Pass model tokenizer. ``Defaults to None.`` + :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` + :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` """ onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name) if onnx_path_exists: @@ -67,12 +68,19 @@ def main( ) -> None: """ Helper function used by export CLI app for exporting to ONNX Model. - --------- - :model_name: str. Hugging Face Model Card name, Example: gpt2 - :cache_dir: str. Cache dir to store the downloaded HuggingFace files. - :hf_token: str. HuggingFace login token to access private repos. - :local_model_dir: str. Path to custom model weights and config files. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2``. + + ``Optional`` Args: + :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` + :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` + :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` + + .. code-block:: bash + + python -m QEfficient.cloud.export OPTIONS + """ cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir) get_onnx_model_path(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token, local_model_dir=local_model_dir) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index f33dcc285..44d93933f 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -36,29 +36,29 @@ def main( hf_token: Optional[str] = None, ) -> None: """ - Helper function used by infer CLI app; to export, compile and execute the model on Cloud AI 100 Platform. - 1. Check if compiled qpc for given config already exists, if it does jump to execute, else - 2. Check if exported ONNX file already exists, if true, jump to compilation -> execution, else - 3. 
Check if HF model exists in cache, if true, start transform -> export -> compilation -> execution, else, - 4. Download HF model -> transform -> export -> compile -> execute - --------- + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + :num_cores (int): Number of cores to compile model on. + :device_group (List[int]): Device Ids to be used for compilation. If ``len(device_group) > 1``, multiple Card setup is enabled. + ``Optional`` Args: + :prompt (str): Sample prompt for the model text generation. ``Defaults to None.`` + :prompts_txt_file_path (str): Path to txt file for multiple input prompts. ``Defaults to None.`` + :aic_enable_depth_first (bool): Enables ``DFS`` with default memory size. ``Defaults to False.`` + :mos (int): Effort level to reduce the on-chip memory. ``Defaults to -1.`` + :batch_size (int): Batch size to compile the model for. ``Defaults to 1.`` + :prompt_len (int): Prompt length for the model to compile. ``Defaults to 32.`` + :ctx_len (int): Maximum context length to compile the model. ``Defaults to 128.`` + :generation_len (int): Number of tokens to be generated. ``Defaults to False.`` + :mxfp6 (bool): Enable compilation for MXFP6 precision. ``Defaults to False.`` + :mxint8 (bool): Compress Present/Past KV to ``MXINT8`` using ``CustomIO`` config. ``Defaults to False.`` + :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` + :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` + :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` + + .. code-block:: bash + + python -m QEfficient.cloud.infer OPTIONS - :model_name: str. Hugging Face Model Card name, Example: "gpt2" - :num_cores: int. :num_cores: int. Number of cores to compile model on. - :device_group: List[int]. Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled. - :prompt: str. Sample prompt for the model text generation - :prompts_txt_file_path: str. Path to txt file for multiple input prompts - :aic_enable_depth_first: bool. Enables DFS with default memory size, disabled by default. - :mos: int. Effort level to reduce the on-chip memory. - :batch_size: int. Batch size to compile the model for. - :prompt_len: int. prompt length for the model to compile. - :ctx_len: int. Maximum context length to compile the model. - :generation_len: int. Number of tokens to be generated. - :mxfp6: bool. Enable compilation for MXFP6 precision - :mxint8: Compress Present/Past KV to MXINT8 using CustomIO config, default is False. - :local_model_dir: str. Path to custom model weights and config files. - :cache_dir: str. Cache dir where downloaded HuggingFace files are stored. - :hf_token: str. HuggingFace login token to access private repos. """ cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir) tokenizer = load_hf_tokenizer( diff --git a/QEfficient/compile/compile_helper.py b/QEfficient/compile/compile_helper.py index c2ed0106f..bcd4c7feb 100644 --- a/QEfficient/compile/compile_helper.py +++ b/QEfficient/compile/compile_helper.py @@ -113,21 +113,28 @@ def compile( **kwargs, ) -> str: """ - Helper function used by compile CLI app for compiling the Onnx Model on Cloud AI 100 Platform with given config. - --------- - - :onnx_path: str. Generated Onnx Model Path. - :qpc_path: str. Path for saving compiled qpc binaries. - :num_cores: int. Number of cores to compile model on. - :device_group: List[int]. 
Used for finding number of devices to compile for. - :aic_enable_depth_first: bool. Enables DFS with default memory size, disabled by default. - :mos: int. Effort level to reduce the on-chip memory. - :batch_size: int. Batch size to compile the model for. - :prompt_len: int. prompt len for the model to compile. - :ctx_len: int. Maximum context length to compile the model. - :mxfp6: bool. Enable compilation for MXFP6 precision - :mxint8: Compress Present/Past KV to MXINT8 using CustomIO config, default is False. - :custom_io_file_path: str. Path to custom IO file. + Compiles the given ``ONNX`` model using Cloud AI 100 platform SDK compiler and saves the compiled ``qpc`` package at ``qpc_path``. + Generates tensor-slicing configuration if multiple devices are passed in ``device_group``. + + This function will be deprecated soon and will be replaced by ``QEFFAutoModelForCausalLM.compile``. + + ``Mandatory`` Args: + :onnx_path (str): Generated ``ONNX`` Model Path. + :qpc_path (str): Path for saving compiled qpc binaries. + :num_cores (int): Number of cores to compile the model on. + :device_group (List[int]): Used for finding the number of devices to compile for. + ``Optional`` Args: + :aic_enable_depth_first (bool): Enables ``DFS`` with default memory size. ``Defaults to False.`` + :mos (int): Effort level to reduce the on-chip memory. ``Defaults to -1.`` + :batch_size (int): Batch size to compile the model for. ``Defaults to 1.`` + :prompt_len (int): Prompt length for the model to compile. ``Defaults to 32`` + :ctx_len (int): Maximum context length to compile the model. ``Defaults to 128`` + :mxfp6 (bool): Enable compilation for ``MXFP6`` precision. ``Defaults to True.`` + :mxint8 (bool): Compress Present/Past KV to ``MXINT8`` using ``CustomIO`` config. ``Defaults to False.`` + :custom_io_file_path (str): Path to ``customIO`` file (formatted as a string). ``Defaults to None.`` + + Returns: + :str: Path to compiled ``qpc`` package. """ os.makedirs(qpc_path, exist_ok=True) specialization_json_path = os.path.join(qpc_path, "specializations.json") diff --git a/QEfficient/customop/__init__.py b/QEfficient/customop/__init__.py index 9bfd08998..5b0e1ff02 100644 --- a/QEfficient/customop/__init__.py +++ b/QEfficient/customop/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/customop/rms_norm.py b/QEfficient/customop/rms_norm.py index 210cca686..4cb1df71a 100644 --- a/QEfficient/customop/rms_norm.py +++ b/QEfficient/customop/rms_norm.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/exporter/__init__.py b/QEfficient/exporter/__init__.py index 91fee0a49..da26921c5 100644 --- a/QEfficient/exporter/__init__.py +++ b/QEfficient/exporter/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index d775881a8..dda7487a3 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -32,18 +32,21 @@ def convert_to_cloud_bertstyle( seq_len: int, ) -> str: """ - Function to convert the model to Bertstyle approach. + API to convert model to Bertstyle approach. Bertstyle Approach: - 1. No Prefill/Decode separably compiled - 2. No KV retention logic. - 3. KV is every time computed for all the tokens until EOS/max_length - - Args: - model_name (str): The name of the model to be used. - qeff_model (QEFFBaseModel): Transformed KV torch model to be used - tokenizer (HF AutoTokenizer): Tokenizer to prepare inputs. - onnx_dir_path (str, optional): The path where the model is stored. If None, the model is loaded from the default location. - seq_len (int, optional): The length of the sequence. Default is 128. + 1. No Prefill/Decode separably compiled. + 2. No KV retention logic. + 3. KV is every time computed for all the tokens until EOS/max_length. + + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: `gpt2`. + :qeff_model (QEFFAutoModelForCausalLM): Transformed KV torch model to be used. + :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Model tokenizer. + :onnx_dir_path (str): Path to save exported ONNX file. + :seq_len (int): The length of the sequence. + + Returns: + :str: Path of exported ``ONNX`` file. """ if os.path.exists(onnx_dir_path): logger.warning(f"Overriding {onnx_dir_path}") @@ -147,19 +150,22 @@ def convert_to_cloud_kvstyle( seq_len: int, ) -> str: """ - Function Modeling changes for kv retention and export to Onnx. - KV Style Approach: - 1. This architecture is particularly suitable for autoregressive tasks - 2. where sequence generation involves processing one token at a time + API to convert model with kv retention and export to ONNX. + KV Style Approach- + 1. This architecture is particularly suitable for auto-regressive tasks. + 2. where sequence generation involves processing one token at a time. 3. And contextual information from earlier tokens is crucial for predicting the next token. 4. The inclusion of a kV cache enhances the efficiency of the decoding process, making it more computationally efficient. - Args: - model_name (str): The name of the model to be used. - qeff_model (QEFFBaseModel): Transformed KV torch model to be used - tokenizer (HF AutoTokenizer): Tokenzier to prepare inputs. - onnx_dir_path (str, optional): The path where the model is stored. If None, the model is loaded from the default location. - seq_len (int, optional): The length of the sequence. Default is 128. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: `gpt2`. 
+ :qeff_model (QEFFAutoModelForCausalLM): Transformed KV torch model to be used. + :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Model tokenizer. + :onnx_dir_path (str): Path to save exported ONNX file. + :seq_len (int): The length of the sequence. + + Returns: + :str: Path of exported ``ONNX`` file. """ warnings.warn( "\033[93mThis function will be deprecated soon, use QEfficient.export instead\033[0m", @@ -369,22 +375,35 @@ def qualcomm_efficient_converter( form_factor: str = "cloud", ) -> Tuple[str, str]: """ - Function to convert the input string using the specified model and returns the result. - - Args: - model_name (str): The name of the model to be used. - model_kv (torch.nn.Module): Transformed KV torch model to be used - local_model_dir(str): Path to custom model weights and config files - tokenizer (HF AutoTokenizer): Tokenzier to prepare inputs. - cache_dir (str): Path to cache dir if not specified, default HF cache_dir will be used. - onnx_dir_path (str, optional): The path where the model is stored. If None, the model is loaded from the default location. - hf_token (bool): If True, an authentication token will be used. Default is False. - seq_len (int, optional): The length of the sequence. Default is 128. - kv (bool): If True, key-value pairs will be used. Default is True. - form_factor (str): form_factor of the hardware, currently only accepts "cloud". + This method is an alias for ``QEfficient.export``. + + Usage 1: This method can be used by passing ``model_name`` and ``local_model_dir`` or ``cache_dir`` if required for loading from local dir. + This will download the model from ``HuggingFace`` and export it to ``ONNX`` graph and returns generated files path check below. + + Usage 2: You can pass ``model_name`` and ``model_kv`` as an object of ``QEfficient.QEFFAutoModelForCausalLM``, In this case will directly export the ``model_kv.model`` to ``ONNX`` + + We will be deprecating this function and it will be replaced by ``QEffAutoModelForCausalLM.export``. + + ``Mandatory`` Args: + :model_name (str): The name of the model to be used. + ``Optional`` Args: + :model_kv (torch.nn.Module): Transformed ``KV torch model`` to be used. ``Defaults to None``. + :local_model_dir (str): Path of local model. ``Defaults to None``. + :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Model tokenizer. ``Defaults to None``. + :cache_dir (str): Path of the ``cache`` directory. ``Defaults to None``. + :onnx_dir_path (str): Path to store ``ONNX`` file. ``Defaults to None``. + :hf_token (str): HuggingFace token to access gated models. ``Defaults is None``. + :seq_len (int): The length of the sequence. ``Defaults is 128``. + :kv (bool): If false, it will export to Bert style. ``Defaults is True``. + :form_factor (str): Form factor of the hardware, currently only ``cloud`` is accepted. ``Defaults to cloud``. Returns: - None, if automation is False, else path to exported Onnx file + :Tuple[str, str]: Path to Base ``ONNX`` dir and path to generated ``ONNX`` model + + .. 
code-block:: python + + import QEfficient + base_path, onnx_model_path = QEfficient.export(model_name="gpt2") """ warnings.warn( @@ -392,6 +411,7 @@ def qualcomm_efficient_converter( DeprecationWarning, stacklevel=2, ) + # Get model_kv first model_kv = ( model_kv diff --git a/QEfficient/exporter/export_utils.py b/QEfficient/exporter/export_utils.py index 8c33bc6ca..75ee08a89 100644 --- a/QEfficient/exporter/export_utils.py +++ b/QEfficient/exporter/export_utils.py @@ -27,6 +27,19 @@ def export_onnx( gen_models_path: str, model_base_name: str, ) -> str: + """ + API for export PyTorch model to ONNX. + + Args: + :pt_model (torch.nn.Module): PyTorch model that will be exported to ``ONNX`` format. + :inputs (Dict[str, torch.Tensor]): Processed torch input for the model. + :output_names (List[str]): Output of pytorch model inference. + :gen_models_path (str): Path of generated ``ONNX`` model. + :model_base_name (str): Base name for the exported ``ONNX`` file. + + Return: + :str: Updated base name of exported ``ONNX`` model. + """ # Inspect the model's forward method arguments pt_model_code = pt_model.forward.__code__ pt_input_names = pt_model_code.co_varnames[1 : pt_model_code.co_argcount] @@ -124,6 +137,17 @@ def export_onnx( def save_onnx(model: Union[onnx.ModelProto, str], gen_models_path: str, model_base_name: str) -> str: + """ + API to save ONNX model and it's data separately if size of ``ONNX`` model is greater than 2GB. + + Args: + :model (Union[onnx.ModelProto, str]): Pass ``ONNX`` model or path of the model. + :gen_models_path (str): Path of generated ``ONNX`` model. + :model_base_name (str): Base name of the HuggingFace model. + + Return: + :str: Base name of ``ONNX`` exported model. + """ if isinstance(model, str): model = onnx.load(f"{gen_models_path}/{model}.onnx") @@ -149,20 +173,21 @@ def save_onnx(model: Union[onnx.ModelProto, str], gen_models_path: str, model_ba return model_base_name -def remove_temp_file(file_path_model, file_path_weights): +def remove_temp_file(model_file_path: str, weights_file_path: str): """ - Function to remove a temporary file + API to remove a temporary file - :param str file_path: Path to the file to be deleted - :file_path_weights: Path to the weights file + Args: + :model_file_path (str); Path to the file to be deleted + :weights_file_path (str): Path to the weights file """ try: - os.remove(file_path_model) - os.remove(file_path_weights) + os.remove(model_file_path) + os.remove(weights_file_path) except FileNotFoundError: - print(f"File '{file_path_model}' does not exist.") + print(f"File '{model_file_path}' does not exist.") except Exception as e: - print(f"Error deleting file '{file_path_model}': {e}") + print(f"Error deleting file '{model_file_path}': {e}") def fix_onnx_fp16( @@ -173,6 +198,20 @@ def fix_onnx_fp16( model_base_name: str, pt_outputs: Dict[str, torch.Tensor], ) -> str: + """ + API to clip model weights in fp16 range and save updated clipped ``ONNX`` model. + + Args: + :inputs (Dict[str, torch.Tensor]): Processed torch input for the model. + :output_names (List[str]): Output names of pytorch model inference. + :ort_outputs (List[np.ndarray]): Output of onnxruntime. + :gen_models_path (str): Path of generated ``ONNX`` model. + :model_base_name (str): Base name for the exported ONNX model. + :pt_outputs (Dict[str, torch.Tensor]): Output of PyTorch model inference. + + Return: + :str: Updated base name of exported ONNX model. 
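+
+    A rough usage sketch (variable names are illustrative and assume the surrounding export flow has already produced them):
+
+    .. code-block:: python
+
+        input_names, ort_outputs = run_model_on_ort(onnx_path, inputs, output_names, pt_outputs)
+        fixed_base_name = fix_onnx_fp16(inputs, output_names, ort_outputs, gen_models_path, model_base_name, pt_outputs)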
+ """ model = onnx.load(os.path.join(gen_models_path, f"{model_base_name}.onnx")) # TODO: Remove this `fix_onnx_fp16` function and replace with this transform # as we're not utilizing the validations done in this function @@ -227,6 +266,15 @@ def generate_input_files( inputs: Dict[str, torch.Tensor], input_list_file: str, ): + """ + API to generate input files, required for Cloud AI 100 execution. + + Args: + :input_files_path (str): Path to save input files. + :input_names (List[str]): Names of inputs to be saved. + :inputs (dict[str, torch.tensor]): Input tensors to be saved in raw format. + :input_list_file (str): File name to save the names of inputs in order. Example - "input_list.txt" + """ # inputFiles os.makedirs(input_files_path, exist_ok=True) filenames = [] @@ -252,6 +300,19 @@ def run_model_on_ort( pt_outputs: Dict[str, torch.Tensor], dtype: bool = True, ) -> Tuple[List[str], List[np.ndarray]]: + """ + API to run ONNX model on ONNX runtime + + Args: + :onnx_path (str): Path of ONNX model. + :inputs (Dict[str, torch.Tensor]): Processed torch input for the model. + :output_names (List[str]): Output from pytorch inference. + :pt_outputs (Dict[str, torch.Tensor]): Output of PyTorch model inference. + :dtype (bool): If False it will consider you are passing clipped version of ``ONNX`` model. + + Return: + :Tuple[List[str], List[np.ndarray]]: input_names + """ try: if dtype: info_string = "fp32" diff --git a/QEfficient/generation/__init__.py b/QEfficient/generation/__init__.py index 91fee0a49..da26921c5 100644 --- a/QEfficient/generation/__init__.py +++ b/QEfficient/generation/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/generation/cloud_infer.py b/QEfficient/generation/cloud_infer.py index 558f9164a..aac3d60d7 100644 --- a/QEfficient/generation/cloud_infer.py +++ b/QEfficient/generation/cloud_infer.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- @@ -48,6 +48,15 @@ def __init__( activate: bool = True, enable_debug_logs: bool = False, ): + """ + Initialise for QAIC inference Session + --------- + + :qpc_path: str. Path to the save generated binary file after compilation. + :device_ids: List[int]. Device Ids to be used for compilation. if devices > 1, it enables multiple card setup. + :activate: bool. If false, activation will be disabled. Default=True. + :enable_debug_logs: bool. If True, It will enable debug logs. Default=False. 
+ """ # Load QPC devices = qaicrt.QIDList(device_ids) self.context = qaicrt.Context(devices) @@ -92,14 +101,25 @@ def output_names(self) -> List[str]: return [binding.name for binding in self.bindings if binding.dir == aicapi.BUFFER_IO_TYPE_OUTPUT] def activate(self): + """Activate qpc""" + self.program.activate() self.execObj = qaicrt.ExecObj(self.context, self.program) def deactivate(self): + """Deactivate qpc""" + del self.execObj self.program.deactivate() def set_buffers(self, buffers: Dict[str, np.ndarray]): + """ + Provide buffer mapping for input and output + + Args: + :buffer (Dict[str, np.ndarray]): Parameter for buffer mapping. + """ + for buffer_name, buffer in buffers.items(): if buffer_name not in self.binding_index_map: warn(f'Buffer: "{buffer_name}" not found') @@ -112,9 +132,25 @@ def set_buffers(self, buffers: Dict[str, np.ndarray]): ) def skip_buffers(self, skipped_buffer_names: List[str]): + """ + skip buffer mapping for given list of buffer names + + Args: + :skipped_buffer_name: List[str]. List of buffer name to be skipped. + """ + self.set_buffers({k: np.array([]) for k in skipped_buffer_names}) def run(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: + """ + Execute on cloud AI 100 + + Args: + :inputs (Dict[str, np.ndarray]): Processed numpy inputs for the model. + + Return: + :Dict[str, np.ndarray]: + """ # Set inputs self.set_buffers(inputs) assert self.execObj.setData(self.qbuffers, self.buf_dims) == qaicrt.QStatus.QS_SUCCESS, "Failed to setData" diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 845f4fb87..d3cd87247 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -23,14 +23,16 @@ @dataclass class CloudAI100ExecInfo: """ - holds all the information about Cloud AI 100 execution - :batch_size: int - :generated_texts: Union[List[List[str]], List[str]] - :generated_ids: Union[List[np.ndarray], np.ndarray] - :prefill_time: float - :decode_perf: float - :total_perf: float - :total_time: float + Holds all the information about Cloud AI 100 execution + + Args: + :batch_size (int): Batch size of the QPC compilation. + :generated_texts (Union[List[List[str]], List[str]]): Generated text(s). + :generated_ids (Union[List[np.ndarray], np.ndarray]): Generated IDs. + :prefill_time (float): Time for prefilling. + :decode_perf (float): Decoding performance. + :total_perf (float): Total performance. + :total_time (float): Total time. """ batch_size: int @@ -99,6 +101,16 @@ def latency_stats_bertstyle( prompt: str, device_id: List[int] = [0], ): + """ + Function to execute Bertstyle ONNX model on Cloud AI 100. + + Args: + :model_name (str): Hugging Face Model Card name, Example: gpt2. + :qpc_path (str): Path to save generated binary file after compilation. + :seq_len (int): Sequence length. + :prompt (str): Sample prompt for the model text generation. + :device_id (List[int]): Device Ids to be used for compilation. If devices > 1, it enables multiple card setup. + """ session = QAICInferenceSession(qpc_path, device_id) tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, padding_side="left") padding_check_and_fix(tokenizer) # Check and fix tokenizer viability @@ -189,6 +201,22 @@ def cloud_ai_100_exec_kv_helper( stream: bool = True, write_io_dir: Optional[str] = None, ): + """ + Helper function to execute QEfficient transformed ONNX model on ``Cloud AI 100`` using compiled QPC file. 
+ + ``Mandatory`` Args: + :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Model tokenizer. + :qpc_path (str): Path to the saved generated binary file after compilation. + :prompt (str): Sample prompt for the model text generation. + :ctx_len (int): Input length of the prompt to determine the number of chunks to execute on ``Cloud AI 100``. + ``Optional`` Args: + :generation_len (int): Maximum context length for the model during compilation. ``Defaults to None``. + :device_id (List[int]): Device IDs to be used for compilation. If ``len(device_id) > 1``, it enables multiple card setup. ``Defaults to [0]``. + :enable_debug_logs (bool): If True, it enables debugging logs. ``Defaults to False``. + :stream (bool): If True, enable streamer, which returns tokens one by one as the model generates them.``Defaults to True``. + :Write_io_dir (str): Path to write the input and output files.``Defaults to None``. + """ + if tokenizer.padding_side != "right": logger.warning("Please use padding_side='right' while initializing the tokenizer") tokenizer.padding_side = "right" @@ -321,6 +349,38 @@ def cloud_ai_100_exec_kv( write_io_dir: Optional[str] = None, automation=False, ): + """ + This method generates output until ``eos`` or ``generation_len`` by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. + This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed. + If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped. + + ``Mandatory`` Args: + :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Model tokenizer. + :qpc_path (str): Path to the saved generated binary file after compilation. + + ``Optional`` Args: + :prompt (str): Sample prompt for the model text generation. ``Defaults to None``. + :prompts_txt_file_path (str): Path of the prompt text file. ``Defaults to None``. + :generation_len (int): Maximum context length for the model during compilation. ``Defaults to None``. + :device_id (List[int]): Device IDs to be used for compilation. If ``len(device_id) > 1``, it enables multiple card setup. ``Defaults to [0]``. + :enable_debug_logs (bool): If True, it enables debugging logs. ``Defaults to False``. + :stream (bool): If True, enable streamer, which returns tokens one by one as the model generates them. ``Defaults to True``. + :Write_io_dir (str): Path to write the input and output files. ``Defaults to None``. + :automation (bool): If true, it prints input, output, and performance stats. ``Defaults to False``. + + Returns: + :CloudAI100ExecInfo: Object holding execution output and performance details. + + .. 
code-block:: python + + import transformers + import QEfficient + base_path, onnx_model_path = QEfficient.export(model_name="gpt2") + qpc_path = QEfficient.compile(onnx_path=onnx_model_path, qpc_path=os.path.join(base_path, "qpc"), num_cores=14, device_group=[0]) + tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2") + execinfo = QEfficient.cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=qpc_path, prompt="Hi there!!", device_id=[0]) + + """ batch_size, ctx_len = get_compilation_dims(qpc_path) prompt: List[str] = get_input_prompts(prompt, prompts_txt_file_path) prompt = fix_prompts(prompt, batch_size) diff --git a/QEfficient/transformers/__init__.py b/QEfficient/transformers/__init__.py index 91fee0a49..da26921c5 100644 --- a/QEfficient/transformers/__init__.py +++ b/QEfficient/transformers/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py index fe70bbab0..0d007e4e6 100644 --- a/QEfficient/transformers/cache_utils.py +++ b/QEfficient/transformers/cache_utils.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/codegen/__init__.py b/QEfficient/transformers/models/codegen/__init__.py index 91fee0a49..da26921c5 100644 --- a/QEfficient/transformers/models/codegen/__init__.py +++ b/QEfficient/transformers/models/codegen/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/falcon/__init__.py b/QEfficient/transformers/models/falcon/__init__.py index cb1682a79..d259e435a 100644 --- a/QEfficient/transformers/models/falcon/__init__.py +++ b/QEfficient/transformers/models/falcon/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/gpt2/__init__.py b/QEfficient/transformers/models/gpt2/__init__.py index 91fee0a49..da26921c5 100644 --- a/QEfficient/transformers/models/gpt2/__init__.py +++ b/QEfficient/transformers/models/gpt2/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/gptj/__init__.py b/QEfficient/transformers/models/gptj/__init__.py index 91fee0a49..da26921c5 100644 --- a/QEfficient/transformers/models/gptj/__init__.py +++ b/QEfficient/transformers/models/gptj/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/llama/__init__.py b/QEfficient/transformers/models/llama/__init__.py index 91fee0a49..da26921c5 100644 --- a/QEfficient/transformers/models/llama/__init__.py +++ b/QEfficient/transformers/models/llama/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/llama/modeling_llama.py b/QEfficient/transformers/models/llama/modeling_llama.py index f3e068b38..bce922f40 100644 --- a/QEfficient/transformers/models/llama/modeling_llama.py +++ b/QEfficient/transformers/models/llama/modeling_llama.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/mistral/__init__.py b/QEfficient/transformers/models/mistral/__init__.py index 91fee0a49..da26921c5 100644 --- a/QEfficient/transformers/models/mistral/__init__.py +++ b/QEfficient/transformers/models/mistral/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/mistral/modeling_mistral.py b/QEfficient/transformers/models/mistral/modeling_mistral.py index d703ea3f2..d2a778500 100644 --- a/QEfficient/transformers/models/mistral/modeling_mistral.py +++ b/QEfficient/transformers/models/mistral/modeling_mistral.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/mixtral_moe/__init__.py b/QEfficient/transformers/models/mixtral_moe/__init__.py index 8694aa938..d259e435a 100644 --- a/QEfficient/transformers/models/mixtral_moe/__init__.py +++ b/QEfficient/transformers/models/mixtral_moe/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py index da8e5cf8c..2b7261aa3 100644 --- a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py +++ b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 87fb67847..21c893b41 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -26,7 +26,7 @@ class QEFFTransformersBase(QEFFBaseModel): """ - Parent class for models QEFF provides from transformers i.e. (AutoModel, AutoModelForCausalLM, AutoModelForAudioClassification etc.) from src/transformers/models/auto/modeling_auto.py file. + Parent class for models QEFF provides from transformers i.e. (AutoModel, AutoModelForCausalLM, AutoModelForAudioClassification etc.) from transformers/models/modeling_auto.py file. """ def __init__(self, model: nn.Module, pretrained_model_name_or_path: str, **kwargs) -> None: @@ -52,20 +52,34 @@ def __init__(self, model: nn.Module, pretrained_model_name_or_path: str, **kwarg def __repr__(self) -> str: return f"{self.__class__.__name__}\n" + self.model.__repr__() - @property - def tokenizer(self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: - if self._tokenizer is None: - self._tokenizer = self.get_tokenizer() - return self._tokenizer - @classmethod def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): """ - This method accepts All the parameters that are acceptable by transformers.AutoModelForCausalLM. + This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModelForCausalLM. + Once the model is initialized, you can use other methods such as export, compile, and generate on the same object. + + Accepts All the parameters that are acceptable by ``transformers.AutoModelForCausalLM`` There are few additional parameters that this method can take. - --------- - :transform: bool. Whether to optimize model for KV retention; default is True. Pass False to get BertStyle model. - :model_card_name: str. HuggingFace model card name or name of the model if custom, used for deciding folder name while saving ONNX/qpc files. 
+ + ``Mandatory`` Args: + :transform (bool): Whether to optimize model for KV retention; default is ``True``. Pass ``False`` to get BertStyle model. + :model_card_name (str): ``HuggingFace`` model card name or name of the model if custom, used for deciding directory name while saving ``ONNX/qpc`` files. + + Example usage: + + .. code-block:: python + + from QEfficient import QEFFAutoModelForCausalLM + + # Initialize the model using from_pretrained similar to transformers.AutoModelForCausalLM + model = QEFFAutoModelForCausalLM.from_pretrained("gpt2") + + # Now you can directly compile the model for Cloud AI 100 + model.compile(num_cores=14, device_group=[0]) # Considering you have a Cloud AI 100 Standard SKU + + # You can now execute the model + model.generate(prompts=["Hi there!!"]) + """ model_card_name = kwargs.pop( "model_card_name", None @@ -85,6 +99,18 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): **kwargs, ) + @property + def tokenizer(self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + """Returns the tokenizer for given model based on ``self.pretrained_model_name_or_path``. + Loads the tokenizer if required. + + Returns: + :Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: Tokenizer from ``transformers`` for the given model. + """ + if self._tokenizer is None: + self._tokenizer = self.get_tokenizer() + return self._tokenizer + def get_tokenizer(self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=self.pretrained_model_name_or_path, **self.kwargs) return tokenizer @@ -92,12 +118,30 @@ def get_tokenizer(self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: class QEFFAutoModelForCausalLM(QEFFTransformersBase): """ - QEFF class for manipulating any causal language model from HuggingFace hub. + The QEFF class is designed for manipulating any causal language model from the HuggingFace hub. + Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization. + Please note that the QEFF class is also a part of the ``QEfficient`` module. + + ``Mandatory`` Args: + :model (nn.Module): PyTorch model + :pretrained_model_name_or_path (str): We recommend passing name of the model as input here, as you are not using `from_pretrained` method. This name will be used for deciding path of the ``ONNX/qpc`` files generated during ``export``, ``compilation`` stages. + + .. code-block:: python + + from QEfficient import QEFFAutoModelForCausalLM + """ _pytorch_transforms = [CustomOpsTransform, KVCacheTransform] def transform(self): + """ + This method applies all relevant optimization transforms on the model and toggles the ``self.is_transformed`` attribute to True. If the model is already transformed, the method will simply return. + Please note that this method does not require any input arguments." + + Returns: + :obj: Same object with transformed ``self.model`` + """ if self.is_transformed: return for transform in self._pytorch_transforms: @@ -108,6 +152,21 @@ def execute(self, *args, **kwargs): # type: ignore raise NotImplementedError("Reached too far!!") def export(self, model_card_name: Optional[str] = None) -> str: + """ + Exports the model to ``ONNX`` format using ``torch.onnx.export``. + The model should already be transformed i.e. ``self.is_transformed`` should be ``True``. + Otherwise, this will raise an ``AssertionError``. + We currently don't support exporting non-transformed models. 
Please refer to the ``convert_to_cloud_bertstyle`` function in the **Low-Level API** for a legacy function that supports this. + + ``Optional`` Args: + :model_card_name (Optional[str]): Name of the model card. Mandatory when model is initialized with path for ``pretrained_model_name_or_path`` argument during initialization. ``Defaults to None.`` + + Raises: + :AttributeError: If ``pretrained_model_name_or_path`` is a path, this function needs the model card name so that it can distinguish between directories while saving the generated ``ONNX`` files. So, the user needs to pass ``model_card_name`` as a valid ``string`` in that case; otherwise, this error will be raised. + + Returns: + :str: Path of the generated ``ONNX`` graph. + """ assert self.is_transformed, "Please first run transform on the QEFFAutoModelForCausalLM object" # Make sure model_card_name is available for export @@ -135,6 +194,27 @@ def compile( mos: int = -1, aic_enable_depth_first: bool = False, ) -> str: + """ + This method compiles the exported ``ONNX`` model using the Cloud AI 100 Platform SDK compiler binary found at ``/opt/qti-aic/exec/qaic-exec`` and generates a ``qpc`` package. + If the model has not been exported yet, this method will handle the export process. + The generated ``qpc`` can be found under the directory ``efficient-transformers/qeff_models/{self.model_card_name}/qpc``. + + ``Mandatory`` Args: + :num_cores (int): Number of cores used to compile the model. + :device_group (List[int]): If this is a list of more than one integer, tensor-slicing is invoked. + ``Optional`` Args: + :model_card_name (Optional[str], optional): Name of the model. Mandatory if ``self.pretrained_model_name_or_path`` is a path. ``Defaults to None``. + :batch_size (int, optional): Batch size. ``Defaults to 1``. + :prompt_len (int, optional): The length of the Prefill prompt should be less than ``prompt_len``. ``Defaults to 32``. + :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``. + :mxfp6 (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to True``. + :mxint8 (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``. + :mos (int, optional): Effort level to reduce on-chip memory (-1 means no effort). ``Defaults to -1``. + :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. + + Returns: + :str: Path of the compiled ``qpc`` package. + """ # Export first if self.ort_runtime_args are not populated if self.onnx_path is None: logger.info(f"Exporting the {self.model.__class__.__name__} model to ONNX for compilation!") @@ -173,6 +253,16 @@ def compile( return self.qpc_path def generate(self, prompts: List[str], runtime: str = "AI_100", **kwargs): + """ + This method generates output until ``eos`` or ``generation_len`` by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. + This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed. + If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped. + + ``Mandatory`` Args: + :prompts (List[str]): List of prompts to run the execution. + ``Optional`` Args: + :runtime (str, optional): Only ``AI_100`` runtime is supported as of now; ``ONNXRT`` and ``PyTorch`` coming soon. Defaults to "AI_100".
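+
+        A short usage sketch (assumes the model object has already been compiled via ``compile``):
+
+        .. code-block:: python
+
+            model.generate(prompts=["Hi there!!"])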
+ """ assert Runtime(runtime) == Runtime.AI_100, "Only AI_100 runtime is supported right now via generate API" self.run_cloud_ai_100(prompts=prompts, **kwargs) diff --git a/QEfficient/transformers/models/mpt/__init__.py b/QEfficient/transformers/models/mpt/__init__.py index 91fee0a49..da26921c5 100644 --- a/QEfficient/transformers/models/mpt/__init__.py +++ b/QEfficient/transformers/models/mpt/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/phi3/__init__.py b/QEfficient/transformers/models/phi3/__init__.py index 9b442c4ae..da26921c5 100644 --- a/QEfficient/transformers/models/phi3/__init__.py +++ b/QEfficient/transformers/models/phi3/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/phi3/modeling_phi3.py b/QEfficient/transformers/models/phi3/modeling_phi3.py index aebc92bf0..17f1a8bfc 100644 --- a/QEfficient/transformers/models/phi3/modeling_phi3.py +++ b/QEfficient/transformers/models/phi3/modeling_phi3.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/qwen2/__init__.py b/QEfficient/transformers/models/qwen2/__init__.py index cb1682a79..d259e435a 100644 --- a/QEfficient/transformers/models/qwen2/__init__.py +++ b/QEfficient/transformers/models/qwen2/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/qwen2/modeling_qwen2.py b/QEfficient/transformers/models/qwen2/modeling_qwen2.py index 0d5cd19f2..d5dc1b001 100644 --- a/QEfficient/transformers/models/qwen2/modeling_qwen2.py +++ b/QEfficient/transformers/models/qwen2/modeling_qwen2.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/starcoder2/__init__.py b/QEfficient/transformers/models/starcoder2/__init__.py index cb1682a79..d259e435a 100644 --- a/QEfficient/transformers/models/starcoder2/__init__.py +++ b/QEfficient/transformers/models/starcoder2/__init__.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py b/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py index f63b19327..b6a4d4d23 100644 --- a/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py +++ b/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/transform.py b/QEfficient/transformers/transform.py index eccdba286..e2c65c6c3 100644 --- a/QEfficient/transformers/transform.py +++ b/QEfficient/transformers/transform.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- @@ -19,9 +19,10 @@ def replace_module_with_qeff_layers(model: nn.Module) -> None: """ - Replaces the transformers nn.Module classes with optmized QEff classes in place. - ---------- - :param model: torch.nn.Module. Base PyTorch model. + Replaces the transformers nn.Module classes with optimized QEff classes in place. + + Args: + :model (torch.nn.Module) Base PyTorch model. """ # Replace if module class is registed in TransformersToQEffModulesDict target_module = TransformersToQEffModulesDict.get(model.__class__) @@ -36,9 +37,12 @@ def replace_module_with_qeff_layers(model: nn.Module) -> None: def get_params_hash(model: nn.Module) -> str: """ Creates a Hash of all the parameters values i.e. weights using SHA256 algo. - -------- - :param model: torch.nn.Module. Base PyTorch model. - :returns: str. Hash string + + Args: + model (torch.nn.Module): Base PyTorch model. + + Returns: + :str: Hash string """ hasher = hashlib.sha256() for _, params in model.named_parameters(): @@ -49,13 +53,13 @@ def get_params_hash(model: nn.Module) -> str: def transform_lm(model: nn.Module) -> nn.Module: """ - Replaces some Transformers torch.nn.Module layers for equivalent optimized modules for cloud AI 100. - --------- + Replaces some Transformers torch.nn.Module layers for equivalent optimized modules for Cloud AI 100. + Args: - param model (torch.nn.Module): PyTorch model. + model (torch.nn.Module): PyTorch model. Returns: - torch.nn.Module: PyTorch Module with replaced QEff layers. 
+ :torch.nn.Module: PyTorch Module with replaced QEff layers. """ # Introducnig qeff_transformed attribue in model to check status of transform @@ -84,11 +88,11 @@ def transform_lm(model: nn.Module) -> nn.Module: def transform(model: QEFFBaseModel, form_factor="cloud"): """ - This function serves for optimizing any kind of model (i.e. LLM, SD, AWQ etc.) for cloud AI 100. + This function serves for optimizing any kind of model (i.e. LLM, SD, AWQ etc.) for Cloud AI 100. Will replace the torch.nn.Module layers of passed QEffModel with optimized implementation of the same. - model: object of any instance of class that is child of `QEFFBaseAutoModelFactory` - form_factor(str): form factor configuration for optmizing the model, available options=["cloud", "edge"]. + model (torch.nn.Module): object of any instance of class that is child of `QEFFBaseAutoModelFactory` + form_factor (str): form factor configuration for optimizing the model, available options=["cloud", "edge"]. """ assert form_factor == "cloud", "Only form_factor='cloud' is supported as of now!" # FIXME: move this to class and use model.transform() diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index d3283ece1..61efc4f92 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -82,9 +82,12 @@ def qpc_exists(qpc_dir_path: str) -> bool: 1. Boolean variable indicating if qpc files exist 2. Path of the qpc dir if found. --------- - :param model_name: str. HF Model card name. - :param dir_path: str. Path of qpc directory. - :return: Union[Tuple[bool, str]]: qpc_exists and path to qpc directory + + :model_name: `str` - HF Model card name. + :dir_path: `str` - Path of qpc directory. + + Return: + qpc_exists and path to qpc directory """ # Compute the boolean indicating if the QPC exists @@ -97,8 +100,11 @@ def onnx_exists(model_name: str) -> Tuple[bool, str, str]: """ Checks if qpc files already exists, removes the directory if files have been manipulated. --------- - :param model_name: str. HF Model card name. - :return: Union[Tuple[bool, str, str]]: onnx_exists and path to onnx file and directory + + :model_name: `str`- HF Model card name. + + Return: + onnx_exists and path to onnx file and directory """ model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) os.makedirs(model_card_dir, exist_ok=True) @@ -182,7 +188,7 @@ def padding_check_and_fix(tokenizer: Union[PreTrainedTokenizer, PreTrainedTokeni Checks and fixes tokenizer paddding side and pad_token_id viability. -------- - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]. Pass model tokenizer to check and fix. + tokenizer: `Union[PreTrainedTokenizer, PreTrainedTokenizerFast]` - Pass model tokenizer to check and fix. """ if tokenizer.padding_side != "right": logger.warning(f"Setting tokenizer padding_side to 'right', got {tokenizer.padding_side}") @@ -208,7 +214,8 @@ def get_padding_shape_from_config(config, batch_size, seq_len): :batch_size: int. number of input prompts used to create inputs :seq_len: int. sequence length to run the model for. - :return: List[int, int, int, int] + Return: + List[int, int, int, int] """ if hasattr(config, "n_head"): # Assuming n_head is a key in the config (GPTs/CodeGen) @@ -242,7 +249,8 @@ def get_num_layers_from_config(config): :config: AutoConfig from pretrained model. 
- :return: int: number of layers + Return: + number of layers """ if hasattr(config, "n_layer"): # Assuming n_layer is a key in the config (GPTs/CodeGen) diff --git a/QEfficient/utils/device_utils.py b/QEfficient/utils/device_utils.py index 8faaf5f10..fded5d8e4 100644 --- a/QEfficient/utils/device_utils.py +++ b/QEfficient/utils/device_utils.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- @@ -13,6 +13,13 @@ def get_available_device_id(): + """ + API to check available device id. + + Return: + :int: Available device id. + """ + device_id = 0 result = None while 1: diff --git a/QEfficient/utils/generate_inputs.py b/QEfficient/utils/generate_inputs.py index 657b836ba..79e8653d8 100644 --- a/QEfficient/utils/generate_inputs.py +++ b/QEfficient/utils/generate_inputs.py @@ -15,14 +15,14 @@ class InputHandler: def __init__(self, batch_size, tokenizer, config, prompt, prompt_len, ctx_len): """ Initialization - -------- - - :batch_size: int. Number of prompts to run in one batch. - :tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]. Pass model tokenizer. - :config: AutoConfig from pretrained model. - :prompt: List[str]. String to used as input prompt for the model. - :prompt_len: int. prompt length for the model to compile. - :ctx_len: int. Maximum context length to compile the model. + + ``Mandatory`` Args: + :batch_size (int): Number of prompts to run in one batch. + :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Pass model tokenizer. + :config (AutoConfig): From pretrained model. + :prompt (List[str]): String to used as input prompt for the model. + :prompt_len (int): Prompt length for the model to compile. + :ctx_len (int): Maximum context length to compile the model. """ # check and fix tokenizer viability padding_check_and_fix(tokenizer) @@ -36,9 +36,9 @@ def __init__(self, batch_size, tokenizer, config, prompt, prompt_len, ctx_len): def prepare_pytorch_inputs(self): """ Function responsible for creating Prefill stage tensor inputs for PyTorch model. - -------- - :return inputs: Dict. input_ids, position_ids, past_key_values + Return: + :Dict: input_ids, position_ids, past_key_values """ inputs = self.tokenizer( @@ -79,12 +79,13 @@ def prepare_pytorch_inputs(self): def update_pytorch_inputs(self, inputs, pt_outputs): """ Function responsible for updating Prefill stage inputs to create decode stage inputs for PyTorch model. - -------- - :inputs: Dict. Pytorch inputs from previous iteration - :pt_outputs: Dict. Pytorch outputs from previous iteration + ``Mandatory`` Args: + :inputs (Dict): Pytorch inputs from previous iteration + :pt_outputs (Dict): Pytorch outputs from previous iteration - :return updated_inputs: Dict. Updated input_ids, position_ids and past_key_values + Return: + :Dict: Updated input_ids, position_ids and past_key_values """ updated_inputs = {} updated_inputs["input_ids"] = pt_outputs["logits"].argmax(-1).reshape(-1, 1) @@ -97,9 +98,9 @@ def update_pytorch_inputs(self, inputs, pt_outputs): def prepare_ort_inputs(self): """ Function responsible for creating Prefill stage numpy inputs for ONNX model to be run on ONNXRT. - -------- - :return inputs: Dict. 
input_ids, position_ids, past_key_values + Return: + :Dict: input_ids, position_ids, past_key_values """ inputs = self.tokenizer( @@ -129,12 +130,13 @@ def prepare_ort_inputs(self): def update_ort_inputs(self, inputs, ort_outputs): """ Function responsible for updating Prefill stage inputs to create inputs for decode stage inputs for ONNX model to be run on ONNXRT. - -------- - :inputs: Dict. NumPy inputs of Onnx model from previous iteration - :ort_outputs: Dict. Numpy outputs of Onnx model from previous iteration + ``Mandatory`` Args: + :inputs (Dict): NumPy inputs of Onnx model from previous iteration + :ort_outputs (Dict): Numpy outputs of Onnx model from previous iteration - :return updated_inputs: Dict. Updated input_ids, position_ids and past_key_values + Return: + :Dict: Updated input_ids, position_ids and past_key_values """ updated_inputs = {} @@ -149,11 +151,12 @@ def update_ort_inputs(self, inputs, ort_outputs): def update_ort_outputs(self, ort_outputs): """ Function responsible for updating ONNXRT session outputs. - -------- - :ort_outputs: Dict. Numpy outputs of Onnx model from current iteration + ``Mandatory`` Args: + :ort_outputs (Dict): Numpy outputs of Onnx model from current iteration - :return updated_outputs: Dict. Updated past_key_values, logits + Return: + updated_outputs (Dict): Updated past_key_values, logits """ present_key_values = [] diff --git a/QEfficient/utils/logging_utils.py b/QEfficient/utils/logging_utils.py index 8dbfd378d..c17fde29c 100644 --- a/QEfficient/utils/logging_utils.py +++ b/QEfficient/utils/logging_utils.py @@ -1,6 +1,6 @@ # ----------------------------------------------------------------------------- # -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- diff --git a/QEfficient/utils/run_utils.py b/QEfficient/utils/run_utils.py index ee5471cb1..8acd36f46 100644 --- a/QEfficient/utils/run_utils.py +++ b/QEfficient/utils/run_utils.py @@ -20,23 +20,23 @@ class ApiRunner: ApiRunner class is responsible for running: --------- - 1. HuggingFace PyTorch model + 1. HuggingFace ``PyTorch`` model 2. Transformed KV Pytorch Model - 3. ONNX model on ONNXRT - 4. ONNX model on Cloud AI 100 + 3. ``ONNX`` model on ONNXRT + 4. ``ONNX`` model on Cloud AI 100 """ def __init__(self, batch_size, tokenizer, config, prompt, prompt_len, ctx_len): """ Initialization - -------- - - :batch_size: int. Number of prompts to run in one batch. - :tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]. Pass model tokenizer. - :config: AutoConfig from pretrained model. - :prompt: List[str]. input prompt for running the model. - :prompt_len: int. prompt length to compile the model. - :ctx_len: int. Maximum context length to compile the model. + + Args: + :batch_size (int): Number of prompts to run in one batch. + :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Pass model tokenizer. + :config (AutoConfig): From pretrained model. + :prompt (List[str]): Input prompt for running the model. + :prompt_len (int): Prompt length to compile the model. + :ctx_len (int): Maximum context length to compile the model. 
""" self.input_handler = InputHandler( batch_size=batch_size, @@ -52,12 +52,13 @@ def __init__(self, batch_size, tokenizer, config, prompt, prompt_len, ctx_len): @torch.no_grad() def run_hf_model_on_pytorch(self, model_hf): """ - Function responsible for running HuggingFace PyTorch model and return the output tokens - -------- + Function responsible for running HuggingFace ``PyTorch`` model and return the output tokens - :model_hf: torch.nn.module. Original PyTorch model + ``Mandatory`` Args: + :model_hf (torch.nn.module): Original ``PyTorch`` model - :return generated_ids: numpy.ndarray. Generated output tokens + Return: + :numpy.ndarray: Generated output tokens """ input_ids = self.input_handler.tokenizer.encode(self.input_handler.prompt[0], return_tensors="pt") @@ -78,12 +79,13 @@ def run_hf_model_on_pytorch(self, model_hf): def run_kv_model_on_pytorch(self, model): """ - Function responsible for running KV PyTorch model and return the output tokens - -------- + Function responsible for running KV ``PyTorch`` model and return the output tokens - :model: torch.nn.module. Transformed PyTorch model + ``Mandatory`` Args: + :model (torch.nn.module): Transformed ``PyTorch`` model - :return generated_ids: numpy.ndarray. Generated output tokens + Return: + :numpy.ndarray: Generated output tokens """ generated_ids = [] @@ -103,16 +105,16 @@ def run_kv_model_on_pytorch(self, model): print("Completion:", repr(predicted_string)) return generated_ids - def run_ort_session(self, inputs, session): + def run_ort_session(self, inputs, session) -> dict: """ - Function responsible for running onnxrt session with given inputs and - passing retained state outputs to be used for next iteration inputs - -------- + Function responsible for running onnxrt session with given inputs and passing retained state outputs to be used for next iteration inputs - :inputs: Dict. Numpy inputs of Onnx model - :session: 'onnxruntime.capi.onnxruntime_inference_collection.InferenceSession'. + ``Mandatory`` Args: + :inputs (Dict): + :session (onnxruntime.capi.onnxruntime_inference_collection.InferenceSession): - :return outputs: Dict. Numpy outputs of Onnx model + Return: + :Dict: Numpy outputs of Onnx model """ output_names = [x.name for x in session.get_outputs()] session_input_names = [x.name for x in session.get_inputs()] @@ -126,12 +128,13 @@ def run_ort_session(self, inputs, session): def run_kv_model_on_ort(self, model_path): """ - Function responsible for running ONNX model on onnxruntime and return the output tokens - -------- + Function responsible for running ``ONNX`` model on onnxruntime and return the output tokens - :model_path: str. Path to the Onnx model. + ``Mandatory`` Args: + :model_path (str): Path to the Onnx model. - :return generated_ids: numpy.ndarray. Generated output tokens + Return: + :numpy.ndarray: Generated output tokens """ # Replace invalid index value for INT32 max to 0 using add_initializer @@ -172,13 +175,14 @@ def run_kv_model_on_ort(self, model_path): def run_kv_model_on_cloud_ai_100(self, qpc_path, device_group): """ - Function responsible for running ONNX model on Cloud AI 100 and return the output tokens - -------- + Function responsible for running ``ONNX`` model on Cloud AI 100 and return the output tokens - :qpc_path: str. path to qpc generated after compilation - :device_group: List[int]. Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled. 
+ ``Mandatory`` Args: + :qpc_path (str): path to qpc generated after compilation + :device_group (List[int]): Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled. - :return generated_ids: numpy.ndarray. Generated output tokens + Return: + :numpy.ndarray: Generated output tokens """ execinfo = cloud_ai_100_exec_kv_helper( tokenizer=self.input_handler.tokenizer, diff --git a/README.md b/README.md index bd1daab65..883a718af 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,8 @@ -

- - Cloud AI 100 - -

+![alt text](docs/image/Cloud_AI_100.png)
----
-# Qualcomm Transformers Library
+---
+# Efficient Transformers Library
 ---
 *Latest news* :fire:
@@ -21,6 +17,9 @@ - [05/2024] Added support for [Mixtral-8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) & [Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1). - [04/2024] Initial release of [efficient transformers](https://github.com/quic/efficient-transformers) for seamless inference on pre-trained LLMs. + +# Overview + ## Train anywhere, Infer on Qualcomm Cloud AI with a Developer-centric Toolchain This library provides reimplemented blocks of LLMs which are used to make the models functional and highly performant on Qualcomm Cloud AI 100. @@ -31,9 +30,9 @@ For other models, there is comprehensive documentation to inspire upon the chang 1. Reimplemented blocks from Transformers which enable efficient on-device retention of intermediate states. 2. Graph transformations to enable execution of key operations in lower precision 3. Graph transformations to replace some operations to other mathematically equivalent operations -4. Handling for underflows and overflows in lower precision +4. Handling for under-flows and overflows in lower precision 5. Patcher modules to map weights of original model's operations to updated model's operations -6. Exporter module to export the model source into a ONNX Graph. +6. Exporter module to export the model source into a `ONNX` Graph. 7. Sample example applications and demo notebooks 8. Unit test templates. @@ -42,223 +41,35 @@ For other models, there is comprehensive documentation to inspire upon the chang 2. If the PR modifies any common utilities, tests need to be included to execute tests of all models included in the library. -## Validated Models - -* [GPT2](https://huggingface.co/openai-community/gpt2) -* [Llama-3-8b](https://huggingface.co/meta-llama/Meta-Llama-3-8B) -* [Llama-3-70b](https://huggingface.co/meta-llama/Meta-Llama-3-70B) -* [Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) -* [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) -* [Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) -* [CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf) -* [CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) -* [CodeLlama-34b-hf](https://huggingface.co/codellama/CodeLlama-34b-hf) -* [Salesforce/codegen25-7b-mono_P](https://huggingface.co/Salesforce/codegen25-7b-mono_P) -* [Salesforce/xgen-7b-8k-base](https://huggingface.co/Salesforce/xgen-7b-8k-base) -* [MPT-7b](https://huggingface.co/mosaicml/mpt-7b) -* [Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) -* [Mixtral-8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) -* [Vicuna-v0](https://huggingface.co/lmsys/vicuna-13b-delta-v0) -* [Vicuna-v1.3](https://huggingface.co/lmsys/vicuna-13b-v1.3) -* [Vicuna-v1.5](https://huggingface.co/lmsys/vicuna-13b-v1.5) -* [Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) -* [StarCoder2-15B](https://huggingface.co/bigcode/starcoder2-15b) -* [Phi3-Mini-4K-Instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) -* [Codestral-22B-v0.1](https://huggingface.co/mistralai/Codestral-22B-v0.1) -* [Falcon-40b](https://huggingface.co/tiiuae/falcon-40b) -* [GPT-J-6B](https://huggingface.co/EleutherAI/gpt-j-6b) - -## Models Coming Soon - -* [Jais-13b](https://huggingface.co/core42/jais-13b) -* [Jais-30b](https://huggingface.co/core42/jais-30b-chat-v1) -* [Chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b) -* [Baichuan2-7B-Base](baichuan-inc/Baichuan2-7B-Base) - - 
-## Requirements -System Requirements: -1. [Supported Linux OS](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/#operating-systems) - Ubuntu, RHEL and AWS Linux -2. [Pre-requisites installed](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/Pre-requisites/pre-requisites/) -3. [Cloud AI 100 Platform and Apps SDK installed](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/Cloud-AI-SDK/Cloud-AI-SDK/) -4. [Multi-device support enabled for model sharding](https://github.com/quic/cloud-ai-sdk/tree/1.12/utils/multi-device) - -:bulb: Use bash terminal - -:memo: If using ZSH terminal then "device_group" should be in single quotes e.g. "--device_group '[0]'" - -## Installation -```bash -pip install -U pip -pip install git+https://github.com/quic/efficient-transformers -``` - -# Quick Start Guide - -QEfficient Library was designed with one goal: **to make onboarding of models inference straightforward for any Transformer architecture, while leveraging the complete power of Cloud AI platform** - -To achieve this, we have 2 levels of APIs, with different levels of abstraction. -1. High-level APIs abstract away complex details, offering a simpler interface. They're ideal for quick development and prototyping. If you're new to a technology or want to minimize coding effort, high-level APIs are more user-friendly. - -2. Low-level APIs offer more granular control, ideal for when customization is necessary. These are particularly useful for users who are trying their own models, not hosted on HF but are implemented based on Transformers. - -In summary: - -* Choose high-level APIs for quick development, simplicity, and ease of use. -* Opt for low-level APIs when you need fine-tuned control, optimization, or advanced customization. - - - -## Using High Level APIs -
- -| High Level APIs | Sample use | Arguments | -|-----------------|------------|-------------------| -| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • aic_enable_depth_first : Optional
  • mos : Optional [Default=-1]
  • batch_size : Optional [Default=1]
  • prompt_len : Optional [Default=32]
  • ctx_len : Optional [Default=128]
  • generation_len : Optional [Default=None]
  • mxfp6 : Optional
  • mxint8 : Optional
  • local_model_dir : Optional [Path to custom model weights and config file]
  • cache_dir : Optional [Path to the directory used for saving HuggingFace cache, Default is "efficient-transformers/cache_dir".]
  • hf_token : Optional
  • verbose : Optional
  • | -| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexecute) |
  • model_name : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • local_model_dir : Optional [Path to custom model weights and config file]
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • generation_len : Optional [Default=None]
  • cache_dir : Optional [Path to the directory used for saving HuggingFace cache, Default is "efficient-transformers/cache_dir".]
  • hf_token : Optional
  • | - -**One argument, prompt or prompts_txt_file_path must be passed.** - -### 1. Use QEfficient.cloud.infer - -This is the single e2e python api in the library, which takes model_card name as input along with other compile args if necessary and does everything in one go. - -* Torch Download → Optimize for Cloud AI 100 → Export to ONNX → Verify (CPU) → Compile on Cloud AI 100 → [Execute](#2-use-of-qefficientcloudexecute) -* It skips the ONNX export/compile stage if ONNX file or qpc found on path - - -```bash -# Check out the options using the help menu -python -m QEfficient.cloud.infer --help -python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first - -# If executing for batch size>1, - -# Either pass input prompts in single string, seperated with pipe (|) symbol". Example below - -python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth -theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first - -# Or pass path of txt file with input prompts, Example below, sample txt file(prompts.txt) is present in examples folder. - -python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first - ``` -### 2. Use of QEfficient.cloud.execute - -Once we have compiled the QPC, we can now use the precompiled QPC in execute API to run for different prompts, like below: - +## Quick Installation ```bash -python -m QEfficient.cloud.execute --model_name gpt2 --qpc_path qeff_models/gpt2/qpc_16cores_1BS_32PL_128CL_1devices_mxfp6/qpcs --prompt "Once upon a time in" --device_group [0] -``` - -We can also enable MQ, just based on the number of devices. Based on the "--device_group" as input it will create TS config on the fly. If "--device_group [0,1]" it will create TS config for 2 devices and use it for compilation, if "--device_group 0" then TS compilation is skipped and single soc execution is enabled. 
- -```bash -python -m QEfficient.cloud.infer --model_name Salesforce/codegen-2B-mono --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0,1] --prompt "def fibonacci(n):" --mos 1 --aic_enable_depth_first - -# Once qpc is saved, you can use the execute API to run for different prompts -python -m QEfficient.cloud.execute --model_name Salesforce/codegen-2B-mono --qpc-path qeff_models/Salesforce/codegen-2B-mono/qpc_16cores_1BS_32PL_128CL_2devices_mxfp6/qpcs --prompt "def binary_search(array: np.array, k: int):" --device_group [0,1] - -# To disable MQ, just pass single soc like below: -python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first -``` - - - - -| High Level APIs | Single SoC | Tensor Slicing | -|-----------------|------------|-------------------| -| QEfficient.cloud.infer | python -m QEfficient.cloud.infer --model_name $\color{green} {model}$ --batch_size 1 --prompt_len 128 --ctx_len 1024 --num_cores 16 --device_group [0] --prompt "My name is" --mxfp6 --hf_token $\color{green}{xyz}$ --mos 1 --aic_enable_depth_first | python -m QEfficient.cloud.infer --model_name $\color{green}{model}$ --batch_size 1 --prompt_len 128 --ctx_len 1024 --num_cores 16 --device_group [0,1,2,3] --prompt "My name is" --mxfp6 --hf_token $\color{green}{xyz}$ --mos 1 --aic_enable_depth_first | -| QEfficient.cloud.execute | python -m QEfficient.cloud.execute --model_name $\color{green}{model}$ --device_group [0] --qpc_path $\color{green}{path}$ --prompt "My name is" --hf_token $\color{green}{xyz}$ | python -m QEfficient.cloud.execute --model_name $\color{green}{model}$ --device_group [0,1,2,3] --qpc_path $\color{green}{path}$ --prompt "My name is" --hf_token $\color{green}{xyz}$ | - -:memo: Replace $\color{green}{model}$ , $\color{green}{path}$ and $\color{green}{xyz}$ with preferred model card name, qpc path and hf token respectively. - - - -## Using Low Level APIs - - -
    +# Create Python virtual env and activate it. (Required Python 3.8) -| Low Level APIs | Sample use | Arguments | -|-----------------|------------|-------------------| -| QEfficient.transform | [click here](#1-model-download-and-transform) |
  • model : $\color{green} {Mandatory}$
  • form_factor : Optional [Default="cloud"]
  • | -| QEfficient.export | [click here](#2-onnx-export-of-transformed-model) |
  • model_name : $\color{green} {Mandatory}$
  • model_kv : Optional
  • local_model_dir : Optional [Path to custom model weights and config file]
  • tokenizer : Optional
  • cache_dir : Optional [Path to the directory used for saving HuggingFace cache, Default is "efficient-transformers/cache_dir".]
  • onnx_dir_path : Optional
  • hf_token : Optional
  • seq_length : Optional [Default=128]
  • kv : Optional [Default=True]
  • form_factor : Optional [Default="cloud"]
  • -| QEfficient.compile | [click here](#3-compile-on-cloud-ai-100) |
  • onnx_path : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default=1]
  • prompt_len : Optional [Default=32]
  • ctx_len : Optional [Default=128]
  • aic_enable_depth_first : Optional [Default=False]
  • mos : Optional [Default=-1]
  • mxint8 : Optional [Default=False]
  • mxfp6 : Optional [Default=True]
  • custom_io_file_path : Optional [Default=None]
  • | -|QEfficient.cloud_ai_100_exec_kv | [click here](#4-run-benchmark) |
  • tokenizer : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • device_id : Optional [Default=[0]]
  • generation_len : Optional [Default=None]
  • enable_debug_logs : Optional [Default=False]
  • stream : Optional [Default=True]
  • write_io_dir : Optional
  • automation : Optional [Default=False]
  • | - -**In QEfficient.cloud_ai_100_exec_kv, at least one argument, prompt or prompts_txt_file_path must be passed.**
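As a quick illustration of the low-level execution flow, below is a minimal sketch of calling `QEfficient.cloud_ai_100_exec_kv` with the arguments listed in the table above. The tokenizer construction and the exact keyword usage are assumptions inferred from that table; check the real signature in `QEfficient.generation.text_generation_inference` before relying on it.

```Python
# Hypothetical usage sketch based on the argument table above; not a verified snippet.
from transformers import AutoTokenizer

import QEfficient

# Tokenizer for the same model card the QPC was compiled for (gpt2 is used as an example).
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# qpc_path points to a previously compiled binary, e.g. the one produced by the infer example above.
QEfficient.cloud_ai_100_exec_kv(
    tokenizer=tokenizer,
    qpc_path="qeff_models/gpt2/qpc_16cores_1BS_32PL_128CL_1devices_mxfp6/qpcs",
    prompt="My name is",  # or prompts_txt_file_path="examples/prompts.txt"
    device_id=[0],
    generation_len=32,
)
```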
    - -### 1. Model download and Optimize for Cloud AI 100 - -Initialize QEfficient and transform the models, Check the list of supported architectures in the repo. - -```Python -# Initiate the Orignal Transformer model -import os - -from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM - -# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir. -# os.environ["TRANSFORMERS_CACHE"] = "/local/mnt/workspace/hf_cache" - -# ROOT_DIR = os.path.dirname(os.path.abspath("")) -# CACHE_DIR = os.path.join(ROOT_DIR, "tmp") #, you can use a different location for just one model by passing this param as cache_dir in below API. - -# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl -model_name = "gpt2" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib. - -qeff_model = AutoModelForCausalLM.from_pretrained(model_name) -print(f"{model_name} optmized for AI 100 \n", qeff_model) -``` - -### 2. Export and Compile with one API - -use the qualcomm_efficient_converter API to export the KV transformed Model to ONNX and Verify on Torch. - -```Python -# We can now export the modified models to Onnx framework -# This will generate single Onnx Model for both Prefill and Decode Variations which are optimized for -# Cloud AI 100 Platform. - -# While generating the ONNX model, this will clip the overflow constants to fp16 -# Verify the model on Onnxruntime vs Pytorch - -# Then generate inputs and customio yaml file required for compilation. -# Compile the model for provided compilation arguments -# Please use platform SDk to Check num_cores for your card. - -generated_qpc_path = qeff_model.compile( - num_cores=14, - mxfp6=True, - device_group=[0], -) -``` +python3.8 -m venv qeff_env +source qeff_env/bin/activate +pip install -U pip -### 3. Run Benchmark +# Clone and Install the QEfficient Repo. +pip install git+https://github.com/quic/efficient-transformers -Benchmark the model on Cloud AI 100, run the infer API to print tokens and tok/sec +``` -```Python -# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100 -# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach +For more details about using ``QEfficient`` via Cloud AI 100 Apps SDK, visit [Linux Installation Guide](https://quic.github.io/efficient-transformers/source/Linux_installation.html) -qeff_model.generate(prompts=["My name is"]) -``` -End to End demo examples for various models are available in **notebooks** directory. Please check them out. -## Adding support for a new model -Watch this space for references to detailed steps, template examples and much more. 
+## Documentation -## Details on KV Cache Optimization for Cloud AI 100 -![alt text](docs/kv_cache_cloudai100.png) +* [Quick Start Guide](https://quic.github.io/efficient-transformers/source/quick_start.html#) +* [Python API](https://quic.github.io/efficient-transformers/source/high_level_api.html) +* [Validated Models](https://quic.github.io/efficient-transformers/source/Validate.html) +* [Models coming soon](https://quic.github.io/efficient-transformers/source/Validate.html#models-coming-soon) -Note: More details are here: https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Model-Architecture-Support/Large-Language-Models/llm/ +> Note: More details are here: https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Model-Architecture-Support/Large-Language-Models/llm/ ## Acknowledgements Thanks to: -* Huggingface transformers for work in LLM GenAI modeling implementation +* HuggingFace transformers for work in LLM GenAI modeling implementation * ONNX, Pytorch, ONNXruntime community. ## Support diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 000000000..9fe094f98 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,25 @@ +# Docs + +This directory contains the instructions for building static html documentations based on [sphinx](https://www.sphinx-doc.org/en/master/). + + +## Build the docs +Install the packages required for building documentation: + +```sh + pip install -r docs/requirements.txt +``` + +And then, change directory to docs folder to build the docs. + +```sh +cd docs/ +sphinx-build -M html . build +``` +## Preview the docs locally + +```bash +cd build/html +python -m http.server +``` +You can visit the page with your web browser with url `http://localhost:8080`. \ No newline at end of file diff --git a/docs/__init__.py b/docs/__init__.py deleted file mode 100644 index 91fee0a49..000000000 --- a/docs/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - diff --git a/docs/_static/my_theme.css b/docs/_static/my_theme.css new file mode 100644 index 000000000..00a18c905 --- /dev/null +++ b/docs/_static/my_theme.css @@ -0,0 +1,3 @@ +.wy-nav-content { + max-width: 1200px !important; +} \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 000000000..55881fb44 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,72 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +import os +import sys + +sys.path.insert(0, os.path.abspath("..")) + + +# -- Project information ----------------------------------------------------- + +project = "efficient-transformers" +copyright = "2024, Qualcomm" + +# The full version, including alpha/beta/rc tags +release = "main" + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ["myst_parser", "sphinx.ext.todo", "sphinx.ext.viewcode", "sphinx.ext.autodoc"] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" + + +def setup(app): + app.add_css_file("my_theme.css") + + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["_static"] +source = [".md"] +todo_include_todos = True + +suppress_warnings = [ + "ref.rst_pilog", # Suppress warnings about excluded toctree entries +] diff --git a/docs/Cloud_AI_100.png b/docs/image/Cloud_AI_100.png similarity index 100% rename from docs/Cloud_AI_100.png rename to docs/image/Cloud_AI_100.png diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 000000000..dd4b7ecce --- /dev/null +++ b/docs/index.md @@ -0,0 +1,69 @@ +% QEfficient documentation master file, created by +% sphinx-quickstart on Tue May 28 09:19:122024. +% You can adapt this file completely to your liking, but it should at least +% contain the root `toctree` directive. + +Welcome to Efficient-Transformers Documentation! 
+======================================== + + + + + +```{toctree} +:caption: 'Getting Started' +:maxdepth: 4 + +source/introduction +source/validate +``` + + +```{toctree} +:caption: 'Installation' +:maxdepth: 2 + +source/installation +``` + + +```{toctree} +:caption: 'Quick start' +:maxdepth: 4 + +source/quick_start +``` + +```{toctree} +:caption: 'Command Line Interface Use (CLI)' +:maxdepth: 2 +source/cli_api +``` + + +```{toctree} +:caption: 'Python API' +:maxdepth: 2 + +source/hl_api +source/ll_api + +``` + +```{toctree} +:caption: 'Blogs' +:maxdepth: 2 + +source/blogs + +``` + +```{toctree} +:caption: 'Reference' +:maxdepth: 2 + +source/reference + +``` diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 000000000..708f4974c --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,3 @@ +Sphinx==7.1.2 +sphinx-rtd-theme==2.0.0 +myst-parser==3.0.1 diff --git a/docs/source/blogs.md b/docs/source/blogs.md new file mode 100644 index 000000000..6e20dd7f2 --- /dev/null +++ b/docs/source/blogs.md @@ -0,0 +1,15 @@ +# Train anywhere, Infer on Qualcomm Cloud AI 100 + [Click here](https://www.qualcomm.com/developer/blog/2024/01/train-anywhere-infer-qualcomm-cloud-ai-100) + +# How to Quadruple LLM Decoding Performance with Speculative Decoding (SpD) and Microscaling (MX) Formats on Qualcomm® Cloud AI 100 + [Click here](https://statics.teams.cdn.office.net/evergreen-assets/safelinks/1/atp-safelinks.html) + +# Power-efficient acceleration for large language models – Qualcomm Cloud AI SDK + [Click here](https://www.qualcomm.com/developer/blog/2023/11/power-efficient-acceleration-large-language-models-qualcomm-cloud-ai-sdk) + +# Qualcomm Cloud AI 100 Accelerates Large Language Model Inference by ~2x Using Microscaling (Mx) Formats +[click here](https://www.qualcomm.com/developer/blog/2024/01/qualcomm-cloud-ai-100-accelerates-large-language-model-inference-2x-using-microscaling-mx) + +# Qualcomm Cloud AI Introduces Efficient Transformers: One API, Infinite Possibilities +[click here](https://www.qualcomm.com/developer/blog/2024/05/qualcomm-cloud-ai-introduces-efficient-transformers--one-api--in) + diff --git a/docs/source/cli_api.md b/docs/source/cli_api.md new file mode 100644 index 000000000..32f3247b3 --- /dev/null +++ b/docs/source/cli_api.md @@ -0,0 +1,26 @@ + +```{NOTE} +Use ``bash terminal``, else if using ``ZSH terminal`` then ``device_group``should be in single quotes e.g. ``'--device_group [0]'`` +``` + +(infer_api)= +# `QEfficient.cloud.infer` +```{eval-rst} +.. automodule:: QEfficient.cloud.infer.main +``` +# `QEfficient.cloud.execute` +```{eval-rst} +.. automodule:: QEfficient.cloud.execute.main +``` +# `QEfficient.cloud.compile` +```{eval-rst} + .. automodule:: QEfficient.compile.compile_helper.compile + .. code-block:: bash + + python -m QEfficient.cloud.compile OPTIONS +``` +# `QEfficient.cloud.export` +```{eval-rst} + .. automodule:: QEfficient.cloud.export.main + +``` \ No newline at end of file diff --git a/docs/source/hl_api.md b/docs/source/hl_api.md new file mode 100644 index 000000000..951309711 --- /dev/null +++ b/docs/source/hl_api.md @@ -0,0 +1,35 @@ +**This page give you an overview about the all the APIs that you might need to integrate the `QEfficient` into your python applications.** + +# High Level API + +## `QEFFAutoModelForCausalLM` +```{eval-rst} +.. 
automodule:: QEfficient.transformers.models.modeling_auto + :inherited-members: + :undoc-members: + :exclude-members: QEffAutoModel,QEFFTransformersBase, run_ort, run_pytorch, get_tokenizer, run_cloud_ai_100, execute +``` +## `export` +```{eval-rst} +.. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100 + :members: + :show-inheritance: + :exclude-members: convert_to_cloud_kvstyle, convert_to_cloud_bertstyle +``` +## `compile` +```{eval-rst} +.. automodule:: QEfficient.compile.compile_helper + :members: + :show-inheritance: +.. code-block:: bash + import QEfficient + base_path, onnx_model_path = QEfficient.export(model_name="gpt2") + qpc_path = QEfficient.compile(onnx_path=onnx_model_path, qpc_path=os.path.join(base_path, "qpc"), num_cores=14, device_group=[0]) +``` +## `Execute` +```{eval-rst} +.. automodule:: QEfficient.generation.text_generation_inference + :members: + :show-inheritance: + :exclude-members: latency_stats_bertstyle,cloud_ai_100_exec_kv_helper +``` \ No newline at end of file diff --git a/docs/source/image/Cloud_AI_100.png b/docs/source/image/Cloud_AI_100.png new file mode 100644 index 000000000..54ab44309 Binary files /dev/null and b/docs/source/image/Cloud_AI_100.png differ diff --git a/docs/kv_cache_cloudai100.png b/docs/source/image/kv_cache_cloudai100.png similarity index 100% rename from docs/kv_cache_cloudai100.png rename to docs/source/image/kv_cache_cloudai100.png diff --git a/docs/source/installation.md b/docs/source/installation.md new file mode 100644 index 000000000..1a08928f4 --- /dev/null +++ b/docs/source/installation.md @@ -0,0 +1,44 @@ +# Pre-requisites +System Requirements: +1. [Supported Linux OS](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/#operating-systems) - Ubuntu, RHEL and AWS Linux +2. [Cloud AI 100 Platform and Apps SDK installed](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/Cloud-AI-SDK/Cloud-AI-SDK/) +3. [SDK Pre-requisites](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/Pre-requisites/pre-requisites/) +4. [Multi-device support enabled for model sharding](https://github.com/quic/cloud-ai-sdk/tree/1.12/utils/multi-device) + +# Linux Installation +There are two different way to install efficient-transformers. + +## Using SDK + +* Download Apps SDK: [Cloud AI 100 Platform and Apps SDK install](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/Cloud-AI-SDK/Cloud-AI-SDK/) + + +```bash +# Install using Apps SDK + +bash install.sh --enable-qeff +source /opt/qti-aic/dev/python/qeff/bin/activate + +``` +## Using GitHub Repository + +```bash + +# Create Python virtual env and activate it. (Required Python 3.8) + +python3.8 -m venv qeff_env +source qeff_env/bin/activate +pip install -U pip + +# Clone and Install the QEfficient Repo. +pip install git+https://github.com/quic/efficient-transformers + +``` + +# Sanity Check + +After any of the above installation methods, you can check if ``QEfficient`` is installed correctly by using +```bash +python -c "import QEfficient; print(QEfficient.__version__)" +``` +If the above line executes successfully, you are good to go ahead and start deploying models on ``Cloud AI 100`` cards using ``QEfficient`` library. 
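Beyond the import check above, you can optionally confirm that a Cloud AI 100 device is actually visible to the runtime. Below is a small sketch that uses the `get_available_device_id` helper documented earlier in this patch (`QEfficient/utils/device_utils.py`); treat the import path and return handling as an illustration under that assumption rather than a supported entry point.

```Python
# Optional device visibility check; assumes the Cloud AI 100 Platform and Apps SDK are installed.
from QEfficient.utils.device_utils import get_available_device_id

device_id = get_available_device_id()
print("Available Cloud AI 100 device id:", device_id)
```

If no device id is reported, revisit the SDK pre-requisites linked above before moving on to compilation.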
diff --git a/docs/source/introduction.md b/docs/source/introduction.md new file mode 100644 index 000000000..f6a214106 --- /dev/null +++ b/docs/source/introduction.md @@ -0,0 +1,33 @@ +![alt text](image/Cloud_AI_100.png) + + +# Introduction Qualcomm ``efficient-transformers`` library + +**Train anywhere, Infer on Qualcomm Cloud AI with a Developer-centric Toolchain** + +This library provides reimplemented blocks of LLMs which are used to make the models functional and highly performant on Qualcomm Cloud AI 100. +We support wide range of [models](validated_models) architectures, for easy efficient deployment on Cloud AI 100 cards. Users only need to provide model card from HuggingFace or Path to the local model and the library will take care of transforming model to it's efficient implementation for Cloud AI 100. + +For other models, there is comprehensive documentation to inspire upon the changes needed and How-To(s). + +**Typically for LLMs, the library provides:** +1. Reimplemented blocks from Transformers which enable efficient on-device retention of intermediate states. read more [here](kv_cache) +2. Graph transformations to enable execution of key operations in lower precision +3. Graph transformations to replace some operations to other mathematically equivalent operations that are efficient/supported on HW backend +4. Handling for underflow and overflows in lower precision +5. Patcher modules to map weights of original model's operations to updated model's operations +6. Exporter module to export the model source into a `ONNX` Graph. +7. Sample example applications and demo notebooks +8. Unit test templates. + +***Latest news*** :
    + +- [coming soon] Support for more popular [models](coming_soon_models) and inference optimization techniques like continuous batching and speculative decoding
    +* [06/2024] Added support for [GPT-J-6B](https://huggingface.co/EleutherAI/gpt-j-6b) +- [06/2024] Added support for [Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) +- [06/2024] Added support for [StarCoder2-15B](https://huggingface.co/bigcode/starcoder2-15b) +- [06/2024] Added support for [Phi3-Mini-4K-Instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) +- [06/2024] Added support for [Codestral-22B-v0.1](https://huggingface.co/mistralai/Codestral-22B-v0.1) +- [06/2024] Added support for [Vicuna-v1.5](https://huggingface.co/lmsys/vicuna-13b-v1.5) +- [05/2024] Added support for [Mixtral-8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) & [Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1). +- [04/2024] Initial release of [efficient transformers](https://github.com/quic/efficient-transformers) for seamless inference on pre-trained LLMs. \ No newline at end of file diff --git a/docs/source/ll_api.md b/docs/source/ll_api.md new file mode 100644 index 000000000..8cdb974bc --- /dev/null +++ b/docs/source/ll_api.md @@ -0,0 +1,38 @@ +# Low Level API + +## `convert_to_cloud_kvstyle` +```{eval-rst} +.. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100 + :members: + :show-inheritance: + :exclude-members: qualcomm_efficient_converter, convert_to_cloud_bertstyle +``` +## `convert_to_cloud_bertstyle` +```{eval-rst} +.. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100 + :members: + :show-inheritance: + :exclude-members: qualcomm_efficient_converter, convert_to_cloud_kvstyle +``` + +## `utils` + +```{eval-rst} +.. automodule:: QEfficient.utils.device_utils + :members: + :show-inheritance: +``` + +```{eval-rst} +.. automodule:: QEfficient.utils.generate_inputs + :members: + :undoc-members: + :show-inheritance: +``` + +```{eval-rst} +.. automodule:: QEfficient.utils.run_utils + :members: + :undoc-members: + :show-inheritance: +``` \ No newline at end of file diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md new file mode 100644 index 000000000..cb410c23d --- /dev/null +++ b/docs/source/quick_start.md @@ -0,0 +1,128 @@ + +QEfficient Library was designed with one goal: + +**To make onboarding of models inference straightforward for any Transformer architecture, while leveraging the complete power of Cloud AI platform** + +To achieve this, we have 2 levels of APIs, with different levels of abstraction. +1. Command line interface abstracts away complex details, offering a simpler interface. They're ideal for quick development and prototyping. If you're new to a technology or want to minimize coding effort. + +2. Python high level APIs offer more granular control, ideal for when customization is necessary. + +## Command Line Interface + +```{NOTE} +Use ``bash terminal``, else if using ``ZSH terminal`` then ``device_group``should be in single quotes e.g. ``'--device_group [0]'`` +``` + +### QEfficient.cloud.infer + +This is the single e2e CLI API, which takes `model_card` name as input along with other compilation arguments. Check [Infer API doc](infer_api) for more details. + +* HuggingFace model files Download → Optimize for Cloud AI 100 → Export to `ONNX` → Compile on Cloud AI 100 → [Execute](#qefficientcloudexecute) +* It skips the export/compile stage based if `ONNX` or `qpc` files are found. If you use infer second time with different compilation arguments, it will automatically skip `ONNX` model creation and directly jump to compile stage. 
+ + +```bash +# Check out the options using the help +python -m QEfficient.cloud.infer --help +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first +``` +If executing for batch size>1, +You can pass input prompts in single string but separate with pipe (|) symbol". Example below + +```bash +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth +theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first +``` + +You can also pass path of txt file with input prompts when you want to run inference on lot of prompts, Example below, sample txt file(prompts.txt) is present in examples folder. + +```bash +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first +``` + +### QEfficient.cloud.execute +You can first run `infer` API and then use `execute` to run the pre-compiled model on Cloud AI 100 cards. +Once we have compiled the QPC, we can now use the precompiled QPC in execute API to run for different prompts. Make sure to pass same `--device_group` as used during infer. Refer [Execute API doc](execute_api) for more details. + +```bash +python -m QEfficient.cloud.execute --model_name gpt2 --qpc_path qeff_models/gpt2/qpc_16cores_1BS_32PL_128CL_1devices_mxfp6/qpcs --prompt "Once upon a time in" --device_group [0] +``` + +### Multi-Qranium Inference +You can also enable MQ, just based on the number of devices. Based on the `--device-group` as input it will create TS config on the fly. If `--device-group [0,1]` it will create TS config for 2 devices and use it for compilation, if `--device-group [0]` then TS compilation is skipped and single soc execution is enabled. + +```bash +python -m QEfficient.cloud.infer --model_name Salesforce/codegen-2B-mono --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device-group [0,1] --prompt "def fibonacci(n):" --mos 2 --aic_enable_depth_first +``` +Above step will save the `qpc` files under `efficient-transformers/qeff_models/{model_card_name}`, you can use the execute API to run for different prompts. This will automatically pick the pre-compiled `qpc` files. + +```bash +python -m QEfficient.cloud.execute --model_name Salesforce/codegen-2B-mono --qpc-path qeff_models/Salesforce/codegen-2B-mono/qpc_16cores_1BS_32PL_128CL_2devices_mxfp6/qpcs --prompt "def binary_search(array: np.array, k: int):" --device-group [0,1] +``` + +To disable MQ, just pass single soc like below, below step will compile the model again and reuse the `ONNX` file as only compilation argument are different from above commands. + +```bash +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device-group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first +``` + +## Python API + +### 1. Model download and Optimize for Cloud AI 100 +If your models falls into the model architectures that are [already supported](validated_models), Below steps should work fine. +Please raise an [issue](https://github.com/quic/efficient-transformers/issues), in case of trouble. 
+ +```Python +# Initiate the Original Transformer model +# import os + +from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM + +# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir. +# os.environ["TRANSFORMERS_CACHE"] = "/local/mnt/workspace/hf_cache" + +# ROOT_DIR = os.path.dirname(os.path.abspath("")) +# CACHE_DIR = os.path.join(ROOT_DIR, "tmp") #, you can use a different location for just one model by passing this param as cache_dir in below API. + +# Model-Card name (This is HF Model Card name) : https://huggingface.co/gpt2-xl +model_name = "gpt2" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib. + +qeff_model = AutoModelForCausalLM.from_pretrained(model_name) +print(f"{model_name} optimized for AI 100 \n", qeff_model) +``` + +### 2. Export and Compile with one API + +Use the qualcomm_efficient_converter API to export the KV transformed Model to ONNX and Verify on Torch. + +```Python +# We can now export the modified models to ONNX framework +# This will generate single ONNX Model for both Prefill and Decode Variations which are optimized for +# Cloud AI 100 Platform. + +# While generating the ONNX model, this will clip the overflow constants to fp16 +# Verify the model on ONNXRuntime vs Pytorch + +# Then generate inputs and customio yaml file required for compilation. +# Compile the model for provided compilation arguments +# Please use platform SDk to Check num_cores for your card. + +generated_qpc_path = qeff_model.compile( + num_cores=14, + mxfp6=True, + device_group=[0], +) +``` + +### 3. Execute + +Benchmark the model on Cloud AI 100, run the infer API to print tokens and tok/sec + +```Python +# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100 +# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach + +qeff_model.generate(prompts=["My name is"]) +``` +End to End demo examples for various models are available in **notebooks** directory. Please check them out. 
diff --git a/docs/source/reference.md b/docs/source/reference.md
new file mode 100644
index 000000000..37d1ddadc
--- /dev/null
+++ b/docs/source/reference.md
@@ -0,0 +1,6 @@
+**References**
+1. [Qualcomm Cloud AI home](https://www.qualcomm.com/products/technology/processors/cloud-artificial-intelligence)
+2. [Qualcomm Cloud AI SDK download](https://www.qualcomm.com/products/technology/processors/cloud-artificial-intelligence/cloud-ai-100#Software)
+3. [Qualcomm Cloud AI API reference](https://quic.github.io/cloud-ai-sdk-pages/latest/API/)
+4. [User Guide](https://quic.github.io/cloud-ai-sdk-pages/)
+5. [OCP Microscaling Formats (MX) Specification](https://www.qualcomm.com/developer/blog/2024/05/6.%09https:/www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf)
\ No newline at end of file
diff --git a/docs/source/validate.md b/docs/source/validate.md
new file mode 100644
index 000000000..81d1fb9b8
--- /dev/null
+++ b/docs/source/validate.md
@@ -0,0 +1,34 @@
+(validated_models)=
+# Validated Models
+
+* [GPT2](https://huggingface.co/openai-community/gpt2)
+* [Llama-3-8b](https://huggingface.co/meta-llama/Meta-Llama-3-8B)
+* [Llama-3-70b](https://huggingface.co/meta-llama/Meta-Llama-3-70B)
+* [Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf)
+* [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
+* [Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)
+* [CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf)
+* [CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf)
+* [CodeLlama-34b-hf](https://huggingface.co/codellama/CodeLlama-34b-hf)
+* [Salesforce/codegen25-7b-mono_P](https://huggingface.co/Salesforce/codegen25-7b-mono_P)
+* [Salesforce/xgen-7b-8k-base](https://huggingface.co/Salesforce/xgen-7b-8k-base)
+* [MPT-7b](https://huggingface.co/mosaicml/mpt-7b)
+* [Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)
+* [Mixtral-8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)
+* [Vicuna-v0](https://huggingface.co/lmsys/vicuna-13b-delta-v0)
+* [Vicuna-v1.3](https://huggingface.co/lmsys/vicuna-13b-v1.3)
+* [Vicuna-v1.5](https://huggingface.co/lmsys/vicuna-13b-v1.5)
+* [Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct)
+* [StarCoder2-15B](https://huggingface.co/bigcode/starcoder2-15b)
+* [Phi3-Mini-4K-Instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)
+* [Codestral-22B-v0.1](https://huggingface.co/mistralai/Codestral-22B-v0.1)
+* [Falcon-40b](https://huggingface.co/tiiuae/falcon-40b)
+* [GPT-J-6B](https://huggingface.co/EleutherAI/gpt-j-6b)
+
+(coming_soon_models)=
+# Models Coming Soon
+
+* [Jais-13b](https://huggingface.co/core42/jais-13b)
+* [Jais-30b](https://huggingface.co/core42/jais-30b-chat-v1)
+* [Chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b)
+* [Baichuan2-7B-Base](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base)
\ No newline at end of file
diff --git a/docs/validation.md b/docs/validation.md
deleted file mode 100644
index 7de5b0865..000000000
--- a/docs/validation.md
+++ /dev/null
@@ -1,29 +0,0 @@
-## Validation of Model before and After KV Cache Optimization
-
-# Run APIs
-
-| API | params | Usage |
-|------------------|:--------:|:--------------------:|
-| run_hf_model_on_pytorch | 1. model_hf | Runs HuggingFace model on Pytorch and returns output tokens |
-| run_kv_model_on_pytorch | 1. model_kv <br> 2. n_layer <br> 3. padding_shape | Runs KV cache model on PyTorch and returns output tokens |
-| run_kv_model_on_ort | 1. model_path <br> 2. n_layer <br> 3. padding_shape | Runs KV cache model on onnxruntime and returns output tokens |
-| run_kv_model_on_cloud_ai_100 | 1. qpc_path <br> 2. device_id <br> 3. n_layer <br> 4. padding_shape | To be run only when device is available <br> Runs compiled qpc on Cloud AI 100 and returns output tokens |
-
-Class InputHandler is called inside these APIs which generates inputs for prefill and decode stage.
-
-These APIs, input_generation, checking available device id and Constants scripts are present inside tests/utils folder.
-
-# Sample Usage :
-
-These APIs are part of ApiRunner class :
-
-    n_layer : number of layers in the model
-    padding_shape : shape of past key values to initialize the first iteration key value inputs with zeros.
-
-    run_api = ApiRunner(tokenizer, Constants.INPUT_STRING, Constants.PROMPT_LEN, Constants.CTX_LEN)
-    pytorch_hf_tokens = run_api.run_hf_model_on_pytorch(model_hf)
-    pytorch_kv_tokens = run_api.run_kv_model_on_pytorch(model_kv, n_layer, padding_shape)
-    ort_tokens = run_api.run_kv_model_on_ort(onnx_model_path, n_layer, padding_shape)
-    cloud_ai_100_tokens = run_api.run_kv_model_on_cloud_ai_100(session, n_layer, padding_shape)
-
-Output tokens are compared to validate the model on the required framework level (PyTorch/ONNX/cloud_ai_100).
diff --git a/examples/__init__.py b/examples/__init__.py
index 91fee0a49..da26921c5 100644
--- a/examples/__init__.py
+++ b/examples/__init__.py
@@ -1,6 +1,6 @@
 # -----------------------------------------------------------------------------
 #
-# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
diff --git a/notebooks/__init__.py b/notebooks/__init__.py
index 91fee0a49..da26921c5 100644
--- a/notebooks/__init__.py
+++ b/notebooks/__init__.py
@@ -1,6 +1,6 @@
 # -----------------------------------------------------------------------------
 #
-# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
diff --git a/pyproject.toml b/pyproject.toml
index ed6619d48..a68318ec9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ dependencies = [
 
 [project.optional-dependencies]
 test = ["pytest","pytest-mock"]
+docs = ["Sphinx==7.1.2","sphinx-rtd-theme==2.0.0","myst-parser==3.0.1"]
 quality = ["black", "ruff", "hf_doc_builder@git+https://github.com/huggingface/doc-builder.git"]
 
 [build-system]
diff --git a/scripts/__init__.py b/scripts/__init__.py
index 91fee0a49..da26921c5 100644
--- a/scripts/__init__.py
+++ b/scripts/__init__.py
@@ -1,6 +1,6 @@
 # -----------------------------------------------------------------------------
 #
-# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
diff --git a/tests/__init__.py b/tests/__init__.py
index 91fee0a49..da26921c5 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,6 +1,6 @@
 # -----------------------------------------------------------------------------
 #
-# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------