Adding sphinx documentation (#21)
* Rebase with main

Signed-off-by: amitraj <[email protected]>

* Updated with the latest changes
Signed-off-by: Amit Raj <[email protected]>

* Rebase with main

Signed-off-by: Amit Raj <[email protected]>

* added QEFFAutoModelForCausalLM doc

Signed-off-by: Onkar Chougule <[email protected]>

* updated instructions to launch doc locally

Signed-off-by: Onkar Chougule <[email protected]>

* grammatical check and reformatting

Signed-off-by: Amit Raj <[email protected]>

* Addressed all the comments

Signed-off-by: Amit Raj <[email protected]>

* Basic changes

Signed-off-by: Amit Raj <[email protected]>

* Addressed all the comments

Signed-off-by: Amit Raj <[email protected]>

* Spell check and minor fixes, beautification

Signed-off-by: Onkar Chougule <[email protected]>

* ran formatter

Signed-off-by: Onkar Chougule <[email protected]>

---------

Signed-off-by: amitraj <[email protected]>
Signed-off-by: Amit Raj <[email protected]>
Signed-off-by: Onkar Chougule <[email protected]>
Co-authored-by: Onkar Chougule <[email protected]>
quic-amitraj and ochougul authored Aug 16, 2024
1 parent 7818a94 commit 764b033
Showing 67 changed files with 1,105 additions and 485 deletions.
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,4 +1,4 @@
Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved.
Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted (subject to the limitations in the
2 changes: 1 addition & 1 deletion QEfficient/__init__.py
@@ -1,6 +1,6 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved.
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
2 changes: 1 addition & 1 deletion QEfficient/base/__init__.py
@@ -1,6 +1,6 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved.
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
2 changes: 1 addition & 1 deletion QEfficient/cloud/compile.py
@@ -55,7 +55,7 @@
"--device-group",
required=True,
type=lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")],
help="Cloud AI 100 device ids (comma-separated) e.g. [0] ",
help="Cloud AI 100 device ids (comma-separated) e.g. [0,1] ",
)
parser.add_argument(
"--aic_enable_depth_first",
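For reference, the ``--device-group`` value is parsed by the inline lambda shown in the diff above; a minimal standalone sketch of that behavior (the function name here is ours, for illustration only):

# Reproduces the --device-group parser from the diff above.
parse_device_group = lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")]

print(parse_device_group("[0,1]"))  # -> [0, 1]
print(parse_device_group("[0]"))    # -> [0]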
27 changes: 16 additions & 11 deletions QEfficient/cloud/execute.py
@@ -25,18 +25,23 @@ def main(
hf_token: Optional[str] = None,
) -> None:
"""
Helper function used by execute CLI app to run the Model on Cloud AI 100 Platform.
---------
Helper function used by execute CLI app to run the Model on ``Cloud AI 100`` Platform.
:model_name: str. Hugging Face Model Card name, Example: "gpt2"
:qpc_path: str. Path to the generated binary after compilation.
:device_group: List[int]. Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled.
:local_model_dir: str. Path to custom model weights and config files.
:prompt: str. Sample prompt for the model text generation
:prompts_txt_file_path: str. Path to txt file for multiple input prompts
:generation_len: int. Number of tokens to be generated.
:cache_dir: str. Cache dir where downloaded HuggingFace files are stored.
:hf_token: str. HuggingFace login token to access private repos.
``Mandatory`` Args:
:model_name (str): Hugging Face Model Card name, Example: ``gpt2``.
:qpc_path (str): Path to the generated binary after compilation.
:device_group (List[int]): Device Ids to be used for compilation. If ``len(device_group) > 1``, multiple card setup is enabled.
``Optional`` Args:
:local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.``
:prompt (str): Sample prompt for the model text generation. ``Defaults to None.``
:prompts_txt_file_path (str): Path to txt file for multiple input prompts. ``Defaults to None.``
:generation_len (int): Number of tokens to be generated. ``Defaults to None.``
:cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to Constants.CACHE_DIR.``
:hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.``
.. code-block:: bash
python -m QEfficient.cloud.execute OPTIONS
"""
tokenizer = load_hf_tokenizer(
pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
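For readers following along, a hedged sketch of invoking this helper directly from Python instead of the CLI entry point (the qpc path below is hypothetical; argument names follow the docstring above):

# Assumes a Cloud AI 100 host with QEfficient installed and an already-compiled qpc package.
from QEfficient.cloud.execute import main as execute_main

execute_main(
    model_name="gpt2",
    qpc_path="qeff_models/gpt2/qpcs",  # hypothetical path to compiled binaries
    device_group=[0],                  # single-device setup
    prompt="My name is",               # optional sample prompt
)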
30 changes: 19 additions & 11 deletions QEfficient/cloud/export.py
@@ -28,13 +28,14 @@ def get_onnx_model_path(
):
"""
Exports the model to ONNX if a pre-exported file is not found and returns the ``onnx_model_path``.
---------
:model_name: str. Hugging Face Model Card name, Example: "gpt2"
:cache_dir: str. Cache dir where downloaded HuggingFace files are stored.
:tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]. Pass model tokenizer.
:hf_token: str. HuggingFace login token to access private repos.
:local_model_dir: str. Path to custom model weights and config files.
``Mandatory`` Args:
:model_name (str): Hugging Face Model Card name, Example: ``gpt2``.
``Optional`` Args:
:cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.``
:tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Pass model tokenizer. ``Defaults to None.``
:hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.``
:local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.``
"""
onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name)
if onnx_path_exists:
@@ -67,12 +68,19 @@ def main(
) -> None:
"""
Helper function used by export CLI app for exporting to ONNX Model.
---------
:model_name: str. Hugging Face Model Card name, Example: gpt2
:cache_dir: str. Cache dir to store the downloaded HuggingFace files.
:hf_token: str. HuggingFace login token to access private repos.
:local_model_dir: str. Path to custom model weights and config files.
``Mandatory`` Args:
:model_name (str): Hugging Face Model Card name, Example: ``gpt2``.
``Optional`` Args:
:cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.``
:hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.``
:local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.``
.. code-block:: bash
python -m QEfficient.cloud.export OPTIONS
"""
cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir)
get_onnx_model_path(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token, local_model_dir=local_model_dir)
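A short sketch of the same flow from Python, using ``get_onnx_model_path`` as documented above (the model card name is just an example):

from QEfficient.cloud.export import get_onnx_model_path

# Exports gpt2 to ONNX if no pre-exported file is found, then returns its path.
onnx_model_path = get_onnx_model_path(model_name="gpt2")
print(onnx_model_path)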
44 changes: 22 additions & 22 deletions QEfficient/cloud/infer.py
@@ -36,29 +36,29 @@ def main(
hf_token: Optional[str] = None,
) -> None:
"""
Helper function used by the infer CLI app to export, compile, and execute the model on the Cloud AI 100 Platform.
1. Check if a compiled qpc for the given config already exists; if it does, jump to execute, else
2. Check if an exported ONNX file already exists; if true, jump to compilation -> execution, else
3. Check if the HF model exists in cache; if true, start transform -> export -> compilation -> execution, else
4. Download the HF model -> transform -> export -> compile -> execute
---------
``Mandatory`` Args:
:model_name (str): Hugging Face Model Card name, Example: ``gpt2``
:num_cores (int): Number of cores to compile model on.
:device_group (List[int]): Device Ids to be used for compilation. If ``len(device_group) > 1``, multiple card setup is enabled.
``Optional`` Args:
:prompt (str): Sample prompt for the model text generation. ``Defaults to None.``
:prompts_txt_file_path (str): Path to txt file for multiple input prompts. ``Defaults to None.``
:aic_enable_depth_first (bool): Enables ``DFS`` with default memory size. ``Defaults to False.``
:mos (int): Effort level to reduce the on-chip memory. ``Defaults to -1.``
:batch_size (int): Batch size to compile the model for. ``Defaults to 1.``
:prompt_len (int): Prompt length for the model to compile. ``Defaults to 32.``
:ctx_len (int): Maximum context length to compile the model. ``Defaults to 128.``
:generation_len (int): Number of tokens to be generated. ``Defaults to None.``
:mxfp6 (bool): Enable compilation for MXFP6 precision. ``Defaults to False.``
:mxint8 (bool): Compress Present/Past KV to ``MXINT8`` using ``CustomIO`` config. ``Defaults to False.``
:local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.``
:cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.``
:hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.``
.. code-block:: bash
python -m QEfficient.cloud.infer OPTIONS
:model_name: str. Hugging Face Model Card name, Example: "gpt2"
:num_cores: int. :num_cores: int. Number of cores to compile model on.
:device_group: List[int]. Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled.
:prompt: str. Sample prompt for the model text generation
:prompts_txt_file_path: str. Path to txt file for multiple input prompts
:aic_enable_depth_first: bool. Enables DFS with default memory size, disabled by default.
:mos: int. Effort level to reduce the on-chip memory.
:batch_size: int. Batch size to compile the model for.
:prompt_len: int. prompt length for the model to compile.
:ctx_len: int. Maximum context length to compile the model.
:generation_len: int. Number of tokens to be generated.
:mxfp6: bool. Enable compilation for MXFP6 precision
:mxint8: Compress Present/Past KV to MXINT8 using CustomIO config, default is False.
:local_model_dir: str. Path to custom model weights and config files.
:cache_dir: str. Cache dir where downloaded HuggingFace files are stored.
:hf_token: str. HuggingFace login token to access private repos.
"""
cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir)
tokenizer = load_hf_tokenizer(
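A hedged sketch of driving the full infer flow from Python (the core count and prompt are illustrative assumptions; see the docstring above for defaults):

from QEfficient.cloud.infer import main as infer_main

infer_main(
    model_name="gpt2",
    num_cores=14,          # assumption: pick per your Cloud AI 100 device
    device_group=[0],
    prompt="My name is",
    mxfp6=True,            # optional MXFP6 precision, as documented above
)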
37 changes: 22 additions & 15 deletions QEfficient/compile/compile_helper.py
@@ -113,21 +113,28 @@ def compile(
**kwargs,
) -> str:
"""
Helper function used by compile CLI app for compiling the Onnx Model on Cloud AI 100 Platform with given config.
---------
:onnx_path: str. Generated Onnx Model Path.
:qpc_path: str. Path for saving compiled qpc binaries.
:num_cores: int. Number of cores to compile model on.
:device_group: List[int]. Used for finding number of devices to compile for.
:aic_enable_depth_first: bool. Enables DFS with default memory size, disabled by default.
:mos: int. Effort level to reduce the on-chip memory.
:batch_size: int. Batch size to compile the model for.
:prompt_len: int. prompt len for the model to compile.
:ctx_len: int. Maximum context length to compile the model.
:mxfp6: bool. Enable compilation for MXFP6 precision
:mxint8: Compress Present/Past KV to MXINT8 using CustomIO config, default is False.
:custom_io_file_path: str. Path to custom IO file.
Compiles the given ``ONNX`` model using Cloud AI 100 platform SDK compiler and saves the compiled ``qpc`` package at ``qpc_path``.
Generates tensor-slicing configuration if multiple devices are passed in ``device_group``.
This function will be deprecated soon and will be replaced by ``QEFFAutoModelForCausalLM.compile``.
``Mandatory`` Args:
:onnx_path (str): Generated ``ONNX`` Model Path.
:qpc_path (str): Path for saving compiled qpc binaries.
:num_cores (int): Number of cores to compile the model on.
:device_group (List[int]): Used for finding the number of devices to compile for.
``Optional`` Args:
:aic_enable_depth_first (bool): Enables ``DFS`` with default memory size. ``Defaults to False.``
:mos (int): Effort level to reduce the on-chip memory. ``Defaults to -1.``
:batch_size (int): Batch size to compile the model for. ``Defaults to 1.``
:prompt_len (int): Prompt length for the model to compile. ``Defaults to 32.``
:ctx_len (int): Maximum context length to compile the model. ``Defaults to 128.``
:mxfp6 (bool): Enable compilation for ``MXFP6`` precision. ``Defaults to True.``
:mxint8 (bool): Compress Present/Past KV to ``MXINT8`` using ``CustomIO`` config. ``Defaults to False.``
:custom_io_file_path (str): Path to ``customIO`` file (formatted as a string). ``Defaults to None.``
Returns:
:str: Path to compiled ``qpc`` package.
"""
os.makedirs(qpc_path, exist_ok=True)
specialization_json_path = os.path.join(qpc_path, "specializations.json")
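A minimal sketch of calling this soon-to-be-deprecated helper directly (paths are hypothetical; only the four mandatory args from the docstring are shown, with the rest left at their defaults):

from QEfficient.compile.compile_helper import compile as qeff_compile

qpc_dir = qeff_compile(
    onnx_path="qeff_models/gpt2/onnx/model.onnx",  # hypothetical exported model
    qpc_path="qeff_models/gpt2/qpcs",              # destination for compiled binaries
    num_cores=14,                                  # assumption: device-dependent
    device_group=[0],
)
print(qpc_dir)  # path to the compiled qpc package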
2 changes: 1 addition & 1 deletion QEfficient/customop/__init__.py
@@ -1,6 +1,6 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved.
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
2 changes: 1 addition & 1 deletion QEfficient/customop/rms_norm.py
@@ -1,6 +1,6 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved.
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
2 changes: 1 addition & 1 deletion QEfficient/exporter/__init__.py
@@ -1,6 +1,6 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved.
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
90 changes: 55 additions & 35 deletions QEfficient/exporter/export_hf_to_cloud_ai_100.py
@@ -32,18 +32,21 @@ def convert_to_cloud_bertstyle(
seq_len: int,
) -> str:
"""
Function to convert the model to Bertstyle approach.
API to convert model to Bertstyle approach.
Bertstyle Approach:
1. No Prefill/Decode separably compiled
2. No KV retention logic.
3. KV is every time computed for all the tokens until EOS/max_length
Args:
model_name (str): The name of the model to be used.
qeff_model (QEFFBaseModel): Transformed KV torch model to be used
tokenizer (HF AutoTokenizer): Tokenizer to prepare inputs.
onnx_dir_path (str, optional): The path where the model is stored. If None, the model is loaded from the default location.
seq_len (int, optional): The length of the sequence. Default is 128.
1. No Prefill/Decode separately compiled.
2. No KV retention logic.
3. KV is computed every time for all the tokens until EOS/max_length.
``Mandatory`` Args:
:model_name (str): Hugging Face Model Card name, Example: `gpt2`.
:qeff_model (QEFFAutoModelForCausalLM): Transformed KV torch model to be used.
:tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Model tokenizer.
:onnx_dir_path (str): Path to save exported ONNX file.
:seq_len (int): The length of the sequence.
Returns:
:str: Path of exported ``ONNX`` file.
"""
if os.path.exists(onnx_dir_path):
logger.warning(f"Overriding {onnx_dir_path}")
@@ -147,19 +150,22 @@ def convert_to_cloud_kvstyle(
seq_len: int,
) -> str:
"""
Function Modeling changes for kv retention and export to Onnx.
KV Style Approach:
1. This architecture is particularly suitable for autoregressive tasks
2. where sequence generation involves processing one token at a time
API to convert a model with KV retention and export it to ONNX.
KV Style Approach-
1. This architecture is particularly suitable for auto-regressive tasks.
2. Sequence generation involves processing one token at a time.
3. Contextual information from earlier tokens is crucial for predicting the next token.
4. The inclusion of a KV cache enhances the efficiency of the decoding process, making it more computationally efficient.
Args:
model_name (str): The name of the model to be used.
qeff_model (QEFFBaseModel): Transformed KV torch model to be used
tokenizer (HF AutoTokenizer): Tokenzier to prepare inputs.
onnx_dir_path (str, optional): The path where the model is stored. If None, the model is loaded from the default location.
seq_len (int, optional): The length of the sequence. Default is 128.
``Mandatory`` Args:
:model_name (str): Hugging Face Model Card name, Example: `gpt2`.
:qeff_model (QEFFAutoModelForCausalLM): Transformed KV torch model to be used.
:tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Model tokenizer.
:onnx_dir_path (str): Path to save exported ONNX file.
:seq_len (int): The length of the sequence.
Returns:
:str: Path of exported ``ONNX`` file.
"""
warnings.warn(
"\033[93mThis function will be deprecated soon, use QEfficient.export instead\033[0m",
@@ -369,29 +375,43 @@ def qualcomm_efficient_converter(
form_factor: str = "cloud",
) -> Tuple[str, str]:
"""
Function to convert the input string using the specified model and returns the result.
Args:
model_name (str): The name of the model to be used.
model_kv (torch.nn.Module): Transformed KV torch model to be used
local_model_dir(str): Path to custom model weights and config files
tokenizer (HF AutoTokenizer): Tokenzier to prepare inputs.
cache_dir (str): Path to cache dir if not specified, default HF cache_dir will be used.
onnx_dir_path (str, optional): The path where the model is stored. If None, the model is loaded from the default location.
hf_token (bool): If True, an authentication token will be used. Default is False.
seq_len (int, optional): The length of the sequence. Default is 128.
kv (bool): If True, key-value pairs will be used. Default is True.
form_factor (str): form_factor of the hardware, currently only accepts "cloud".
This method is an alias for ``QEfficient.export``.
Usage 1: This method can be used by passing ``model_name`` and, if required for loading from a local dir, ``local_model_dir`` or ``cache_dir``.
This will download the model from ``HuggingFace``, export it to an ``ONNX`` graph, and return the generated file paths (see Returns below).
Usage 2: You can pass ``model_name`` and ``model_kv`` as an object of ``QEfficient.QEFFAutoModelForCausalLM``; in this case, it will directly export ``model_kv.model`` to ``ONNX``.
This function will be deprecated soon and replaced by ``QEffAutoModelForCausalLM.export``.
``Mandatory`` Args:
:model_name (str): The name of the model to be used.
``Optional`` Args:
:model_kv (torch.nn.Module): Transformed ``KV torch model`` to be used. ``Defaults to None``.
:local_model_dir (str): Path of local model. ``Defaults to None``.
:tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Model tokenizer. ``Defaults to None``.
:cache_dir (str): Path of the ``cache`` directory. ``Defaults to None``.
:onnx_dir_path (str): Path to store ``ONNX`` file. ``Defaults to None``.
:hf_token (str): HuggingFace token to access gated models. ``Defaults to None.``
:seq_len (int): The length of the sequence. ``Defaults to 128.``
:kv (bool): If false, it will export to Bert style. ``Defaults to True.``
:form_factor (str): Form factor of the hardware, currently only ``cloud`` is accepted. ``Defaults to cloud``.
Returns:
None, if automation is False, else path to exported Onnx file
:Tuple[str, str]: Path to Base ``ONNX`` dir and path to generated ``ONNX`` model
.. code-block:: python
import QEfficient
base_path, onnx_model_path = QEfficient.export(model_name="gpt2")
"""
warnings.warn(
"\033[93mmodel_kv argument will be replaced by qeff_model of type QEFFBaseModel\033[0m",
DeprecationWarning,
stacklevel=2,
)

# Get model_kv first
model_kv = (
model_kv
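To complement the Usage 1 example embedded in the docstring above, a hedged sketch of Usage 2 (``from_pretrained`` on ``QEFFAutoModelForCausalLM`` is assumed here in the style of HF loaders; verify against the class docs added in this PR):

import QEfficient
from QEfficient import QEFFAutoModelForCausalLM

# Usage 2: pass an already-loaded QEFF model so model_kv.model is exported directly.
qeff_model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")  # assumption: HF-style loader
base_path, onnx_model_path = QEfficient.export(model_name="gpt2", model_kv=qeff_model)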