Adding QEFFAutoModel i.e. model loader for loading any type of model. (quic#31)

* all changes

Signed-off-by: Onkar Chougule <[email protected]>

* only loader changes

Signed-off-by: Onkar Chougule <[email protected]>

* removed unused imports

Signed-off-by: Onkar Chougule <[email protected]>

* allowed to initialize QEFFAutoLMModel

Signed-off-by: Onkar Chougule <[email protected]>

* fixed test bugs

Signed-off-by: Onkar Chougule <[email protected]>

* renamed utils.py to _utils.py

Signed-off-by: Onkar Chougule <[email protected]>

* added more type hinting and docstrings

Signed-off-by: Onkar Chougule <[email protected]>

* addressed review comments, added test file for new interface

Signed-off-by: Onkar Chougule <[email protected]>

* enabled CLI APIs

Signed-off-by: Onkar Chougule <[email protected]>

* *Updated README, notebooks *Removed circular import *Added comments on loader files *Separated cross-compile script *Separated utils funcs

Signed-off-by: Onkar Chougule <[email protected]>

* bug-fix infer

Signed-off-by: Onkar Chougule <[email protected]>

* using QEfficient.export, compile in cloud APIs

Signed-off-by: Onkar Chougule <[email protected]>

* cleaner infer, export APIs

Signed-off-by: Onkar Chougule <[email protected]>

* addressed review comments

Signed-off-by: Onkar Chougule <[email protected]>

* *updated notebooks, readme *moved class desc to base.py *Added Runtime Enum

Signed-off-by: Onkar Chougule <[email protected]>

* updated cloud_ai_100_exec_kv to be callable from QEfficient package

Signed-off-by: Onkar Chougule <[email protected]>

* fixed tests

Signed-off-by: Onkar Chougule <[email protected]>

* cleaned notebook

Signed-off-by: Onkar Chougule <[email protected]>

* *Added transform call within init *renamed cross_compile *updated notebooks *updated README

Signed-off-by: Onkar Chougule <[email protected]>

* addressed review comments

Signed-off-by: Onkar Chougule <[email protected]>

---------

Signed-off-by: Onkar Chougule <[email protected]>
ochougul authored and quic-amitraj committed Jun 7, 2024
1 parent e98180d commit cfb3776
Showing 24 changed files with 1,133 additions and 923 deletions.
25 changes: 8 additions & 17 deletions QEfficient/__init__.py
@@ -5,20 +5,11 @@
#
# -----------------------------------------------------------------------------

import torch.nn as nn

from QEfficient.transformers.modeling_utils import transform as transform_hf


def transform(model: nn.Module, type="Transformers", form_factor="cloud"):
"""
Low level apis in library
---------
:param model: nn.Module. instance of nn.Module.
:type: str. Transformers | Diffusers, default : Transformers.
:form_factor: str.
"""
if type == "Transformers":
return transform_hf(model, form_factor)
else:
raise NotImplementedError
from QEfficient.compile.compile_helper import compile # noqa: F401
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv # noqa: F401
from QEfficient.src import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader # noqa: F401
from QEfficient.transformers.transform import transform # noqa: F401

# Users can use QEfficient.export for exporting models to ONNX
export = qualcomm_efficient_converter
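
As a quick orientation on the re-exported surface, a minimal sketch of calling the export alias directly (the model name is a placeholder; keyword arguments follow the qualcomm_efficient_converter call shown in QEfficient/cloud/export.py later in this diff):

import QEfficient

# QEfficient.export is the same callable as qualcomm_efficient_converter
_, onnx_model_path = QEfficient.export(
    model_name="gpt2",      # placeholder; any supported HF causal-LM id
    kv=True,                # export the KV-cache (past key/value) variant
    form_factor="cloud",
    return_path=True,
)
# onnx_model_path can then be fed to QEfficient.compile and cloud_ai_100_exec_kv (see below).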
96 changes: 4 additions & 92 deletions QEfficient/cloud/compile.py
@@ -6,97 +6,7 @@
# -----------------------------------------------------------------------------

import argparse
import json
import os
from typing import List

from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100
from QEfficient.utils.logging_utils import logger


def create_and_dump_specializations(batch_size: int, prompt_len: int, ctx_len: int, path: str):
# Create
specializations = {
"specializations": [
{
"batch_size": str(batch_size),
"seq_len": str(prompt_len),
"ctx_len": str(ctx_len),
},
{"batch_size": str(batch_size), "seq_len": "1", "ctx_len": str(ctx_len)},
]
}
# Dump
with open(path, "w") as file:
json.dump(specializations, file, indent=4)


def main(
onnx_path: str,
qpc_path: str,
num_cores: int,
device_group: List[int],
aic_enable_depth_first: bool = False,
mos: int = -1,
batch_size: int = 1,
prompt_len: int = 32,
ctx_len: int = 128,
mxfp6: bool = True,
mxint8: bool = False,
) -> str:
"""
API to compile the ONNX model on Cloud AI 100 platform with given config.
---------
:param onnx_path: str. Generated ONNX model path.
:qpc_path: str. Path of store compiled qpc binaries file
:num_cores: int. Number of cores to compile model on. Default: 16, available option: [1 to 16].
:device_group: List[int]. Cloud AI 100 device ids (comma-separated) e.g. [0,1]. if devices > 1, it enable multiple card setup.
:aic_enable_depth_first: bool. If passed, this option will be enabled during compilation. Default=False.
:mos: int. Effort level to reduce the on-chip memory. Default=-1.
:batch_size: int. Batch size for model to compile.
:prompt_len: int. prompt len for the model to compile.
:ctx_len: int. Maximum context length for the model to compile.
:mxfp6: bool. Compress constant MatMul weights to MXFP6 E2M3, default is no compression.
:mxint8: bool. Compress Present/Past KV to MXINT8 using CustomIO config, default is False.
Return:
Path of the QPC files.
"""

os.makedirs(qpc_path, exist_ok=True)
specialization_json_path = os.path.join(qpc_path, "specializations.json")
create_and_dump_specializations(
batch_size=batch_size, prompt_len=prompt_len, ctx_len=ctx_len, path=specialization_json_path
)

# Select the customIO config based on the mx flag.
if mxint8:
custom_io_file_name = "custom_io_int8.yaml"
else:
custom_io_file_name = "custom_io_fp16.yaml"

custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name)

if not os.path.isfile(custom_io_file_path):
raise FileNotFoundError(
f"file {custom_io_file_path} needs to exist in the same directory as onnx model files. Please rerun infer/export Api"
)

_, qpc_path = compile_kv_model_on_cloud_ai_100(
onnx_path=onnx_path,
specializations_json=specialization_json_path,
num_cores=num_cores,
custom_io_path=custom_io_file_path,
base_path=qpc_path,
mxfp6=mxfp6,
aic_enable_depth_first=aic_enable_depth_first,
mos=mos,
device_group=device_group,
)

logger.info(f"Compiled QPC files can be found here: {qpc_path}")
return qpc_path

import QEfficient

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Compilation script.")
@@ -152,5 +62,7 @@ def main(
default=-1,
help=" Effort level to reduce the on-chip memory",
)

# FIXME(ochougul): Allow extra compilation arguments
args = parser.parse_args()
main(**vars(args))
QEfficient.compile(**vars(args))
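
Since the script now forwards the parsed arguments straight to QEfficient.compile, invoking the API directly is equivalent; a hedged sketch using the parameter names documented in the removed main() above (paths are placeholders, and the exact keyword names should be checked against QEfficient.compile's signature):

from QEfficient import compile as qeff_compile  # re-exported in QEfficient/__init__.py

qpc_path = qeff_compile(
    onnx_path="onnx/model.onnx",   # placeholder path produced by the export step
    qpc_path="qpc",                # output directory for the compiled binaries
    num_cores=16,
    device_group=[0],              # more than one device id enables the multi-card setup
    batch_size=1,
    prompt_len=32,
    ctx_len=128,
    mxfp6=True,                    # compress constant MatMul weights to MXFP6
    mxint8=False,                  # keep KV cache in FP16 custom IO
)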
29 changes: 10 additions & 19 deletions QEfficient/cloud/execute.py
@@ -6,26 +6,25 @@
# -----------------------------------------------------------------------------

import argparse
from typing import List
from huggingface_hub import login
from transformers import AutoTokenizer
from typing import List, Optional

from QEfficient.generation.text_generation_inference import (
check_batch_size_and_num_prompts,
cloud_ai_100_exec_kv,
get_compilation_batch_size,
)
from QEfficient.utils import hf_download
)
from QEfficient.utils import load_hf_tokenizer
from QEfficient.utils.constants import Constants


def main(
model_name: str,
qpc_path: str,
device_group: List[int],
prompt: str = None,
prompts_txt_file_path: str = None,
cache_dir: str = Constants.CACHE_DIR,
hf_token: str = None,
prompt: Optional[str] = None, # type: ignore
prompts_txt_file_path: Optional[str] = None,
cache_dir: Optional[str] = Constants.CACHE_DIR,
hf_token: Optional[str] = None,
):
"""
API to run the model on Cloud AI 100 platform.
@@ -35,22 +34,14 @@ def main(
:qpc_path: str. Path to the save generated binary file after compilation.
:cache_dir: str. Cache dir to store the downloaded huggingface files.
:hf_token: Huggingface token to access gated models.
:device_group: List[int]. Device Ids to be used for compilation. If len(device_group) > 1, multiple card setup is enabled.
:prompts_txt_file_path: str. Path to txt file for multiple input prompts
"""

if hf_token is not None:
login(hf_token)

# Download tokenizer along with model if it doesn't exist
model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"])
tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left")
tokenizer = load_hf_tokenizer(model_name, cache_dir, hf_token)

batch_size = get_compilation_batch_size(qpc_path)
prompt = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size)
prompt: List[str] = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size)

# Execute
cloud_ai_100_exec_kv(
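
Condensed, the new execution path in this file resolves the tokenizer and prompts through shared helpers before handing off to cloud_ai_100_exec_kv; a sketch of the same flow (the final call's keyword arguments are cut off in this hunk, so only the helper calls are shown, with illustrative placeholder values):

from QEfficient.generation.text_generation_inference import (
    check_batch_size_and_num_prompts,
    get_compilation_batch_size,
)
from QEfficient.utils import load_hf_tokenizer
from QEfficient.utils.constants import Constants

model_name, qpc_path = "gpt2", "qpcs/gpt2"                             # placeholders
tokenizer = load_hf_tokenizer(model_name, Constants.CACHE_DIR, None)   # same positional call as in main()
batch_size = get_compilation_batch_size(qpc_path)                      # batch size baked into the compiled QPC
prompts = check_batch_size_and_num_prompts("Hello world", None, batch_size)
# cloud_ai_100_exec_kv(...) then runs KV-cache generation on the Cloud AI 100 device(s).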
72 changes: 35 additions & 37 deletions QEfficient/cloud/export.py
@@ -7,24 +7,53 @@

import argparse
import os
from typing import Optional, Union

from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

import QEfficient
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.utils import hf_download, onnx_exists
from QEfficient.utils import onnx_exists
from QEfficient.utils.constants import Constants
from QEfficient.utils.logging_utils import logger

# Specifically for Docker images.
ROOT_DIR = os.path.dirname(os.path.abspath(""))


def get_onnx_model_path(model_name: str, cache_dir: str, tokenizer: Optional[Union[PreTrainedTokenizerFast, PreTrainedTokenizer]]=None, hf_token: Optional[str] = None):
"""
exports the model to onnx if pre-exported file is not found and returns onnx_model_path
"""
onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name)
if onnx_path_exists:
logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation")
else:
###################
# hf model -> export
####################
# Export to the Onnx
logger.info(f"Exporting Pytorch {model_name} model to ONNX...")
_, generated_onnx_model_path = qualcomm_efficient_converter(
model_name=model_name,
tokenizer=tokenizer,
onnx_dir_path=onnx_dir_path,
kv=True,
form_factor="cloud",
return_path=True,
hf_token=hf_token,
cache_dir=cache_dir
) # type: ignore
logger.info(f"Generated Onnx_path {generated_onnx_model_path} \nOnnx_model_path {onnx_model_path} \nand Onnx_dir_path is {onnx_dir_path}")
assert (
generated_onnx_model_path == onnx_model_path
), f"ONNX files were generated at an unusual location, expected {onnx_model_path}, got {generated_onnx_model_path}"
return onnx_model_path


def main(
model_name: str,
cache_dir: str,
hf_token: str = None,
hf_token: Optional[str] = None,
) -> None:
"""
ApI for exporting to ONNX Model.
@@ -33,38 +62,7 @@ def main(
:cache_dir: str. Cache dir to store the downloaded huggingface files.
:hf_token: str. HuggingFace login token to access private repos.
"""
onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name)
if onnx_path_exists:
logger.warning(f"Generated Onnx files found {onnx_model_path}! Please use Infer/Compile Apis()")
return

if hf_token is not None:
login(hf_token)
model_hf_path = hf_download(
repo_id=model_name,
cache_dir=cache_dir,
ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"],
)
tokenizer = AutoTokenizer.from_pretrained(
model_hf_path, use_cache=True, padding_side="left", trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(model_hf_path, use_cache=True)

# Easy and minimal api to update the model to QEff.
QEfficient.transform(model, type="Transformers", form_factor="cloud")
print(f"Model after Optimized transformations {model}")

# Export to the Onnx
print(f"Exporting to Pytorch {model_name} to Onnx")
base_path, onnx_path = qualcomm_efficient_converter(
model_kv=model,
model_name=model_name,
tokenizer=tokenizer,
kv=True,
form_factor="cloud",
return_path=True,
)
print(f"Base Path is {base_path} and Onnx Model Path is : {onnx_path}")
get_onnx_model_path(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token)


if __name__ == "__main__":
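
The new get_onnx_model_path helper gives other entry points (infer, compile) a single call that reuses a pre-exported ONNX file or exports one on demand; a minimal sketch of calling it directly (the model name is a placeholder):

from QEfficient.cloud.export import get_onnx_model_path
from QEfficient.utils.constants import Constants

onnx_model_path = get_onnx_model_path(
    model_name="gpt2",              # placeholder HF model id
    cache_dir=Constants.CACHE_DIR,  # same default cache used by the CLI
)
# Returns the path to the KV ONNX graph, exporting it first if no pre-exported file exists.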