Adding QEFFAutoModel i.e. model loader for loading any type of model. #31

Merged — 20 commits merged on Jun 6, 2024
Changes from all commits
19 changes: 7 additions & 12 deletions QEfficient/__init__.py
@@ -5,16 +5,11 @@
#
# -----------------------------------------------------------------------------

import torch.nn as nn
from QEfficient.transformers.modeling_utils import transform as transform_hf
from QEfficient.compile.compile_helper import compile # noqa: F401
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv # noqa: F401
from QEfficient.src import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader # noqa: F401
from QEfficient.transformers.transform import transform # noqa: F401


def transform(model: nn.Module, type="Transformers", form_factor="cloud"):
"""Low level apis in library
model : instance of nn.Module
type : Transformers | Diffusers, default : Transformers
"""
if type == "Transformers":
return transform_hf(model, form_factor)
else:
raise NotImplementedError
# Users can use QEfficient.export for exporting models to ONNX
export = qualcomm_efficient_converter
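
With this change the package surface exposes the loader classes (QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader) alongside compile, transform, cloud_ai_100_exec_kv, and the export alias. A minimal usage sketch, assuming the loader follows the usual from_pretrained convention; the checkpoint name and keyword arguments below are placeholders, not values taken from this PR:

import QEfficient
from QEfficient import QEFFAutoModelForCausalLM

# Load a causal-LM checkpoint through the new loader class exported above.
# "gpt2" is a placeholder model name; the exact from_pretrained signature is an assumption.
qeff_model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")

# QEfficient.export is the alias for qualcomm_efficient_converter defined above;
# kv and form_factor mirror the arguments used elsewhere in this PR.
QEfficient.export(model_name="gpt2", kv=True, form_factor="cloud")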
89 changes: 4 additions & 85 deletions QEfficient/cloud/compile.py
@@ -6,91 +6,8 @@
# -----------------------------------------------------------------------------

import argparse
import json
import os
from typing import List

from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100
from QEfficient.utils.logging_utils import logger


def create_and_dump_specializations(batch_size: int, prompt_len: int, ctx_len: int, path: str):
# Create
specializations = {
"specializations": [
{
"batch_size": str(batch_size),
"seq_len": str(prompt_len),
"ctx_len": str(ctx_len),
},
{"batch_size": str(batch_size), "seq_len": "1", "ctx_len": str(ctx_len)},
]
}
# Dump
with open(path, "w") as file:
json.dump(specializations, file, indent=4)
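
For reference, this is the file the helper above writes for the default CLI values (batch_size=1, prompt_len=32, ctx_len=128); the second entry keeps seq_len at 1, which corresponds to the decode step of KV-cached generation:

# Illustration derived from the dictionary built above (all values are serialized as strings):
#
# {
#     "specializations": [
#         {"batch_size": "1", "seq_len": "32", "ctx_len": "128"},
#         {"batch_size": "1", "seq_len": "1", "ctx_len": "128"}
#     ]
# }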


def main(
onnx_path: str,
qpc_path: str,
num_cores: int,
device_group: List[int],
aic_enable_depth_first: bool = False,
mos: int = -1,
batch_size: int = 1,
prompt_len: int = 32,
ctx_len: int = 128,
mxfp6: bool = True,
mxint8: bool = False,
) -> str:
# Dynamically create the specializations JSON
"""
API to compile the ONNX model on the Cloud AI 100 platform with the given config.
---------
:param onnx_path: str. Generated Onnx Model Path.
:base_path: str. Base path for the generated models.
:batch_size: int. Batch size to compile the model for.
:prompt_len: int. prompt len for the model to compile.
:ctx_len: int. Maximum context length to compile the model.
:mxfp6: bool. Enable compilation for MXFP6 precision
:num_cores: int. Number of cores to compile model on. default: 16 available option: [1 to 16]
"""

os.makedirs(qpc_path, exist_ok=True)
specialization_json_path = os.path.join(qpc_path, "specializations.json")
create_and_dump_specializations(
batch_size=batch_size, prompt_len=prompt_len, ctx_len=ctx_len, path=specialization_json_path
)

# Select the customIO config based on the mx flag.
if mxint8:
custom_io_file_name = "custom_io_int8.yaml"
else:
custom_io_file_name = "custom_io_fp16.yaml"

custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name)

if not os.path.isfile(custom_io_file_path):
raise FileNotFoundError(
f"file {custom_io_file_path} needs to exist in the same directory as onnx model files. Please rerun infer/export Api"
)

_, qpc_path = compile_kv_model_on_cloud_ai_100(
onnx_path=onnx_path,
specializations_json=specialization_json_path,
num_cores=num_cores,
custom_io_path=custom_io_file_path,
base_path=qpc_path,
mxfp6=mxfp6,
aic_enable_depth_first=aic_enable_depth_first,
mos=mos,
device_group=device_group,
)

logger.info(f"Compiled QPC files can be found here: {qpc_path}")
return qpc_path

import QEfficient

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Compilation script.")
@@ -146,5 +63,7 @@ def main(
default=-1,
help=" Effort level to reduce the on-chip memory",
)

# FIXME(ochougul): Allow extra compilation arguments
args = parser.parse_args()
main(**vars(args))
QEfficient.compile(**vars(args))
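
For reference, a sketch of the equivalent programmatic call that the CLI now delegates to, assuming QEfficient.compile accepts the same keyword arguments as the argparse options defined in this script; the paths below, and the assumption that it returns the QPC directory, are illustrative:

import QEfficient

# Keyword names mirror the CLI flags parsed above; onnx_path and qpc_path are placeholders.
qpc_dir = QEfficient.compile(
    onnx_path="qeff_models/MyModel/onnx/model.onnx",
    qpc_path="qeff_models/MyModel/qpc",
    num_cores=16,
    device_group=[0],
    batch_size=1,
    prompt_len=32,
    ctx_len=128,
    mxfp6=True,
    mxint8=False,
    aic_enable_depth_first=False,
    mos=-1,
)
print(f"Compiled QPC files can be found here: {qpc_dir}")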
24 changes: 8 additions & 16 deletions QEfficient/cloud/execute.py
@@ -6,28 +6,25 @@
# -----------------------------------------------------------------------------

import argparse
from typing import List

from huggingface_hub import login
from transformers import AutoTokenizer
from typing import List, Optional

from QEfficient.generation.text_generation_inference import (
check_batch_size_and_num_prompts,
cloud_ai_100_exec_kv,
get_compilation_batch_size,
)
from QEfficient.utils import hf_download
from QEfficient.utils import load_hf_tokenizer
from QEfficient.utils.constants import Constants


def main(
model_name: str,
qpc_path: str,
device_group: List[int],
prompt: str = None,
prompts_txt_file_path: str = None,
cache_dir: str = Constants.CACHE_DIR,
hf_token: str = None,
prompt: Optional[str] = None, # type: ignore
prompts_txt_file_path: Optional[str] = None,
cache_dir: Optional[str] = Constants.CACHE_DIR,
hf_token: Optional[str] = None,
):
"""
API to run the model on the Cloud AI 100 platform.
@@ -39,15 +36,10 @@ def main(
:prompts_txt_file_path: str. Path to txt file for multiple input prompts
"""

if hf_token is not None:
login(hf_token)

# Download tokenizer along with model if it doesn't exist
model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"])
tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left")
tokenizer = load_hf_tokenizer(model_name, cache_dir, hf_token)

batch_size = get_compilation_batch_size(qpc_path)
prompt = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size)
prompt: List[str] = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size)

# Execute
cloud_ai_100_exec_kv(
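
Pieced together, the execution path in this script now looks roughly like the sketch below: the tokenizer comes from the new load_hf_tokenizer helper and the prompts are validated against the batch size the QPC was compiled for. The keyword names passed to cloud_ai_100_exec_kv are not fully visible in this diff, so they are assumptions:

from QEfficient.generation.text_generation_inference import (
    check_batch_size_and_num_prompts,
    cloud_ai_100_exec_kv,
    get_compilation_batch_size,
)
from QEfficient.utils import load_hf_tokenizer
from QEfficient.utils.constants import Constants

qpc_path = "qeff_models/MyModel/qpc"  # placeholder compiled-QPC directory
tokenizer = load_hf_tokenizer("gpt2", Constants.CACHE_DIR, None)  # model_name, cache_dir, hf_token

# Infer the batch size the QPC was compiled for and fan the prompt(s) out accordingly.
batch_size = get_compilation_batch_size(qpc_path)
prompts = check_batch_size_and_num_prompts("My name is", None, batch_size)

# Keyword names below are assumed from main()'s own parameters; verify against the helper.
cloud_ai_100_exec_kv(
    tokenizer=tokenizer,
    qpc_path=qpc_path,
    device_group=[0],
    prompt=prompts,
)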
72 changes: 35 additions & 37 deletions QEfficient/cloud/export.py
@@ -7,24 +7,53 @@

import argparse
import os
from typing import Optional, Union

from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

import QEfficient
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.utils import hf_download, onnx_exists
from QEfficient.utils import onnx_exists
from QEfficient.utils.constants import Constants
from QEfficient.utils.logging_utils import logger

# Specifically for Docker images.
ROOT_DIR = os.path.dirname(os.path.abspath(""))


def get_onnx_model_path(model_name: str, cache_dir: str, tokenizer: Optional[Union[PreTrainedTokenizerFast, PreTrainedTokenizer]]=None, hf_token: Optional[str] = None):
"""
Exports the model to ONNX if a pre-exported file is not found and returns the onnx_model_path.
"""
onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name)
if onnx_path_exists:
logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation")
else:
###################
# hf model -> export
####################
# Export to the Onnx
logger.info(f"Exporting Pytorch {model_name} model to ONNX...")
_, generated_onnx_model_path = qualcomm_efficient_converter(
model_name=model_name,
tokenizer=tokenizer,
onnx_dir_path=onnx_dir_path,
kv=True,
form_factor="cloud",
return_path=True,
hf_token=hf_token,
cache_dir=cache_dir
) # type: ignore
logger.info(f"Generated Onnx_path {generated_onnx_model_path} \nOnnx_model_path {onnx_model_path} \nand Onnx_dir_path is {onnx_dir_path}")
assert (
generated_onnx_model_path == onnx_model_path
), f"ONNX files were generated at an unusual location, expected {onnx_model_path}, got {generated_onnx_model_path}"
return onnx_model_path


def main(
model_name: str,
cache_dir: str,
hf_token: str = None,
hf_token: Optional[str] = None,
) -> None:
"""
API for exporting the model to ONNX.
@@ -33,38 +62,7 @@ def main(
:cache_dir: str. Cache dir to store the downloaded huggingface files.
:hf_token: str. HuggingFace login token to access private repos.
"""
onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name)
if onnx_path_exists:
logger.warning(f"Generated Onnx files found {onnx_model_path}! Please use Infer/Compile Apis()")
return

if hf_token is not None:
login(hf_token)
model_hf_path = hf_download(
repo_id=model_name,
cache_dir=cache_dir,
ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"],
)
tokenizer = AutoTokenizer.from_pretrained(
model_hf_path, use_cache=True, padding_side="left", trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(model_hf_path, use_cache=True)

# Easy and minimal api to update the model to QEff.
QEfficient.transform(model, type="Transformers", form_factor="cloud")
print(f"Model after Optimized transformations {model}")

# Export to the Onnx
print(f"Exporting to Pytorch {model_name} to Onnx")
base_path, onnx_path = qualcomm_efficient_converter(
model_kv=model,
model_name=model_name,
tokenizer=tokenizer,
kv=True,
form_factor="cloud",
return_path=True,
)
print(f"Base Path is {base_path} and Onnx Model Path is : {onnx_path}")
get_onnx_model_path(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token)


if __name__ == "__main__":
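
A minimal sketch of calling the new helper directly, following the signature introduced in this file; the model name is a placeholder and the optional tokenizer argument is left at its default:

from QEfficient.cloud.export import get_onnx_model_path
from QEfficient.utils.constants import Constants

# Exports the model to ONNX (or reuses a pre-exported one) and returns the .onnx path.
onnx_model_path = get_onnx_model_path(
    model_name="gpt2",              # placeholder checkpoint name
    cache_dir=Constants.CACHE_DIR,
    hf_token=None,                  # only needed for private repositories
)
print(f"ONNX model available at: {onnx_model_path}")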