From dee2d442db50c02850a839957751b07a32c9ffc4 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 28 May 2024 01:21:18 +0530 Subject: [PATCH 01/20] all changes Signed-off-by: Onkar Chougule --- QEfficient/__init__.py | 2 + QEfficient/loader/__init__.py | 8 +++ QEfficient/loader/loader.py | 33 +++++++++ QEfficient/loader/loader_factory.py | 107 ++++++++++++++++++++++++++++ QEfficient/utils/__init__.py | 99 ++----------------------- QEfficient/utils/generate_inputs.py | 32 +++++++++ QEfficient/utils/run_utils.py | 43 ++++++----- 7 files changed, 212 insertions(+), 112 deletions(-) create mode 100644 QEfficient/loader/__init__.py create mode 100644 QEfficient/loader/loader.py create mode 100644 QEfficient/loader/loader_factory.py diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index c4ccb4ef7..0d623eeee 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -6,6 +6,8 @@ # ----------------------------------------------------------------------------- import torch.nn as nn + +from QEfficient.loader import QEFFAutoModel # noqa: F401 from QEfficient.transformers.modeling_utils import transform as transform_hf diff --git a/QEfficient/loader/__init__.py b/QEfficient/loader/__init__.py new file mode 100644 index 000000000..a17f497b5 --- /dev/null +++ b/QEfficient/loader/__init__.py @@ -0,0 +1,8 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +from QEfficient.loader.loader import QEFFAutoModel # noqa: F401 diff --git a/QEfficient/loader/loader.py b/QEfficient/loader/loader.py new file mode 100644 index 000000000..950fcb946 --- /dev/null +++ b/QEfficient/loader/loader.py @@ -0,0 +1,33 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import os +from typing import Any + +from QEfficient.loader.loader_factory import ( + MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP, + QEFFBaseAutoModelFactory, + get_hf_model_type, +) +from QEfficient.utils.utils import login_and_download_hf_lm + + +class QEFFAutoModel: + def __init__(self, *args: Any, **kwds: Any) -> None: + raise EnvironmentError( + f"{self.__class__.__name__} is designed to be instantiated " + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)`") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> QEFFBaseAutoModelFactory: + pretrained_model_name_or_path = pretrained_model_name_or_path if os.path.isdir(pretrained_model_name_or_path) \ + else login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs) + model_type = get_hf_model_type(hf_model_path=pretrained_model_name_or_path) + qeff_auto_model_class = MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP[model_type] + assert issubclass(qeff_auto_model_class, QEFFBaseAutoModelFactory), f"Expected class that inherits {QEFFBaseAutoModelFactory}, got {type(qeff_auto_model_class)}" + + return qeff_auto_model_class.from_pretrained(pretrained_model_name_or_path) diff --git a/QEfficient/loader/loader_factory.py b/QEfficient/loader/loader_factory.py new file mode 100644 index 000000000..c5421fd25 --- /dev/null +++ b/QEfficient/loader/loader_factory.py @@ -0,0 +1,107 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- + +import os +from typing import Any +from abc import ABC, abstractmethod +from enum import Enum +from typing import Union + +from qtpy import API +import torch.nn as nn +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING + +from QEfficient.utils.run_utils import ApiRunner, run_hf_lm_model_with_pt +import QEfficient + +class QEFFBaseAutoModelFactory(ABC): + + def __init__(self) -> None: + super().__init__() + # Users can call generate or execute + self.generate = self.execute + + @abstractmethod + def from_pretrained(self, pretrained_model_name_or_path: str, *args, **kwargs): + pass + + @abstractmethod + def execute(self, *args, **kwargs) -> Any: + pass + + @abstractmethod + def transform(self, *args, **kwargs) -> Any: + pass + + @abstractmethod + def export(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") + + +class QEFFAutoModelForCausalLM(QEFFBaseAutoModelFactory): + def __init__(self, model: nn.Module, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], pretrained_model_name_or_path: str) -> None: + assert model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values(), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. 
{MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore + self.model = model + self.tokenizer = tokenizer + self.model_files_path = pretrained_model_name_or_path + self._model_executor = None + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): + model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + return cls(model=model, tokenizer=tokenizer, pretrained_model_name_or_path=pretrained_model_name_or_path) + + def _run_kv_lm_model_with_pt(self, prompt, prompt_len, ctx_len): + api_runner = ApiRunner(self.tokenizer, prompt=prompt, prompt_len=prompt_len, ctx_len=ctx_len) + return api_runner.run_kv_model_on_pytorch(self.model, ) + + def execute(self, prompt: str, prompt_len: int = None, ctx_len: int = None, max_gen_length: int = 128): # type: ignore + if getattr(self.model, "qeff_transformed", False): + output_ids = run_hf_lm_model_with_pt(self.model, self.tokenizer, prompt, max_gen_length) + else: + output_ids = self._run_kv_lm_model_with_pt(prompt, prompt_len, ctx_len) + return output_ids + + def transform(self): + QEfficient.transform(self.model) + return self + + def export(self): + pass + + +class QEFF_MODEL_TYPE(Enum): + LLM = "LLM" + STABLE_DIFFUSION = "STABLE_DIFFUSION" + AWQ = "AWQ" + + +MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP= { + QEFF_MODEL_TYPE.LLM: QEFFAutoModelForCausalLM +} + + +def get_hf_model_type(hf_model_path: str): + assert os.path.isdir(hf_model_path), "Pleae pass local dir path where the model is downloaded use `QEfficient.utils.login_and_download_hf_lm` for downloading hf model" + config, kwargs = AutoConfig.from_pretrained( + hf_model_path, + return_unused_kwargs=True, + ) + if config.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING: + # FIXME: Add logic to handle if quantization config is stored in separate quant_config.json outside of config, also create a separate function for this and below lines + quant_config = getattr(config, "quantization_config", getattr(config, "quant_config", None)) + if quant_config is not None: + if quant_config.get("quant_method", None) == "awq": + return QEFF_MODEL_TYPE.AWQ + else: + raise NotImplementedError(f"current model type is not yet supported {type(config)}") + else: + return QEFF_MODEL_TYPE.LLM + else: + raise NotImplementedError(f"model type {type(config)} is not yet supported") diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 486bae664..bed89942b 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -5,96 +5,9 @@ # # ----------------------------------------------------------------------------- -import os -from typing import List, Optional, Tuple, Union - -import requests -from huggingface_hub import snapshot_download -from requests.exceptions import HTTPError - -from QEfficient.utils.constants import QEFF_MODELS_DIR - - -def hf_download( - repo_id: Optional[str] = None, - cache_dir: Optional[str] = None, - hf_token: Optional[str] = None, - allow_patterns: Optional[List[str]] = None, - ignore_patterns: Optional[List[str]] = None, -): - # Setup cache and local dir - local_dir = None - if cache_dir is not None: - cache_dir = f"{cache_dir}" - local_dir = f"{cache_dir}/{repo_id}" - - os.makedirs(f"{cache_dir}/{repo_id}", exist_ok=True) - max_retries = 5 - retry_count = 0 - while retry_count < max_retries: - try: - model_path = snapshot_download( - repo_id, - cache_dir=cache_dir, - local_dir=local_dir, - 
local_dir_use_symlinks=True, - revision="main", - resume_download=True, - token=hf_token, - allow_patterns=allow_patterns, - ignore_patterns=ignore_patterns, - ) - break - except requests.ReadTimeout as e: - print(f"Read timeout: {e}") - retry_count += 1 - - except HTTPError as e: - retry_count = max_retries - if e.response.status_code == 401: - print("You need to pass a valid `--hf_token=...` to download private checkpoints.") - else: - raise e - - return model_path - - -def qpc_exists(model_name: str, qpc_base_dir_name: str) -> Union[Tuple[bool, str], None]: - """ - Checks if qpc files already exists, removes the directory if files have been manipulated. - --------- - :param model_name: str. HF Model card name. - :param dir_path: str. Path of qpc directory. - :return: Union[Tuple[bool, str]]: qpc_exists and path to qpc directory - """ - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - os.makedirs(model_card_dir, exist_ok=True) - - qpc_dir_path = os.path.join(model_card_dir, qpc_base_dir_name, "qpcs") - - # Compute the boolean indicating if the QPC exists - qpc_exists_bool = os.path.isdir(qpc_dir_path) and os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin")) - - return qpc_exists_bool, qpc_dir_path - - -def onnx_exists(model_name: str) -> Union[Tuple[bool, str, str], None]: - """ - Checks if qpc files already exists, removes the directory if files have been manipulated. - --------- - :param model_name: str. HF Model card name. - :return: Union[Tuple[bool, str, str]]: onnx_exists and path to onnx file and directory - """ - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - os.makedirs(model_card_dir, exist_ok=True) - - onnx_dir_path = os.path.join(model_card_dir, "onnx") - onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") - - # Compute the boolean indicating if the ONNX model exists - onnx_exists_bool = os.path.isfile(onnx_model_path) and os.path.isfile( - os.path.join(os.path.dirname(onnx_model_path), "custom_io_fp16.yaml") - ) - - # Return the boolean, onnx_dir_path, and onnx_model_path - return onnx_exists_bool, onnx_dir_path, onnx_model_path +from QEfficient.utils.utils import ( # noqa: F401 + hf_download, + login_and_download_hf_lm, + onnx_exists, + qpc_exists, +) diff --git a/QEfficient/utils/generate_inputs.py b/QEfficient/utils/generate_inputs.py index 4cf15024a..deb7bcf32 100644 --- a/QEfficient/utils/generate_inputs.py +++ b/QEfficient/utils/generate_inputs.py @@ -5,12 +5,44 @@ # # ----------------------------------------------------------------------------- +from abc import ABC, abstractmethod import numpy as np import torch from QEfficient.utils.logging_utils import logger +class AwesomeInputHandler(ABC): + + def __init__(self) -> None: + super().__init__() + self.counter = 0 + + def reset(self): + self.counter = 0 + + def prepare_inputs(self, prompt, n_layer, padding_shape): + if self.counter!=0: + logger.warning("Resetting Input Handler as prepare_inputs is called even though it's in the middle of generating outputs") + self.reset() + + self._prepare_inputs(prompt, n_layer, padding_shape) + self.counter+=1 + + def update_inputs(self, outputs): + self._update_inputs(outputs) + self.counter+=1 + + @abstractmethod + def _prepare_inputs(self, prompt, n_layer, padding_shape): + pass + + @abstractmethod + def _update_inputs(self, outputs): + pass + + + class InputHandler: def __init__(self, tokenizer, input_str, prompt_len, ctx_len): """ diff --git a/QEfficient/utils/run_utils.py 
b/QEfficient/utils/run_utils.py index c521bf3d1..fbfc2b968 100644 --- a/QEfficient/utils/run_utils.py +++ b/QEfficient/utils/run_utils.py @@ -9,9 +9,30 @@ import onnxruntime import torch -from .generate_inputs import InputHandler from QEfficient.utils.logging_utils import logger +from .generate_inputs import InputHandler + + +def run_hf_lm_model_with_pt(model_hf, tokenizer, prompt, gen_len): + input_ids = tokenizer.encode(prompt, return_tensors="pt") + + input_ids_len = len(input_ids[0]) + + with torch.no_grad(): + for _ in range(gen_len): + outputs = model_hf(input_ids) + logits = outputs.logits[:, -1, :] + predicted_token_id = torch.argmax(logits, dim=-1) + input_ids = torch.cat([input_ids, predicted_token_id.unsqueeze(1)], dim=-1) + + generated_ids = input_ids[0][input_ids_len:].detach().numpy() + generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True) + print("Original HF Model Outputs (Torch CPU): \n") + print("Prompt:", repr(prompt)) + print("Completion:", repr(generated_text)) + return generated_ids + class ApiRunner: """ @@ -32,7 +53,7 @@ def __init__(self, tokenizer, prompt, prompt_len, ctx_len): :param ctx_len: int """ if tokenizer.padding_side != "left": - logger.warning(f"Please use padding_side='left' while initializing the tokenizer") + logger.warning("Please use padding_side='left' while initializing the tokenizer") tokenizer.padding_side = "left" if tokenizer.pad_token_id is None: tokenizer.pad_token_id = tokenizer.eos_token_id @@ -50,24 +71,8 @@ def run_hf_model_on_pytorch(self, model_hf): :param model_hf: pytorch model :return generated_ids: numpy.ndarray - output tokens """ + return run_hf_lm_model_with_pt(model_hf, self.tokenizer, self.prompt[0], self.gen_len) - input_ids = self.tokenizer.encode(self.prompt[0], return_tensors="pt") - - input_ids_len = len(input_ids[0]) - - with torch.no_grad(): - for _ in range(self.gen_len): - outputs = model_hf(input_ids) - logits = outputs.logits[:, -1, :] - predicted_token_id = torch.argmax(logits, dim=-1) - input_ids = torch.cat([input_ids, predicted_token_id.unsqueeze(1)], dim=-1) - - generated_ids = input_ids[0][input_ids_len:].detach().numpy() - generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True) - print("Original HF Model Outputs (Torch CPU): \n") - print("Prompt:", repr(self.prompt)) - print("Completion:", repr(generated_text)) - return generated_ids def run_kv_model_on_pytorch(self, model, n_layer, padding_shape): """ From ca62618728ebebacfdb309e80bed8309a56ff5a5 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 28 May 2024 23:30:47 +0530 Subject: [PATCH 02/20] only loader changes Signed-off-by: Onkar Chougule --- QEfficient/__init__.py | 13 +- QEfficient/cloud/execute.py | 24 +- QEfficient/cloud/export.py | 30 +- QEfficient/cloud/infer.py | 132 +++---- .../exporter/export_hf_to_cloud_ai_100.py | 333 +++++++++--------- QEfficient/exporter/export_utils.py | 4 +- .../generation/text_generation_inference.py | 2 +- QEfficient/loader/loader_factory.py | 48 +-- QEfficient/utils/__init__.py | 1 + tests/utils.py | 8 +- 10 files changed, 288 insertions(+), 307 deletions(-) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 0d623eeee..d9d032f27 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -5,18 +5,21 @@ # # ----------------------------------------------------------------------------- -import torch.nn as nn +from typing import Any, Union from QEfficient.loader import QEFFAutoModel # noqa: F401 +from QEfficient.loader.loader_factory import 
AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFAutoModelForCausalLM
 from QEfficient.transformers.modeling_utils import transform as transform_hf
 
 
-def transform(model: nn.Module, type="Transformers", form_factor="cloud"):
+def transform(model: Union[QEFFAutoModelForCausalLM, Any], form_factor="cloud"):
     """Low level apis in library
     model : instance of nn.Module
     type : Transformers | Diffusers, default : Transformers
     """
-    if type == "Transformers":
-        return transform_hf(model, form_factor)
+    assert form_factor == "cloud", "Only form_factor='cloud' is supported as of now!"
+    if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.LLM:
+        transform_hf(model.model, form_factor)
+        return model
     else:
-        raise NotImplementedError
+        raise NotImplementedError(f"Received unsupported class of type {type(model)}")
diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py
index c1ec39abe..eee912494 100644
--- a/QEfficient/cloud/execute.py
+++ b/QEfficient/cloud/execute.py
@@ -6,17 +6,14 @@
 # -----------------------------------------------------------------------------
 
 import argparse
-from typing import List
-
-from huggingface_hub import login
-from transformers import AutoTokenizer
+from typing import List, Optional
 
 from QEfficient.generation.text_generation_inference import (
     check_batch_size_and_num_prompts,
     cloud_ai_100_exec_kv,
     get_compilation_batch_size,
 )
-from QEfficient.utils import hf_download
+from QEfficient.utils import load_hf_tokenizer
 from QEfficient.utils.constants import Constants
 
 
@@ -24,10 +21,10 @@ def main(
     model_name: str,
     qpc_path: str,
     device_group: List[int],
-    prompt: str = None,
-    prompts_txt_file_path: str = None,
-    cache_dir: str = Constants.CACHE_DIR,
-    hf_token: str = None,
+    prompt: Optional[str] = None,  # type: ignore
+    prompts_txt_file_path: Optional[str] = None,
+    cache_dir: Optional[str] = Constants.CACHE_DIR,
+    hf_token: Optional[str] = None,
 ):
     """
     APi() to run the Model on Cloud AI 100 Platform.
@@ -39,15 +36,10 @@ def main(
     :prompts_txt_file_path: str.
Path to txt file for multiple input prompts """ - if hf_token is not None: - login(hf_token) - - # Download tokenizer along with model if it doesn't exist - model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"]) - tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left") + tokenizer = load_hf_tokenizer(model_name, cache_dir, hf_token) batch_size = get_compilation_batch_size(qpc_path) - prompt = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) + prompt: List[str] = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) # Execute cloud_ai_100_exec_kv( diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index f86e245c0..2b7201c8e 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -7,13 +7,12 @@ import argparse import os - -from huggingface_hub import login -from transformers import AutoModelForCausalLM, AutoTokenizer +from typing import Optional import QEfficient from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.utils import hf_download, onnx_exists +from QEfficient.loader import QEFFAutoModel +from QEfficient.utils import load_hf_tokenizer, onnx_exists from QEfficient.utils.constants import Constants from QEfficient.utils.logging_utils import logger @@ -24,7 +23,7 @@ def main( model_name: str, cache_dir: str, - hf_token: str = None, + hf_token: Optional[str] = None, ) -> None: """ Api() for exporting to Onnx Model. @@ -38,32 +37,23 @@ def main( logger.warning(f"Generated Onnx files found {onnx_model_path}! Please use Infer/Compile Apis()") return - if hf_token is not None: - login(hf_token) - model_hf_path = hf_download( - repo_id=model_name, - cache_dir=cache_dir, - ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], - ) - tokenizer = AutoTokenizer.from_pretrained( - model_hf_path, use_cache=True, padding_side="left", trust_remote_code=True - ) - model = AutoModelForCausalLM.from_pretrained(model_hf_path, use_cache=True) + tokenizer = load_hf_tokenizer(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token) + qeff_model = QEFFAutoModel.from_pretrained(pretrained_model_name_or_path=model_name, cache_dir=cache_dir, hf_token=hf_token) # Easy and minimal api to update the model to QEff. 
- QEfficient.transform(model, type="Transformers", form_factor="cloud") - print(f"Model after Optimized transformations {model}") + QEfficient.transform(qeff_model, form_factor="cloud") + print(f"Model after Optimized transformations {qeff_model}") # Export to the Onnx print(f"Exporting to Pytorch {model_name} to Onnx") base_path, onnx_path = qualcomm_efficient_converter( - model_kv=model, + model_kv=qeff_model, model_name=model_name, tokenizer=tokenizer, kv=True, form_factor="cloud", return_path=True, - ) + ) # type: ignore print(f"Base Path is {base_path} and Onnx Model Path is : {onnx_path}") diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index f00a56883..326096573 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -9,9 +9,6 @@ import os from typing import List -from huggingface_hub import login -from transformers import AutoModelForCausalLM, AutoTokenizer - import QEfficient from QEfficient.cloud.compile import main as compile from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter @@ -19,7 +16,8 @@ check_batch_size_and_num_prompts, cloud_ai_100_exec_kv, ) -from QEfficient.utils import hf_download, onnx_exists, qpc_exists +from QEfficient.loader import QEFFAutoModel +from QEfficient.utils import load_hf_tokenizer, onnx_exists, qpc_exists from QEfficient.utils.constants import Constants from QEfficient.utils.logging_utils import logger @@ -56,35 +54,18 @@ def main( + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else "_fp16_mxint8" if mxint8 else "_fp16") ) - prompt = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) + prompt: List[str] = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) # Get tokenizer - if hf_token is not None: - login(hf_token) - model_hf_path = hf_download( - repo_id=model_name, - cache_dir=cache_dir, - ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], - ) - tokenizer = AutoTokenizer.from_pretrained( - model_hf_path, use_cache=True, padding_side="left", trust_remote_code=True - ) + tokenizer = load_hf_tokenizer(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token) qpc_path_exists, qpc_dir_path = qpc_exists(model_name, qpc_base_dir_name) + onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name) + if qpc_path_exists: # execute logger.info("Pre-compiled qpc found! Trying to execute with given prompt") - cloud_ai_100_exec_kv( - batch_size, - tokenizer=tokenizer, - qpc_path=qpc_dir_path, - device_id=device_group, - prompt=prompt, - ) - return - - onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name) - if onnx_path_exists: + elif onnx_path_exists: # Compile -> execute # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation generated_qpc_path = compile( @@ -103,61 +84,54 @@ def main( assert ( generated_qpc_path == qpc_dir_path ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" - cloud_ai_100_exec_kv( - batch_size, + else: + ############################################# + # hf model -> export -> compile -> execute + ############################################# + # Load hf model + qeff_model = QEFFAutoModel.from_pretrained(pretrained_model_name_or_path=model_name, cache_dir=cache_dir, hf_token=hf_token) + + # Easy and minimal api to update the model to QEff. 
+ qeff_opt_model = QEfficient.transform(qeff_model, form_factor="cloud") + logger.info(f"Model after Optimized transformations {qeff_opt_model}") + + # Export to the Onnx + logger.info(f"Exporting to Pytorch {model_name} to ONNX...") + # Need to split below function into two functions one which always takes QEFFAutoModel and other with same interface as below + base_path, generated_onnx_path = qualcomm_efficient_converter( + model_kv=qeff_opt_model, # type: ignore tokenizer=tokenizer, - qpc_path=qpc_dir_path, - device_id=device_group, - prompt=prompt, + onnx_dir_path=onnx_dir_path, + kv=True, + form_factor="cloud", + return_path=True, + ) # type: ignore + print(f"Generated Onnx_path {generated_onnx_path} and Onnx_model_path {onnx_model_path} and Onnx_dir_path is {onnx_dir_path}") + assert ( + generated_onnx_path == onnx_model_path + ), f"ONNX files were generated at an unusual location, expected {onnx_model_path}, got {generated_onnx_path}" + logger.info(f"Base Path is {base_path} and Onnx Model Path is : {generated_onnx_path}") + + # Compile + # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation + generated_qpc_path = compile( + onnx_path=onnx_model_path, + qpc_path=os.path.dirname(qpc_dir_path), + num_cores=num_cores, + batch_size=batch_size, + prompt_len=prompt_len, + ctx_len=ctx_len, + mxfp6=mxfp6, + mxint8=mxint8, + aic_enable_depth_first=aic_enable_depth_first, + mos=mos, + device_group=device_group, ) - return - - ############################################# - # hf model -> export -> compile -> execute - ############################################# - model_hf = AutoModelForCausalLM.from_pretrained(model_hf_path, use_cache=True) - # Easy and minimal api to update the model to QEff. - model_transformed = QEfficient.transform(model_hf, type="Transformers", form_factor="cloud") - logger.info(f"Model after Optimized transformations {model_transformed}") - - # Export to the Onnx - logger.info(f"Exporting to Pytorch {model_name} to ONNX...") - base_path, generated_onnx_path = qualcomm_efficient_converter( - model_kv=model_transformed, - onnx_dir_path=onnx_dir_path, - model_name=model_name, - kv=True, - form_factor="cloud", - return_path=True, - tokenizer=tokenizer, - ) - print( - f"Generated Onnx_path {generated_onnx_path} and Onnx_model_path {onnx_model_path} and Onnx_dir_path is {onnx_dir_path}" - ) - assert ( - generated_onnx_path == onnx_model_path - ), f"ONNX files were generated at an unusual location, expected {onnx_model_path}, got {generated_onnx_path}" - logger.info(f"Base Path is {base_path} and Onnx Model Path is : {generated_onnx_path}") - - # Compile - # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation - generated_qpc_path = compile( - onnx_path=onnx_model_path, - qpc_path=os.path.dirname(qpc_dir_path), - num_cores=num_cores, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, - mxfp6=mxfp6, - mxint8=mxint8, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - ) - assert ( - qpc_dir_path == generated_qpc_path - ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" - logger.info(f"Compiled qpc files can be found at : {generated_qpc_path}") + assert ( + qpc_dir_path == generated_qpc_path + ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" + logger.info(f"Compiled qpc files can be found at : 
{generated_qpc_path}") + # Execute cloud_ai_100_exec_kv( diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index d9a1e9f8a..062ff27b0 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -7,30 +7,34 @@ import os import shutil -from typing import Optional, Tuple +from typing import Optional, Tuple, Union import torch -from huggingface_hub import login -from transformers import AutoTokenizer +from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast +import QEfficient from QEfficient.exporter.export_utils import export_onnx, fix_onnx_fp16, generate_input_files, run_model_on_ort -from QEfficient.transformers.modeling_utils import transform -from QEfficient.utils import hf_download +from QEfficient.loader.loader import QEFFAutoModel +from QEfficient.loader.loader_factory import ( + AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, + QEFF_MODEL_TYPE, + QEFFAutoModelForCausalLM, + QEFFBaseAutoModelFactory, +) from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants from QEfficient.utils.logging_utils import logger +from QEfficient.utils.utils import load_hf_tokenizer def convert_to_cloud_bertstyle( model_name: str, - model_class: type = None, - tokenizer=None, - onnx_dir_path=None, - hf_token: str = None, - seq_len: int = Constants.seq_length, - input_str: str = Constants.input_str, - return_path: bool = False, - save_fp32_onnx: bool = False, - save_fp16_onnx: bool = True, + qeff_model: QEFFAutoModelForCausalLM, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + onnx_dir_path: str, + seq_len: int, + return_path: bool, + save_fp32_onnx: bool, + save_fp16_onnx: bool, ): """ Function to convert the model to Bertstyle approach. @@ -40,23 +44,14 @@ def convert_to_cloud_bertstyle( 3. KV is everytime computed for all the tokens until EOS/max_length Args: - model_name (str): The name of the model to be used. - model_class (type): The class of the model. tokenizer (HF AutoTokenizer): Tokenzier to prepare inputs. model_path (str, optional): The path where the model is stored. If None, the model is loaded from the default location. - hf_token (str): If hf_token passed, it will be used for authentication for gated. Default is None. seq_len (int, optional): The length of the sequence. Default is 128. - input_str (str): The input string to be processed. return_path (bool): If True, return the base path for models and exported onnx model path save_fp32_onnx (bool); If True, fp32 unclipped version of ONNX will be saved. Default is False. save_fp16_onnx (bool); If false, generation of fp32 clipped version of ONNX will be skipped. Default is True. 
""" - # todo (amitraj) Optimize the onnx export - if onnx_dir_path is None: - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - onnx_dir_path = os.path.join(model_card_dir, "onnx_bertstyle") - if os.path.exists(onnx_dir_path): logger.warning(f"Overriding {onnx_dir_path}") shutil.rmtree(onnx_dir_path) @@ -64,37 +59,29 @@ def convert_to_cloud_bertstyle( if not (save_fp32_onnx or save_fp16_onnx): raise AttributeError("save_fp32_onnx and save_fp16_onnx can't be false") - seq_len = Constants.seq_length - input_str = Constants.input_str - - # Load tokenizer - if tokenizer is None: - tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", trust_remote_code=True) - else: - if tokenizer.padding_side != "left": - logger.warning("Please use padding_side='left' while initializing the tokenizer") - tokenizer.padding_side = "left" + if tokenizer.padding_side != "left": + logger.warning("Please use padding_side='left' while initializing the tokenizer") + tokenizer.padding_side = "left" if tokenizer.pad_token_id is None: tokenizer.pad_token_id = tokenizer.eos_token_id - try: - if hf_token: - login(hf_token) - model_hf_path = hf_download( - repo_id=model_name, - cache_dir=Constants.CACHE_DIR, - ignore_pattrens=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"], - ) - model = model_class.from_pretrained(model_hf_path, cache_dir=Constants.CACHE_DIR, use_cache=True) - except Exception as e: - print(f"Failed to download the {model_name} model from Huggingface:%s", e) - model.eval() - # Decide path for saving exported ONNX files. + fp32_model_name, fp16_model_name = export_bertstyle_model_to_onnx(model_name, qeff_model.model, tokenizer, onnx_dir_path, seq_len, save_fp32_onnx, save_fp16_onnx) # type: ignore + + # return the model path for automation. + if return_path: + if save_fp16_onnx: + return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp16_model_name}.onnx") + else: + return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp32_model_name}.onnx") + + +def export_bertstyle_model_to_onnx(model_name, model, tokenizer, onnx_dir_path, seq_len, save_fp32_onnx, save_fp16_onnx): model_base_name = model_name.replace("/", "_") + "_bertstyle" os.makedirs(onnx_dir_path, exist_ok=True) + input_str = Constants.input_str # Preprocess inputs if seq_len > 0: if tokenizer.pad_token_id is None: @@ -173,29 +160,19 @@ def convert_to_cloud_bertstyle( inputs=inputs, input_list_file=input_list_file, ) - - # return the model path for automation. - if return_path: - if save_fp16_onnx: - return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp16_model_name}.onnx") - else: - return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp32_model_name}.onnx") - else: - return + + return fp32_model_name,fp16_model_name def convert_to_cloud_kvstyle( model_name: str, - model_class: type = None, - model_kv: torch.nn.Module = None, - tokenizer=None, - onnx_dir_path=None, - hf_token: str = None, - seq_len: int = Constants.seq_length, - input_str: str = Constants.input_str, - return_path: bool = False, - save_fp32_onnx: bool = False, - save_fp16_onnx: bool = True, + qeff_model: QEFFAutoModelForCausalLM, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + onnx_dir_path: str, + seq_len: int, + return_path: bool, + save_fp32_onnx: bool, + save_fp16_onnx: bool, ): """ Function Modeling changes for kv retention and export to Onnx. @@ -219,58 +196,46 @@ def convert_to_cloud_kvstyle( save_fp16_onnx (bool); If false, generation of fp32 clipped version of ONNX will be skipped. Default is True. 
""" - if onnx_dir_path is None: - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - onnx_dir_path = os.path.join(model_card_dir, "onnx") - if os.path.exists(onnx_dir_path): logger.warning(f"Overriding {onnx_dir_path}") shutil.rmtree(onnx_dir_path) if not (save_fp32_onnx or save_fp16_onnx): raise AttributeError("save_fp32_onnx and save_fp16_onnx can't be false") + - if model_class is None and model_kv is None: - raise AttributeError("model_class and model_kv both can't be None") + if tokenizer.padding_side != "left": + logger.warning("Please use padding_side='left' while initializing the tokenizer") + tokenizer.padding_side = "left" - if model_kv is not None: - if not getattr(model_kv, "qeff_transformed", False): - raise AttributeError( - "Model is not transformed, Please first use QEfficient.transform to tranform the model." - ) - model = model_kv - else: - try: - if hf_token: - login(hf_token) - model_hf_path = hf_download( - repo_id=model_name, - cache_dir=Constants.CACHE_DIR, - ignore_pattrens=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"], - ) - model = model_class.from_pretrained(model_hf_path, cache_dir=Constants.CACHE_DIR, use_cache=True) - except Exception as e: - print(f"Failed to download the {model_name} model from Huggingface:%s", e) - transform(model, form_factor="cloud") + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id + + assert qeff_model.is_transformed, f"please pass the {qeff_model.__class__.__name__} after transform API" # Decide path for saving exported ONNX files. - model_base_name = model_name.replace("/", "_") + "_kv" - os.makedirs(onnx_dir_path, exist_ok=True) + fp32_model_name, fp16_model_name = export_kvstyle_transformed_model_to_onnx(model_name, qeff_model.model, tokenizer, onnx_dir_path, seq_len, save_fp32_onnx, save_fp16_onnx) # type: ignore - # Load tokenizer - if tokenizer is None: - # todo(ochougul): use cache dir from snapshot download - tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left") - else: - if tokenizer.padding_side != "left": - logger.warning("Please use padding_side='left' while initializing the tokenizer") - tokenizer.padding_side = "left" + # return the model path for automation. 
+    if return_path:
+        if save_fp16_onnx:
+            return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp16_model_name}.onnx")
+        else:
+            return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp32_model_name}.onnx")
 
+
+def export_kvstyle_transformed_model_to_onnx(model_name: str, transformed_model: torch.nn.Module, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+                                             onnx_dir_path: str, seq_len: int, save_fp32_onnx: Optional[bool] = False, save_fp16_onnx: Optional[bool] = True):
+
+    assert isinstance(transformed_model, QEFFBaseAutoModelFactory), f"Expected model_kv to be of type {QEFFBaseAutoModelFactory} but got {transformed_model.__class__.__name__}"
+    if tokenizer.padding_side != "left":
+        logger.warning("Please use padding_side='left' while initializing the tokenizer")
+        tokenizer.padding_side = "left"
+
+    tokenizer.pad_token_id = tokenizer.eos_token_id if tokenizer.pad_token_id is None else tokenizer.pad_token_id
 
     # Disabling requires_grad on all parameters
-    for j, p in enumerate(model.parameters()):
+    for j, p in enumerate(transformed_model.parameters()):
         p.requires_grad_(False)
 
     # Preprocess inputs
@@ -297,10 +262,10 @@ def convert_to_cloud_kvstyle(
     inputs = tokenizer(input_str, return_tensors="pt")
     try:
-        pt_outputs = model(**inputs)
+        pt_outputs = transformed_model(**inputs)
         output_names = list(pt_outputs.keys())
     except Exception as e:
-        print(f"Model {model_name} Execution failed in pytorch:%s", e)
+        print(f"Model {transformed_model.__class__.__name__} Execution failed in pytorch:%s", e)
 
     # Raise error if expected outputs are not present
     assert "logits" in output_names, "logits not found in output"
@@ -319,10 +284,10 @@ def convert_to_cloud_kvstyle(
     # Run PyTorch inference with past
     try:
-        pt_outputs = model(**inputs)
+        pt_outputs = transformed_model(**inputs)
         output_names = list(pt_outputs.keys())
     except Exception as e:
-        print(f"Model {model_name} Execution failed in pytorch:%s", e)
+        print(f"Model {transformed_model.__class__.__name__} Execution failed in pytorch:%s", e)
 
     # Add pkv into output_names
     pkv = tuple([(key.detach(), value.detach()) for key, value in pt_outputs.past_key_values])
@@ -337,9 +302,12 @@ def convert_to_cloud_kvstyle(
         pt_outputs[f"past_key.{i}_RetainedState"] = key
         pt_outputs[f"past_value.{i}_RetainedState"] = value
 
+
+    model_base_name = model_name.replace("/", "_") + "_kv"
+    os.makedirs(onnx_dir_path, exist_ok=True)
     # Export and simplify ONNX model
     fp32_model_name = export_onnx(
-        pt_model=model,
+        pt_model=transformed_model,
         inputs=inputs,
         output_names=output_names,
         gen_models_path=onnx_dir_path,
@@ -398,39 +366,95 @@ def convert_to_cloud_kvstyle(
         inputs=inputs,
         input_list_file=input_list_file,
     )
+
+    return fp32_model_name, fp16_model_name
+
+
+def export_for_edge() -> None:
+    # [TODO]: Apply the class transformation to make changes for the KV models in edge use cases
+    # model = QEfficient.transform(model_hf, type="Transformers", form_factor="edge")
+    # model.eval()
+    raise NotImplementedError("Oops...reached too far!!")
+
+
+def export_for_cloud(model_name: str, qeff_model: QEFFBaseAutoModelFactory,
+                     tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+                     onnx_dir_path: str, seq_length: int = Constants.seq_length,
+                     return_path: bool = True,
+                     save_fp32_onnx: bool = False,
+                     save_fp16_onnx: bool = True):
+    if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.LLM:  # type: ignore
+        return
export_lm_model_for_cloud(model_name=model_name, + qeff_model=qeff_model, # type: ignore + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + seq_length=seq_length, + return_path=return_path, + save_fp16_onnx=save_fp16_onnx, + save_fp32_onnx=save_fp32_onnx) + else: + raise NotImplementedError(f"Only model type {QEFFAutoModelForCausalLM.__class__.__name__} is supported for export, got {type(qeff_model)}") + + +def export_lm_model_for_cloud(model_name:str, qeff_model: QEFFAutoModelForCausalLM, + tokenizer:Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + onnx_dir_path: str, seq_length: int, return_path:bool, + save_fp32_onnx:bool, save_fp16_onnx: bool): + if os.path.exists(onnx_dir_path): + logger.warning(f"Overriding {onnx_dir_path}") + shutil.rmtree(onnx_dir_path) + if not (save_fp32_onnx or save_fp16_onnx): + raise AttributeError("save_fp32_onnx and save_fp16_onnx can't be false") + + if tokenizer.padding_side != "left": + logger.warning("Please use padding_side='left' while initializing the tokenizer") + tokenizer.padding_side = "left" + + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id + + + if qeff_model.is_transformed: + fp32_model_name, fp16_model_name = export_bertstyle_model_to_onnx( + model_name=model_name, + model=qeff_model.model, + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + seq_len=seq_length, + save_fp32_onnx=save_fp32_onnx, + save_fp16_onnx=save_fp16_onnx) # type: ignore + else: + fp32_model_name, fp16_model_name = export_kvstyle_transformed_model_to_onnx( + model_name=model_name, + transformed_model=qeff_model.model, + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + seq_len=seq_length, + save_fp32_onnx=save_fp32_onnx, + save_fp16_onnx=save_fp16_onnx) # type: ignore + # return the model path for automation. if return_path: if save_fp16_onnx: return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp16_model_name}.onnx") else: return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp32_model_name}.onnx") - else: - return - - -def convert_to_edge(self) -> None: - # [TODO]: Apply the class transformation to make changes for the KV models in edge use cases - # model = QEfficient.transform(model_hf, type="Transformers", form_factor="edge") - # model.eval() - raise NotImplementedError("Oops...reached too far!!") def qualcomm_efficient_converter( model_name: str, - model_class: type = None, - model_kv: torch.nn.Module = None, - tokenizer=None, - onnx_dir_path=None, - hf_token: str = "", + model_kv: Optional[QEFFBaseAutoModelFactory] = None, # type: ignore + tokenizer: Optional[Union[PreTrainedTokenizer, PreTrainedTokenizerFast]]=None, + onnx_dir_path: Optional[str]=None, + hf_token: Optional[str] = None, seq_length: int = Constants.seq_length, - input_str: str = Constants.input_str, kv: bool = True, - return_path: bool = False, - form_factor="cloud", + return_path: bool = True, + form_factor: str="cloud", save_fp32_onnx: bool = False, save_fp16_onnx: bool = True, -) -> Optional[Tuple[str, str]]: +) -> Union[Tuple[str, str], None]: """ Function to convert the input string using the specified model and returns the result. @@ -442,7 +466,6 @@ def qualcomm_efficient_converter( onnx_dir_path (str, optional): The path where the model is stored. If None, the model is loaded from the default location. token (bool): If True, an authentication token will be used. Default is False. seq_len (int, optional): The length of the sequence. Default is 128. - input_str (str): The input string to be processed. 
kv (bool): If True, key-value pairs will be used. Default is True. return_path (bool): If True, return the base path for models and exported onnx model path save_fp32_onnx (bool); If True, fp32 unclipped version of ONNX will be saved. Default is False. @@ -452,36 +475,32 @@ def qualcomm_efficient_converter( None, if automation is False, else path to exported Onnx file """ - if model_kv is not None and not kv: - raise AttributeError("For Transformed model kv must be True") + # Get model_kv first + model_kv = model_kv if model_kv else QEFFAutoModel.from_pretrained(pretrained_model_name_or_path=model_name, hf_token=hf_token) + # Transform if required + if model_kv.is_transformed and not kv: + raise AttributeError("Transformed model is passed while requsting to convert non-transformed model") + + model_kv: QEFFBaseAutoModelFactory = QEfficient.transform(model_kv) if kv else model_kv + + + if onnx_dir_path is None: + model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) + onnx_dir_path = os.path.join(model_card_dir, "onnx") + + # Load tokenizer if not passed + tokenizer = load_hf_tokenizer(model_name=model_name, hf_token=hf_token) if tokenizer is None else tokenizer + if form_factor == "cloud": - if kv: - return convert_to_cloud_kvstyle( - model_name=model_name, - model_class=model_class, - model_kv=model_kv, - onnx_dir_path=onnx_dir_path, - tokenizer=tokenizer, - hf_token=hf_token, - seq_len=seq_length, - input_str=input_str, - return_path=return_path, - save_fp32_onnx=save_fp32_onnx, - save_fp16_onnx=save_fp16_onnx, - ) - else: - return convert_to_cloud_bertstyle( - model_name=model_name, - model_class=model_class, - tokenizer=tokenizer, - onnx_dir_path=onnx_dir_path, - hf_token=hf_token, - seq_len=seq_length, - input_str=input_str, - return_path=return_path, - save_fp32_onnx=save_fp32_onnx, - save_fp16_onnx=save_fp16_onnx, - ) + return export_for_cloud( + model_name=model_name, + qeff_model=model_kv, + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + seq_length=seq_length, + return_path=return_path, + save_fp16_onnx=save_fp16_onnx, + save_fp32_onnx=save_fp32_onnx) else: - return convert_to_edge() + return export_for_edge() diff --git a/QEfficient/exporter/export_utils.py b/QEfficient/exporter/export_utils.py index 5654ac582..8ce7f6b26 100644 --- a/QEfficient/exporter/export_utils.py +++ b/QEfficient/exporter/export_utils.py @@ -83,8 +83,8 @@ def export_onnx( custom_opsets={"com.qti.aisw.onnx": 1}, ) except Exception as e: - error("Exporting to ONNX failed. {}".format(e)) - return + raise RuntimeError("Exporting to ONNX failed. 
{}".format(e)) + onnx.checker.check_model(f"{gen_models_path}_tmp/{model_base_name}.onnx") loaded_model = onnx.load(f"{gen_models_path}_tmp/{model_base_name}.onnx") diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index d5d626faa..141a545e1 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -107,7 +107,7 @@ def get_compilation_batch_size(qpc_path: str): return compilation_batch_size -def check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size): +def check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) -> List[str]: assert ( prompt is not None or prompts_txt_file_path is not None ), "Please pass atleast one argument either using --prompt or --prompts_txt_file_path" diff --git a/QEfficient/loader/loader_factory.py b/QEfficient/loader/loader_factory.py index c5421fd25..49b8382c6 100644 --- a/QEfficient/loader/loader_factory.py +++ b/QEfficient/loader/loader_factory.py @@ -6,19 +6,17 @@ # ---------------------------------------------------------------------------- import os -from typing import Any from abc import ABC, abstractmethod from enum import Enum -from typing import Union +from typing import Any, Dict, Union -from qtpy import API import torch.nn as nn -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast +from transformers import AutoConfig, AutoModelForCausalLM from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING -from QEfficient.utils.run_utils import ApiRunner, run_hf_lm_model_with_pt import QEfficient + class QEFFBaseAutoModelFactory(ABC): def __init__(self) -> None: @@ -29,6 +27,10 @@ def __init__(self) -> None: @abstractmethod def from_pretrained(self, pretrained_model_name_or_path: str, *args, **kwargs): pass + + @property + def is_transformed(self) -> bool: + raise NotImplementedError("Must implement for child classes") @abstractmethod def execute(self, *args, **kwargs) -> Any: @@ -40,40 +42,36 @@ def transform(self, *args, **kwargs) -> Any: @abstractmethod def export(self, *args, **kwargs) -> Any: - raise NotImplementedError("Reached too far!!") + pass class QEFFAutoModelForCausalLM(QEFFBaseAutoModelFactory): - def __init__(self, model: nn.Module, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], pretrained_model_name_or_path: str) -> None: + def __init__(self, model: nn.Module, pretrained_model_name_or_path: str) -> None: assert model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values(), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. 
{MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore - self.model = model - self.tokenizer = tokenizer + self.model: nn.Module = model self.model_files_path = pretrained_model_name_or_path - self._model_executor = None + + @property + def is_transformed(self) -> bool: + return getattr(self.model, "qeff_transformed", False) @classmethod def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) - return cls(model=model, tokenizer=tokenizer, pretrained_model_name_or_path=pretrained_model_name_or_path) + return cls(model=model, pretrained_model_name_or_path=pretrained_model_name_or_path) - def _run_kv_lm_model_with_pt(self, prompt, prompt_len, ctx_len): - api_runner = ApiRunner(self.tokenizer, prompt=prompt, prompt_len=prompt_len, ctx_len=ctx_len) - return api_runner.run_kv_model_on_pytorch(self.model, ) - - def execute(self, prompt: str, prompt_len: int = None, ctx_len: int = None, max_gen_length: int = 128): # type: ignore - if getattr(self.model, "qeff_transformed", False): - output_ids = run_hf_lm_model_with_pt(self.model, self.tokenizer, prompt, max_gen_length) - else: - output_ids = self._run_kv_lm_model_with_pt(prompt, prompt_len, ctx_len) - return output_ids + def execute(self, *args, **kwargs): # type: ignore + raise NotImplementedError("Reached too far!!") def transform(self): QEfficient.transform(self.model) return self def export(self): - pass + raise NotImplementedError("Reached too far!!") + + def __repr__(self) -> None: + print(self.model) class QEFF_MODEL_TYPE(Enum): @@ -82,10 +80,12 @@ class QEFF_MODEL_TYPE(Enum): AWQ = "AWQ" -MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP= { +MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP = { QEFF_MODEL_TYPE.LLM: QEFFAutoModelForCausalLM } +AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP = {v:k for k,v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items()} + def get_hf_model_type(hf_model_path: str): assert os.path.isdir(hf_model_path), "Pleae pass local dir path where the model is downloaded use `QEfficient.utils.login_and_download_hf_lm` for downloading hf model" diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index bed89942b..4e06598c9 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -7,6 +7,7 @@ from QEfficient.utils.utils import ( # noqa: F401 hf_download, + load_hf_tokenizer, login_and_download_hf_lm, onnx_exists, qpc_exists, diff --git a/tests/utils.py b/tests/utils.py index f8fd7566e..a26d84826 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -15,6 +15,9 @@ import QEfficient from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100 +from QEfficient.loader.loader_factory import QEFFAutoModelForCausalLM +import QEfficient.transformers +import QEfficient.transformers.modeling_utils from QEfficient.utils import hf_download from QEfficient.utils.constants import QEFF_MODELS_DIR, ROOT_DIR, Constants from QEfficient.utils.device_utils import get_available_device_id, is_multi_qranium_setup_available, is_qpc_size_gt_32gb @@ -98,7 +101,7 @@ def transform_pt_model_with_qeff(model_hf): :param model_hf: pytorch model :return model_kv """ - model_kv = QEfficient.transform(model_hf, type="Transformers", form_factor="cloud") + model_kv = QEfficient.transformers.modeling_utils.transform(model_hf, form_factor="cloud") model_kv.eval() 
return model_kv @@ -113,8 +116,7 @@ def export_onnx(model_kv, tokenizer, model_name, model_class): onnx_dir_path = os.path.join(QEFF_MODELS_DIR, model_name) base_path, onnx_model_path = qualcomm_efficient_converter( model_name=model_name, - model_class=model_class, - model_kv=model_kv, + model_kv=QEFFAutoModelForCausalLM(model=model_kv, pretrained_model_name_or_path=None), # type: ignore tokenizer=tokenizer, onnx_dir_path=onnx_dir_path, kv=True, From 55f6182f1575666813aeed6c692c7f416f111110 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 28 May 2024 23:37:01 +0530 Subject: [PATCH 03/20] removed unused imports Signed-off-by: Onkar Chougule --- QEfficient/loader/loader_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/loader/loader_factory.py b/QEfficient/loader/loader_factory.py index 49b8382c6..cd2104ba3 100644 --- a/QEfficient/loader/loader_factory.py +++ b/QEfficient/loader/loader_factory.py @@ -8,7 +8,7 @@ import os from abc import ABC, abstractmethod from enum import Enum -from typing import Any, Dict, Union +from typing import Any import torch.nn as nn from transformers import AutoConfig, AutoModelForCausalLM From 8ff782ae46c7675103941f7da21945774fffe8aa Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 28 May 2024 23:54:02 +0530 Subject: [PATCH 04/20] allowed to initialize QEFFAUtoLMModel Signed-off-by: Onkar Chougule --- QEfficient/loader/loader_factory.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/QEfficient/loader/loader_factory.py b/QEfficient/loader/loader_factory.py index cd2104ba3..72e28f912 100644 --- a/QEfficient/loader/loader_factory.py +++ b/QEfficient/loader/loader_factory.py @@ -15,6 +15,7 @@ from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING import QEfficient +from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict class QEFFBaseAutoModelFactory(ABC): @@ -47,7 +48,8 @@ def export(self, *args, **kwargs) -> Any: class QEFFAutoModelForCausalLM(QEFFBaseAutoModelFactory): def __init__(self, model: nn.Module, pretrained_model_name_or_path: str) -> None: - assert model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values(), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. {MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore + assert (model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values() or + model.__class__ in TransformersToQEffModulesDict.values()), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. 
{MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore self.model: nn.Module = model self.model_files_path = pretrained_model_name_or_path From cc7aa257f0fd3761d77705b80cf7b65446d0899e Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Wed, 29 May 2024 00:18:07 +0530 Subject: [PATCH 05/20] fixed tests bugs Signed-off-by: Onkar Chougule --- .../exporter/export_hf_to_cloud_ai_100.py | 15 ++++++++------- QEfficient/loader/loader_factory.py | 4 ++-- tests/utils.py | 19 +++++++++---------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index 062ff27b0..4805f12d7 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -227,7 +227,6 @@ def convert_to_cloud_kvstyle( def export_kvstyle_transformed_model_to_onnx(model_name: str, transformed_model: torch.nn.Module, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], onnx_dir_path: str, seq_len: int, save_fp32_onnx: Optional[bool] = False, save_fp16_onnx: Optional[bool] = True): - assert isinstance(transformed_model, QEFFBaseAutoModelFactory), f"Expected model_kv to be of type {QEFFBaseAutoModelFactory} but got {transformed_model.__class__.__name__}" if tokenizer.padding_side != "left": logger.warning("Please use padding_side='left' while initializing the tokenizer") tokenizer.padding_side = "left" @@ -416,23 +415,25 @@ def export_lm_model_for_cloud(model_name:str, qeff_model: QEFFAutoModelForCausal if qeff_model.is_transformed: - fp32_model_name, fp16_model_name = export_bertstyle_model_to_onnx( + fp32_model_name, fp16_model_name = export_kvstyle_transformed_model_to_onnx( model_name=model_name, - model=qeff_model.model, - tokenizer=tokenizer, + transformed_model=qeff_model.model, + tokenizer=tokenizer, onnx_dir_path=onnx_dir_path, seq_len=seq_length, save_fp32_onnx=save_fp32_onnx, save_fp16_onnx=save_fp16_onnx) # type: ignore + else: - fp32_model_name, fp16_model_name = export_kvstyle_transformed_model_to_onnx( + fp32_model_name, fp16_model_name = export_bertstyle_model_to_onnx( model_name=model_name, - transformed_model=qeff_model.model, - tokenizer=tokenizer, + model=qeff_model.model, + tokenizer=tokenizer, onnx_dir_path=onnx_dir_path, seq_len=seq_length, save_fp32_onnx=save_fp32_onnx, save_fp16_onnx=save_fp16_onnx) # type: ignore + # return the model path for automation. 
if return_path: diff --git a/QEfficient/loader/loader_factory.py b/QEfficient/loader/loader_factory.py index 72e28f912..2b13e59be 100644 --- a/QEfficient/loader/loader_factory.py +++ b/QEfficient/loader/loader_factory.py @@ -72,8 +72,8 @@ def transform(self): def export(self): raise NotImplementedError("Reached too far!!") - def __repr__(self) -> None: - print(self.model) + def __repr__(self) -> str: + return self.model.__repr__() class QEFF_MODEL_TYPE(Enum): diff --git a/tests/utils.py b/tests/utils.py index a26d84826..0760e3613 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -13,11 +13,11 @@ import transformers import QEfficient +import QEfficient.transformers +import QEfficient.transformers.modeling_utils from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100 from QEfficient.loader.loader_factory import QEFFAutoModelForCausalLM -import QEfficient.transformers -import QEfficient.transformers.modeling_utils from QEfficient.utils import hf_download from QEfficient.utils.constants import QEFF_MODELS_DIR, ROOT_DIR, Constants from QEfficient.utils.device_utils import get_available_device_id, is_multi_qranium_setup_available, is_qpc_size_gt_32gb @@ -161,14 +161,13 @@ def set_up(model_config, device_group=[0]): model_config["model_name"], model_config["model_class"], ) - try: - ort_tokens = api_runner.run_kv_model_on_ort( - onnx_model_path, - model_config["n_layer"], - model_config["padding_shape"], - ) - except Exception as e: - print(f"ONNX Model run on onnxrt failed due to : {e}") + + ort_tokens = api_runner.run_kv_model_on_ort( + onnx_model_path, + model_config["n_layer"], + model_config["padding_shape"], + ) + setup_info = {} setup_info["model_config"] = model_config From 59fcc74665d20fd83a25fccbc8f72dffe61a6014 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Wed, 29 May 2024 11:55:31 +0530 Subject: [PATCH 06/20] renamed utils.py to _utils.py Signed-off-by: Onkar Chougule --- .../exporter/export_hf_to_cloud_ai_100.py | 2 +- QEfficient/loader/loader.py | 2 +- QEfficient/utils/__init__.py | 2 +- QEfficient/utils/_utils.py | 124 ++++++++++++++++++ 4 files changed, 127 insertions(+), 3 deletions(-) create mode 100644 QEfficient/utils/_utils.py diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index 4805f12d7..c2568a735 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -23,7 +23,7 @@ ) from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants from QEfficient.utils.logging_utils import logger -from QEfficient.utils.utils import load_hf_tokenizer +from QEfficient.utils._utils import load_hf_tokenizer def convert_to_cloud_bertstyle( diff --git a/QEfficient/loader/loader.py b/QEfficient/loader/loader.py index 950fcb946..185434dae 100644 --- a/QEfficient/loader/loader.py +++ b/QEfficient/loader/loader.py @@ -13,7 +13,7 @@ QEFFBaseAutoModelFactory, get_hf_model_type, ) -from QEfficient.utils.utils import login_and_download_hf_lm +from QEfficient.utils._utils import login_and_download_hf_lm class QEFFAutoModel: diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 4e06598c9..7a3cd4959 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from QEfficient.utils.utils import ( # noqa: F401 +from 
QEfficient.utils._utils import ( # noqa: F401 hf_download, load_hf_tokenizer, login_and_download_hf_lm, diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py new file mode 100644 index 000000000..80f1f0c46 --- /dev/null +++ b/QEfficient/utils/_utils.py @@ -0,0 +1,124 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import os +from typing import List, Optional, Tuple, Union + +import requests +from huggingface_hub import login, snapshot_download +from requests.exceptions import HTTPError +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast + +from QEfficient.utils.constants import QEFF_MODELS_DIR + + +def login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs): + hf_token = kwargs.pop("hf_token", None) + cache_dir = kwargs.pop("cache_dir", None) + if hf_token is not None: + login(hf_token) + pretrained_model_name_or_path = hf_download( + repo_id=pretrained_model_name_or_path, + cache_dir=cache_dir, + ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], + ) + return pretrained_model_name_or_path + + +def hf_download( + repo_id: Optional[str] = None, + cache_dir: Optional[str] = None, + hf_token: Optional[str] = None, + allow_patterns: Optional[List[str]] = None, + ignore_patterns: Optional[List[str]] = None, +): + # Setup cache and local dir + local_dir = None + if cache_dir is not None: + cache_dir = f"{cache_dir}" + local_dir = f"{cache_dir}/{repo_id}" + + os.makedirs(f"{cache_dir}/{repo_id}", exist_ok=True) + max_retries = 5 + retry_count = 0 + while retry_count < max_retries: + try: + model_path = snapshot_download( + repo_id, + cache_dir=cache_dir, + local_dir=local_dir, + local_dir_use_symlinks=True, + revision="main", + resume_download=True, + token=hf_token, + allow_patterns=allow_patterns, + ignore_patterns=ignore_patterns, + ) + break + except requests.ReadTimeout as e: + print(f"Read timeout: {e}") + retry_count += 1 + + except HTTPError as e: + retry_count = max_retries + if e.response.status_code == 401: + print("You need to pass a valid `--hf_token=...` to download private checkpoints.") + else: + raise e + + return model_path + + +def qpc_exists(model_name: str, qpc_base_dir_name: str) -> Tuple[bool, str]: + """ + Checks if qpc files already exists, removes the directory if files have been manipulated. + --------- + :param model_name: str. HF Model card name. + :param dir_path: str. Path of qpc directory. + :return: Union[Tuple[bool, str]]: qpc_exists and path to qpc directory + """ + model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) + os.makedirs(model_card_dir, exist_ok=True) + + qpc_dir_path = os.path.join(model_card_dir, qpc_base_dir_name, "qpcs") + + # Compute the boolean indicating if the QPC exists + qpc_exists_bool = os.path.isdir(qpc_dir_path) and os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin")) + + return qpc_exists_bool, qpc_dir_path + + +def onnx_exists(model_name: str) -> Tuple[bool, str, str]: + """ + Checks if qpc files already exists, removes the directory if files have been manipulated. + --------- + :param model_name: str. HF Model card name. 
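A short usage sketch for the download helper introduced above; the repo id and cache directory are placeholders, and only keyword arguments defined in this file are used:

from QEfficient.utils import hf_download

# Retries up to 5 times on read timeouts; heavy artifacts can be skipped via ignore_patterns.
model_path = hf_download(
    repo_id="gpt2",                                   # example checkpoint
    cache_dir="./hf_cache",                           # placeholder directory
    ignore_patterns=["*.onnx", "*.h5", "*.msgpack"],
)
print(model_path)
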
+ :return: Union[Tuple[bool, str, str]]: onnx_exists and path to onnx file and directory + """ + model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) + os.makedirs(model_card_dir, exist_ok=True) + + onnx_dir_path = os.path.join(model_card_dir, "onnx") + onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") + + # Compute the boolean indicating if the ONNX model exists + onnx_exists_bool = os.path.isfile(onnx_model_path) and os.path.isfile( + os.path.join(os.path.dirname(onnx_model_path), "custom_io_fp16.yaml") + ) + + # Return the boolean, onnx_dir_path, and onnx_model_path + return onnx_exists_bool, onnx_dir_path, onnx_model_path + + +def load_hf_tokenizer(model_name: str, cache_dir: Optional[str] = None, hf_token: Optional[str] = None) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: + if hf_token is not None: + login(hf_token) + + # Download tokenizer along with model if it doesn't exist + model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"]) + tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left") + return tokenizer From 836102f915da46fe389a58df6867d36cc83b5408 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Wed, 29 May 2024 13:49:07 +0530 Subject: [PATCH 07/20] added more type hinting and docstrings Signed-off-by: Onkar Chougule --- QEfficient/__init__.py | 20 +--- QEfficient/cloud/export.py | 2 +- QEfficient/cloud/infer.py | 2 +- .../exporter/export_hf_to_cloud_ai_100.py | 12 +- QEfficient/loader/loader.py | 17 ++- QEfficient/loader/loader_factory.py | 38 +++--- QEfficient/transformers/modeling_utils.py | 110 ++++++++++-------- QEfficient/utils/generate_inputs.py | 34 +----- QEfficient/utils/run_utils.py | 38 +++--- tests/utils.py | 2 +- 10 files changed, 131 insertions(+), 144 deletions(-) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index d9d032f27..fb4c517f7 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -5,21 +5,9 @@ # # ----------------------------------------------------------------------------- -from typing import Any, Union - +from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.loader import QEFFAutoModel # noqa: F401 -from QEfficient.loader.loader_factory import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFAutoModelForCausalLM -from QEfficient.transformers.modeling_utils import transform as transform_hf - +from QEfficient.transformers.modeling_utils import transform # noqa: F401 -def transform(model: Union[QEFFAutoModelForCausalLM, Any], form_factor="cloud"): - """Low level apis in library - model : instance of nn.Module - type : Transformers | Diffusers, default : Transformers - """ - assert form_factor == "cloud", "Only form_factor='cloud' is supported as of now!" 
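For reference, the cache-lookup helpers defined in _utils.py above are expected to be used roughly as follows; the model card name and the qpc directory name are placeholders:

from QEfficient.utils._utils import load_hf_tokenizer, onnx_exists, qpc_exists

model_name = "gpt2"                                               # example checkpoint
onnx_found, onnx_dir_path, onnx_model_path = onnx_exists(model_name)
qpc_found, qpc_dir_path = qpc_exists(model_name, qpc_base_dir_name="qpc_16cores_1bs")  # placeholder name
tokenizer = load_hf_tokenizer(model_name)

# Callers can skip re-export / re-compile when the artifacts are already on disk.
print(onnx_found, qpc_found)
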
- if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.LLM: - transform_hf(model.model, form_factor) - return model - else: - raise NotImplementedError(f"Recieved unsupported class of type {type(model)}") +# Users can use QEfficient.export for exporting models to ONNX +export = qualcomm_efficient_converter diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index 2b7201c8e..4ac2f6a05 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -9,7 +9,7 @@ import os from typing import Optional -import QEfficient +import QEfficient.transformers.modeling_utils from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.loader import QEFFAutoModel from QEfficient.utils import load_hf_tokenizer, onnx_exists diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 326096573..14b61465b 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -9,7 +9,7 @@ import os from typing import List -import QEfficient +import QEfficient.transformers.modeling_utils from QEfficient.cloud.compile import main as compile from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.generation.text_generation_inference import ( diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index c2568a735..e8b9a23ac 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -7,19 +7,19 @@ import os import shutil -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Type, Union import torch from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast -import QEfficient +import QEfficient.transformers.modeling_utils from QEfficient.exporter.export_utils import export_onnx, fix_onnx_fp16, generate_input_files, run_model_on_ort from QEfficient.loader.loader import QEFFAutoModel from QEfficient.loader.loader_factory import ( AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFAutoModelForCausalLM, - QEFFBaseAutoModelFactory, + QEFFBaseModel, ) from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants from QEfficient.utils.logging_utils import logger @@ -376,7 +376,7 @@ def export_for_edge() -> None: raise NotImplementedError("Oops...reached too far!!") -def export_for_cloud(model_name: str, qeff_model: QEFFBaseAutoModelFactory, +def export_for_cloud(model_name: str, qeff_model: QEFFBaseModel, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], onnx_dir_path: str, seq_length: int = Constants.seq_length, return_path: bool = True, @@ -445,7 +445,7 @@ def export_lm_model_for_cloud(model_name:str, qeff_model: QEFFAutoModelForCausal def qualcomm_efficient_converter( model_name: str, - model_kv: Optional[QEFFBaseAutoModelFactory] = None, # type: ignore + model_kv: Optional[Type[QEFFBaseModel]] = None, # type: ignore tokenizer: Optional[Union[PreTrainedTokenizer, PreTrainedTokenizerFast]]=None, onnx_dir_path: Optional[str]=None, hf_token: Optional[str] = None, @@ -483,7 +483,7 @@ def qualcomm_efficient_converter( if model_kv.is_transformed and not kv: raise AttributeError("Transformed model is passed while requsting to convert non-transformed model") - model_kv: QEFFBaseAutoModelFactory = QEfficient.transform(model_kv) if kv else model_kv + model_kv: Type[QEFFBaseModel] = QEfficient.transform(model_kv) if kv else model_kv if onnx_dir_path is None: diff --git 
a/QEfficient/loader/loader.py b/QEfficient/loader/loader.py index 185434dae..94ae672be 100644 --- a/QEfficient/loader/loader.py +++ b/QEfficient/loader/loader.py @@ -6,28 +6,35 @@ # ----------------------------------------------------------------------------- import os -from typing import Any +from typing import Any, Type from QEfficient.loader.loader_factory import ( MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP, - QEFFBaseAutoModelFactory, + QEFFBaseModel, get_hf_model_type, ) from QEfficient.utils._utils import login_and_download_hf_lm class QEFFAutoModel: + """ + Provides HuggingFace model loading interface same as transformers APIs. + Supports loading any model on HuggingFace. + """ def __init__(self, *args: Any, **kwds: Any) -> None: raise EnvironmentError( f"{self.__class__.__name__} is designed to be instantiated " f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)`") @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> QEFFBaseAutoModelFactory: + def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> Type[QEFFBaseModel]: + """ + Downloads HuggingFace model if already doesn't exist locally, returns QEffAutoModel object based on type of model. + """ pretrained_model_name_or_path = pretrained_model_name_or_path if os.path.isdir(pretrained_model_name_or_path) \ else login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs) model_type = get_hf_model_type(hf_model_path=pretrained_model_name_or_path) qeff_auto_model_class = MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP[model_type] - assert issubclass(qeff_auto_model_class, QEFFBaseAutoModelFactory), f"Expected class that inherits {QEFFBaseAutoModelFactory}, got {type(qeff_auto_model_class)}" + assert issubclass(qeff_auto_model_class, QEFFBaseModel), f"Expected class that inherits {QEFFBaseModel}, got {type(qeff_auto_model_class)}" - return qeff_auto_model_class.from_pretrained(pretrained_model_name_or_path) + return qeff_auto_model_class.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path) diff --git a/QEfficient/loader/loader_factory.py b/QEfficient/loader/loader_factory.py index 2b13e59be..a803b22fe 100644 --- a/QEfficient/loader/loader_factory.py +++ b/QEfficient/loader/loader_factory.py @@ -8,26 +8,31 @@ import os from abc import ABC, abstractmethod from enum import Enum -from typing import Any +from typing import Any, Dict, Type import torch.nn as nn from transformers import AutoConfig, AutoModelForCausalLM from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING -import QEfficient +import QEfficient.transformers.modeling_utils from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict -class QEFFBaseAutoModelFactory(ABC): - +class QEFFBaseModel(ABC): + """ + This class acts as parent class for all the varieties of model class (i.e. LLMs, SD, quantized etc.). + Enforces certain methods to be implemented by child classes. + + All the child classes must provide way to load, transform(optimize), exoprt to ONNX etc. capabilities. 
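A minimal sketch of the loader entry point documented above (gpt2 is just an example checkpoint):

from QEfficient import QEFFAutoModel

# Downloads the checkpoint if needed and returns the matching QEff wrapper,
# e.g. QEFFAutoModelForCausalLM for causal-LM checkpoints.
qeff_model = QEFFAutoModel.from_pretrained("gpt2")
print(type(qeff_model).__name__)
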
+ """ def __init__(self) -> None: super().__init__() # Users can call generate or execute self.generate = self.execute - @abstractmethod - def from_pretrained(self, pretrained_model_name_or_path: str, *args, **kwargs): - pass + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): + raise NotImplementedError("Must implement for child classes") @property def is_transformed(self) -> bool: @@ -46,7 +51,10 @@ def export(self, *args, **kwargs) -> Any: pass -class QEFFAutoModelForCausalLM(QEFFBaseAutoModelFactory): +class QEFFAutoModelForCausalLM(QEFFBaseModel): + """ + QEFF class for manipulating any causal language model from HuggingFace hub. + """ def __init__(self, model: nn.Module, pretrained_model_name_or_path: str) -> None: assert (model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values() or model.__class__ in TransformersToQEffModulesDict.values()), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. {MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore @@ -66,7 +74,7 @@ def execute(self, *args, **kwargs): # type: ignore raise NotImplementedError("Reached too far!!") def transform(self): - QEfficient.transform(self.model) + QEfficient.transformers.modeling_utils.transform_lm(self.model) return self def export(self): @@ -82,19 +90,23 @@ class QEFF_MODEL_TYPE(Enum): AWQ = "AWQ" -MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP = { +MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP: Dict[QEFF_MODEL_TYPE, Type[QEFFBaseModel]] = { QEFF_MODEL_TYPE.LLM: QEFFAutoModelForCausalLM } -AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP = {v:k for k,v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items()} +AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP: Dict[Type[QEFFBaseModel], QEFF_MODEL_TYPE] = {v:k for k,v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items()} -def get_hf_model_type(hf_model_path: str): - assert os.path.isdir(hf_model_path), "Pleae pass local dir path where the model is downloaded use `QEfficient.utils.login_and_download_hf_lm` for downloading hf model" +def get_hf_model_type(hf_model_path: str) -> QEFF_MODEL_TYPE: + """ + Loads model config file and returns the type of the model (i.e. LLMs, SD, quantized etc.) as supported by the library. 
+ """ + assert os.path.isdir(hf_model_path), "Pleae pass local dir path where the model is downloaded; use `QEfficient.utils.login_and_download_hf_lm` for downloading hf model" config, kwargs = AutoConfig.from_pretrained( hf_model_path, return_unused_kwargs=True, ) + if config.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING: # FIXME: Add logic to handle if quantization config is stored in separate quant_config.json outside of config, also create a separate function for this and below lines quant_config = getattr(config, "quantization_config", getattr(config, "quant_config", None)) diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index 5ad29ef3d..9253ae54c 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -7,6 +7,7 @@ import hashlib from collections import namedtuple +from typing import Dict, Type import torch.nn as nn import transformers @@ -34,17 +35,22 @@ ) from transformers.models.mixtral.modeling_mixtral import ( MixtralAttention, + MixtralBLockSparseTop2MLP, + MixtralDecoderLayer, MixtralForCausalLM, MixtralModel, - MixtralDecoderLayer, - MixtralSparseMoeBlock, - MixtralBLockSparseTop2MLP, - MixtralRotaryEmbedding, MixtralRMSNorm, + MixtralRotaryEmbedding, + MixtralSparseMoeBlock, ) from transformers.models.mpt.modeling_mpt import MptAttention, MptBlock, MptForCausalLM, MptModel from QEfficient.customop import CustomRMSNormAIC +from QEfficient.loader.loader_factory import ( + AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, + QEFF_MODEL_TYPE, + QEFFBaseModel, +) from QEfficient.utils.logging_utils import logger from .modeling_attn_mask_utils import ( @@ -81,13 +87,13 @@ QEffMistralRotaryEmbedding, ) from .models.mixtral_moe.modeling_mixtral import ( - QEffMixtralModel, - QEffMixtralRotaryEmbedding, QEffMixtralAttention, - QEffMixtralForCausalLM, + QEffMixtralBLockSparseTop2MLP, QEffMixtralDecoderLayer, + QEffMixtralForCausalLM, + QEffMixtralModel, + QEffMixtralRotaryEmbedding, QEffMixtralSparseMoeBlock, - QEffMixtralBLockSparseTop2MLP, ) from .models.mpt.modeling_mpt import QEffMptAttention, QEffMptBlock, QEffMptForCausalLM, QEFfMptModel @@ -108,7 +114,7 @@ # Define a transformers layers to QEff layers dictionary # While onboarding new models make sure to add the new layer maps to this dictionary. -TransformersToQEffModulesDict = { +TransformersToQEffModulesDict: Dict[Type[nn.Module], Type[nn.Module]] = { # GPT model layers GPT2Model: QEffGPT2Model, GPT2Block: QEffGPT2Block, @@ -179,13 +185,12 @@ def replace_module_with_qeff_layers(model: nn.Module) -> None: replace_module_with_qeff_layers(module) -def transform(model: nn.Module, form_factor: str = "cloud") -> nn.Module: +def transform_lm(model: nn.Module) -> nn.Module: """ - Replaces some Transformers' methods for equivalent methods optimized for AI 100. + Replaces some Transformers torch.nn.Module layers for equivalent optimized modules for cloud AI 100. --------- Args: param model (torch.nn.Module): PyTorch model. - form_factor(str): form factor configuration for optmizing the model, available options=["cloud", "edge"]. Returns: torch.nn.Module: PyTorch Module with replaced QEff layers. 
@@ -196,38 +201,49 @@ def transform(model: nn.Module, form_factor: str = "cloud") -> nn.Module: print("Model is already transformed") return model - - if form_factor == "cloud": - # Get Hash of all params for checking later - prior_params_hash = get_params_hash(model) - logger.warning(f"The model {model.__class__} layers has been upadted to QEff layers in-place") - # Replace with QEff layers - replace_module_with_qeff_layers(model) - - # Check with new params hash - later_params_hash = get_params_hash(model) - assert ( - prior_params_hash == later_params_hash - ), "Weights were changed in the transform process, please report an issue" - - # Replace the modeling output classes - transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions = ( - QEffBaseModelOutputWithPastAndCrossAttentions - ) - transformers.modeling_outputs.CausalLMOutputWithCrossAttentions = QEffCausalLMOutputWithCrossAttentions - transformers.modeling_outputs.BaseModelOutputWithPast = QEffBaseModelOutputWithPast - transformers.modeling_outputs.CausalLMOutputWithPast = QEffCausalLMOutputWithPast - transformers.modeling_outputs.MoeCausalLMOutputWithPast = QEffMoeCausalLMOutputWithPast - transformers.modeling_outputs.MoeModelOutputWithPast = QEffMoeModelOutputWithPast - - # Replace the modeling attn util classes and functions - transformers.modeling_attn_mask_utils.AttentionMaskConverter = QEffAttentionMaskConverter - transformers.modeling_attn_mask_utils._prepare_4d_attention_mask = _qeff_prepare_4d_attention_mask - transformers.modeling_attn_mask_utils._prepare_4d_causal_attention_mask = _qeff_prepare_4d_causal_attention_mask - - setattr(model,'qeff_transformed',True) - return model.eval() - - elif form_factor == "edge": - # Add changes for the edge usecase - raise NotImplementedError("We currently only support cloud form factor!") + # Get Hash of all params for checking later + prior_params_hash = get_params_hash(model) + logger.warning(f"The model {model.__class__} layers has been upadted to QEff layers in-place") + # Replace with QEff layers + replace_module_with_qeff_layers(model) + + # Check with new params hash + later_params_hash = get_params_hash(model) + assert ( + prior_params_hash == later_params_hash + ), "Weights were changed in the transform process, please report an issue" + + # Replace the modeling output classes + transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions = ( + QEffBaseModelOutputWithPastAndCrossAttentions + ) + transformers.modeling_outputs.CausalLMOutputWithCrossAttentions = QEffCausalLMOutputWithCrossAttentions + transformers.modeling_outputs.BaseModelOutputWithPast = QEffBaseModelOutputWithPast + transformers.modeling_outputs.CausalLMOutputWithPast = QEffCausalLMOutputWithPast + transformers.modeling_outputs.MoeCausalLMOutputWithPast = QEffMoeCausalLMOutputWithPast + transformers.modeling_outputs.MoeModelOutputWithPast = QEffMoeModelOutputWithPast + + # Replace the modeling attn util classes and functions + transformers.modeling_attn_mask_utils.AttentionMaskConverter = QEffAttentionMaskConverter + transformers.modeling_attn_mask_utils._prepare_4d_attention_mask = _qeff_prepare_4d_attention_mask + transformers.modeling_attn_mask_utils._prepare_4d_causal_attention_mask = _qeff_prepare_4d_causal_attention_mask + + setattr(model,'qeff_transformed',True) + return model.eval() + + + +def transform(model: Type[QEFFBaseModel], form_factor="cloud"): + """ + This function serves for optimizing any kind of model (i.e. LLM, SD, AWQ etc.) for cloud AI 100. 
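The renamed module-level helper can also be driven directly on a raw HuggingFace module, which is how the test utilities use it; a minimal sketch, assuming a small example checkpoint:

from transformers import AutoModelForCausalLM

from QEfficient.transformers.modeling_utils import transform_lm

model_hf = AutoModelForCausalLM.from_pretrained("gpt2")   # example checkpoint
model_kv = transform_lm(model_hf)                         # in-place class swap; weights untouched
assert getattr(model_kv, "qeff_transformed", False)
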
+ Will replace the torch.nn.Module layers of passed QEffModel with optimized implementation of the same. + + model: object of any instance of class that is child of `QEFFBaseAutoModelFactory` + form_factor(str): form factor configuration for optmizing the model, available options=["cloud", "edge"]. + """ + assert form_factor == "cloud", "Only form_factor='cloud' is supported as of now!" + if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.LLM: + transform_lm(model.model, form_factor) # type: ignore + return model + else: + raise NotImplementedError(f"Recieved unsupported class of type {type(model)}") diff --git a/QEfficient/utils/generate_inputs.py b/QEfficient/utils/generate_inputs.py index deb7bcf32..9818f6123 100644 --- a/QEfficient/utils/generate_inputs.py +++ b/QEfficient/utils/generate_inputs.py @@ -5,44 +5,12 @@ # # ----------------------------------------------------------------------------- -from abc import ABC, abstractmethod import numpy as np import torch from QEfficient.utils.logging_utils import logger -class AwesomeInputHandler(ABC): - - def __init__(self) -> None: - super().__init__() - self.counter = 0 - - def reset(self): - self.counter = 0 - - def prepare_inputs(self, prompt, n_layer, padding_shape): - if self.counter!=0: - logger.warning("Resetting Input Handler as prepare_inputs is called even though it's in the middle of generating outputs") - self.reset() - - self._prepare_inputs(prompt, n_layer, padding_shape) - self.counter+=1 - - def update_inputs(self, outputs): - self._update_inputs(outputs) - self.counter+=1 - - @abstractmethod - def _prepare_inputs(self, prompt, n_layer, padding_shape): - pass - - @abstractmethod - def _update_inputs(self, outputs): - pass - - - class InputHandler: def __init__(self, tokenizer, input_str, prompt_len, ctx_len): """ @@ -53,7 +21,7 @@ def __init__(self, tokenizer, input_str, prompt_len, ctx_len): :param ctx_len: int """ if tokenizer.padding_side != "left": - logger.warning(f"Please use padding_side='left' while initializing the tokenizer") + logger.warning("Please use padding_side='left' while initializing the tokenizer") tokenizer.padding_side = "left" if tokenizer.pad_token_id is None: tokenizer.pad_token_id = tokenizer.eos_token_id diff --git a/QEfficient/utils/run_utils.py b/QEfficient/utils/run_utils.py index fbfc2b968..bc50df37a 100644 --- a/QEfficient/utils/run_utils.py +++ b/QEfficient/utils/run_utils.py @@ -14,26 +14,6 @@ from .generate_inputs import InputHandler -def run_hf_lm_model_with_pt(model_hf, tokenizer, prompt, gen_len): - input_ids = tokenizer.encode(prompt, return_tensors="pt") - - input_ids_len = len(input_ids[0]) - - with torch.no_grad(): - for _ in range(gen_len): - outputs = model_hf(input_ids) - logits = outputs.logits[:, -1, :] - predicted_token_id = torch.argmax(logits, dim=-1) - input_ids = torch.cat([input_ids, predicted_token_id.unsqueeze(1)], dim=-1) - - generated_ids = input_ids[0][input_ids_len:].detach().numpy() - generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True) - print("Original HF Model Outputs (Torch CPU): \n") - print("Prompt:", repr(prompt)) - print("Completion:", repr(generated_text)) - return generated_ids - - class ApiRunner: """ ApiRunner class is responsible for: @@ -71,7 +51,23 @@ def run_hf_model_on_pytorch(self, model_hf): :param model_hf: pytorch model :return generated_ids: numpy.ndarray - output tokens """ - return run_hf_lm_model_with_pt(model_hf, self.tokenizer, self.prompt[0], self.gen_len) + input_ids = 
self.tokenizer.encode(self.prompt[0], return_tensors="pt") + + input_ids_len = len(input_ids[0]) + + with torch.no_grad(): + for _ in range(self.gen_len): + outputs = model_hf(input_ids) + logits = outputs.logits[:, -1, :] + predicted_token_id = torch.argmax(logits, dim=-1) + input_ids = torch.cat([input_ids, predicted_token_id.unsqueeze(1)], dim=-1) + + generated_ids = input_ids[0][input_ids_len:].detach().numpy() + generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True) + print("Original HF Model Outputs (Torch CPU): \n") + print("Prompt:", repr(self.prompt)) + print("Completion:", repr(generated_text)) + return generated_ids def run_kv_model_on_pytorch(self, model, n_layer, padding_shape): diff --git a/tests/utils.py b/tests/utils.py index 0760e3613..3ef42f82c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -101,7 +101,7 @@ def transform_pt_model_with_qeff(model_hf): :param model_hf: pytorch model :return model_kv """ - model_kv = QEfficient.transformers.modeling_utils.transform(model_hf, form_factor="cloud") + model_kv = QEfficient.transformers.modeling_utils.transform_lm(model_hf) model_kv.eval() return model_kv From 2cd787fbe599513258f15e601620893919c6fef7 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Thu, 30 May 2024 12:07:25 +0530 Subject: [PATCH 08/20] addressed review comments, added test file for new interface Signed-off-by: Onkar Chougule --- .../exporter/export_hf_to_cloud_ai_100.py | 19 +++++----- QEfficient/loader/loader_factory.py | 33 +++++++++++++---- QEfficient/transformers/modeling_utils.py | 2 +- QEfficient/utils/_utils.py | 7 ++-- tests/test_loader.py | 36 +++++++++++++++++++ tests/utils.py | 15 +++----- 6 files changed, 81 insertions(+), 31 deletions(-) create mode 100644 tests/test_loader.py diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index e8b9a23ac..8e012cb1b 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -260,11 +260,10 @@ def export_kvstyle_transformed_model_to_onnx(model_name: str, transformed_model: else: inputs = tokenizer(input_str, return_tensors="pt") - try: - pt_outputs = transformed_model(**inputs) - output_names = list(pt_outputs.keys()) - except Exception as e: - print(f"Model {transformed_model.__class__,__name__} Execution failed in pytorch:%s", e) + + pt_outputs = transformed_model(**inputs) + output_names = list(pt_outputs.keys()) + # Raise error if expected outputs are not present assert "logits" in output_names, "logits not found in output" @@ -282,11 +281,9 @@ def export_kvstyle_transformed_model_to_onnx(model_name: str, transformed_model: inputs["past_key_values"] = tuple([(key.detach(), value.detach()) for key, value in pt_outputs.past_key_values]) # Run PyTorch inference with past - try: - pt_outputs = transformed_model(**inputs) - output_names = list(pt_outputs.keys()) - except Exception as e: - print(f"Model {transformed_model.__class__,__name__} Execution failed in pytorch:%s", e) + pt_outputs = transformed_model(**inputs) + output_names = list(pt_outputs.keys()) + # Add pkv into output_names pkv = tuple([(key.detach(), value.detach()) for key, value in pt_outputs.past_key_values]) @@ -382,7 +379,7 @@ def export_for_cloud(model_name: str, qeff_model: QEFFBaseModel, return_path: bool = True, save_fp32_onnx: bool = False, save_fp16_onnx: bool = True): - if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.LLM: # type: ignore + if 
AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: # type: ignore return export_lm_model_for_cloud(model_name=model_name, qeff_model=qeff_model, # type: ignore tokenizer=tokenizer, diff --git a/QEfficient/loader/loader_factory.py b/QEfficient/loader/loader_factory.py index a803b22fe..66ca42154 100644 --- a/QEfficient/loader/loader_factory.py +++ b/QEfficient/loader/loader_factory.py @@ -38,6 +38,14 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): def is_transformed(self) -> bool: raise NotImplementedError("Must implement for child classes") + @abstractmethod + def transform_export(self, *args, **kwargs) -> Any: + pass + + @abstractmethod + def transform_export_compile(self, *args, **kwargs) -> Any: + pass + @abstractmethod def execute(self, *args, **kwargs) -> Any: pass @@ -49,6 +57,10 @@ def transform(self, *args, **kwargs) -> Any: @abstractmethod def export(self, *args, **kwargs) -> Any: pass + + @abstractmethod + def compile(self, *args, **kwargs) -> Any: + pass class QEFFAutoModelForCausalLM(QEFFBaseModel): @@ -61,6 +73,9 @@ def __init__(self, model: nn.Module, pretrained_model_name_or_path: str) -> None self.model: nn.Module = model self.model_files_path = pretrained_model_name_or_path + def __repr__(self) -> str: + return self.model.__repr__() + @property def is_transformed(self) -> bool: return getattr(self.model, "qeff_transformed", False) @@ -70,6 +85,12 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) return cls(model=model, pretrained_model_name_or_path=pretrained_model_name_or_path) + def transform_export(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") + + def transform_export_compile(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") + def execute(self, *args, **kwargs): # type: ignore raise NotImplementedError("Reached too far!!") @@ -80,18 +101,18 @@ def transform(self): def export(self): raise NotImplementedError("Reached too far!!") - def __repr__(self) -> str: - return self.model.__repr__() + def compile(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") class QEFF_MODEL_TYPE(Enum): - LLM = "LLM" - STABLE_DIFFUSION = "STABLE_DIFFUSION" + CAUSALLM = "LLM" + DIFFUSION = "STABLE_DIFFUSION" AWQ = "AWQ" MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP: Dict[QEFF_MODEL_TYPE, Type[QEFFBaseModel]] = { - QEFF_MODEL_TYPE.LLM: QEFFAutoModelForCausalLM + QEFF_MODEL_TYPE.CAUSALLM: QEFFAutoModelForCausalLM } AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP: Dict[Type[QEFFBaseModel], QEFF_MODEL_TYPE] = {v:k for k,v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items()} @@ -116,6 +137,6 @@ def get_hf_model_type(hf_model_path: str) -> QEFF_MODEL_TYPE: else: raise NotImplementedError(f"current model type is not yet supported {type(config)}") else: - return QEFF_MODEL_TYPE.LLM + return QEFF_MODEL_TYPE.CAUSALLM else: raise NotImplementedError(f"model type {type(config)} is not yet supported") diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index 9253ae54c..592b085ff 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -242,7 +242,7 @@ def transform(model: Type[QEFFBaseModel], form_factor="cloud"): form_factor(str): form factor configuration for optmizing the model, available options=["cloud", "edge"]. 
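With the additional abstract hooks added above, a concrete wrapper now has to provide the full load/transform/export/compile surface; a bare-bones sketch of a conforming subclass (method bodies are placeholders, not a real backend):

from typing import Any

from QEfficient.loader.loader_factory import QEFFBaseModel


class MyQEFFModel(QEFFBaseModel):
    # Placeholder implementation: each hook would map to a real load/optimize/export/compile step.
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> "MyQEFFModel":
        return cls()

    @property
    def is_transformed(self) -> bool:
        return False

    def transform_export(self, *args, **kwargs) -> Any: ...
    def transform_export_compile(self, *args, **kwargs) -> Any: ...
    def execute(self, *args, **kwargs) -> Any: ...
    def transform(self, *args, **kwargs) -> Any: ...
    def export(self, *args, **kwargs) -> Any: ...
    def compile(self, *args, **kwargs) -> Any: ...


m = MyQEFFModel.from_pretrained("any-checkpoint")   # placeholder id; no real loading happens here
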
""" assert form_factor == "cloud", "Only form_factor='cloud' is supported as of now!" - if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.LLM: + if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: transform_lm(model.model, form_factor) # type: ignore return model else: diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 80f1f0c46..049462514 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -75,7 +75,10 @@ def hf_download( def qpc_exists(model_name: str, qpc_base_dir_name: str) -> Tuple[bool, str]: """ - Checks if qpc files already exists, removes the directory if files have been manipulated. + Checks if qpc dir exists. + Returns + 1. Boolean variable indicating if qpc files exist + 2. Path of the qpc dir if found. --------- :param model_name: str. HF Model card name. :param dir_path: str. Path of qpc directory. @@ -120,5 +123,5 @@ def load_hf_tokenizer(model_name: str, cache_dir: Optional[str] = None, hf_token # Download tokenizer along with model if it doesn't exist model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"]) - tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left") + tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left", trust_remote_code=True) return tokenizer diff --git a/tests/test_loader.py b/tests/test_loader.py new file mode 100644 index 000000000..56e81f666 --- /dev/null +++ b/tests/test_loader.py @@ -0,0 +1,36 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +from typing import Any, Dict + +import pytest +from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel + +import QEfficient.transformers.modeling_utils +from QEfficient import QEFFAutoModel +from QEfficient.loader.loader_factory import QEFFAutoModelForCausalLM + +model_name_to_params_dict : Dict[str, Dict[str, Any]] = { + "gpt2": { + "qeff_class": QEFFAutoModelForCausalLM, + "hf_class": GPT2LMHeadModel, + "prompt": "Equator is" + }, + +} +model_names = model_name_to_params_dict.keys() + + +@pytest.mark.parametrize("model_name", model_names) +def test_qeff_auto_model_for_causal_lm(model_name: str): + model = QEFFAutoModel.from_pretrained(model_name) + assert isinstance(model, model_name_to_params_dict[model_name]['qeff_class']) + assert isinstance(model.model, model_name_to_params_dict[model_name]['hf_class']) # type: ignore + + # Run transform + QEfficient.transform(model) + print(model) diff --git a/tests/utils.py b/tests/utils.py index 3ef42f82c..37dfd5795 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -10,15 +10,11 @@ import shutil import unittest -import transformers - -import QEfficient -import QEfficient.transformers -import QEfficient.transformers.modeling_utils from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100 from QEfficient.loader.loader_factory import QEFFAutoModelForCausalLM -from QEfficient.utils import hf_download +from QEfficient.transformers.modeling_utils import transform_lm +from QEfficient.utils import hf_download, load_hf_tokenizer from QEfficient.utils.constants import 
QEFF_MODELS_DIR, ROOT_DIR, Constants from QEfficient.utils.device_utils import get_available_device_id, is_multi_qranium_setup_available, is_qpc_size_gt_32gb from QEfficient.utils.run_utils import ApiRunner @@ -70,10 +66,7 @@ def get_tokenizer(model_name): :param model_name: str :return tokenizer """ - model_hf_path = hf_download(repo_id=model_name, allow_patterns=["*.json"]) - tokenizer = transformers.AutoTokenizer.from_pretrained(model_hf_path, padding_side="left") - if tokenizer.pad_token_id is None: - tokenizer.pad_token_id = tokenizer.eos_token_id + tokenizer = load_hf_tokenizer(model_name=model_name) return tokenizer @@ -101,7 +94,7 @@ def transform_pt_model_with_qeff(model_hf): :param model_hf: pytorch model :return model_kv """ - model_kv = QEfficient.transformers.modeling_utils.transform_lm(model_hf) + model_kv = transform_lm(model_hf) model_kv.eval() return model_kv From 1a895dc3e7c0e89b80c26d5db09a90ae3fc53962 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Thu, 30 May 2024 18:46:42 +0530 Subject: [PATCH 09/20] enabled CLI APIs Signed-off-by: Onkar Chougule --- QEfficient/__init__.py | 2 +- QEfficient/cloud/export.py | 2 +- QEfficient/cloud/infer.py | 25 +++- .../exporter/export_hf_to_cloud_ai_100.py | 11 +- QEfficient/loader/loader.py | 42 +++++- QEfficient/loader/loader_factory.py | 62 +++------ QEfficient/transformers/modeling_utils.py | 115 ---------------- QEfficient/transformers/transform.py | 123 ++++++++++++++++++ QEfficient/utils/_utils.py | 11 +- QEfficient/utils/logging_utils.py | 25 ++-- tests/test_loader.py | 2 +- tests/utils.py | 2 +- 12 files changed, 220 insertions(+), 202 deletions(-) create mode 100644 QEfficient/transformers/transform.py diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index fb4c517f7..09a6ae7fa 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -7,7 +7,7 @@ from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.loader import QEFFAutoModel # noqa: F401 -from QEfficient.transformers.modeling_utils import transform # noqa: F401 +from QEfficient.transformers.transform import transform # noqa: F401 # Users can use QEfficient.export for exporting models to ONNX export = qualcomm_efficient_converter diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index 4ac2f6a05..2b7201c8e 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -9,7 +9,7 @@ import os from typing import Optional -import QEfficient.transformers.modeling_utils +import QEfficient from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.loader import QEFFAutoModel from QEfficient.utils import load_hf_tokenizer, onnx_exists diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 14b61465b..f1a56931c 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -6,10 +6,11 @@ # ----------------------------------------------------------------------------- import argparse +import logging import os -from typing import List +from typing import List, Optional -import QEfficient.transformers.modeling_utils +import QEfficient from QEfficient.cloud.compile import main as compile from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.generation.text_generation_inference import ( @@ -32,12 +33,12 @@ def main( model_name: str, num_cores: int, - prompt: str = None, - prompts_txt_file_path: str = None, + prompt: Optional[str] = None, # type: ignore + 
prompts_txt_file_path: Optional[str] = None, aic_enable_depth_first: bool = False, mos: int = -1, cache_dir: str = Constants.CACHE_DIR, - hf_token: str = None, + hf_token: Optional[str] = None, batch_size: int = 1, prompt_len: int = 32, ctx_len: int = 128, @@ -64,8 +65,9 @@ def main( if qpc_path_exists: # execute - logger.info("Pre-compiled qpc found! Trying to execute with given prompt") + logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! Executing with given prompt") elif onnx_path_exists: + logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation") # Compile -> execute # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation generated_qpc_path = compile( @@ -96,9 +98,10 @@ def main( logger.info(f"Model after Optimized transformations {qeff_opt_model}") # Export to the Onnx - logger.info(f"Exporting to Pytorch {model_name} to ONNX...") + logger.info(f"Exporting Pytorch {model_name} model to ONNX...") # Need to split below function into two functions one which always takes QEFFAutoModel and other with same interface as below base_path, generated_onnx_path = qualcomm_efficient_converter( + model_name=model_name, model_kv=qeff_opt_model, # type: ignore tokenizer=tokenizer, onnx_dir_path=onnx_dir_path, @@ -204,6 +207,14 @@ def main( default=-1, help="Effort level to reduce the on-chip memory", ) + #FIXME: Add verbose feature + parser.add_argument( + "--verbose","-v", + action="store_true", + help="pass to print info logs", + ) args = parser.parse_args() + if args.verbose: + logger.setLevel(logging.INFO) main(**args.__dict__) diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index 8e012cb1b..d5da3f422 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -12,18 +12,17 @@ import torch from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast -import QEfficient.transformers.modeling_utils +import QEfficient from QEfficient.exporter.export_utils import export_onnx, fix_onnx_fp16, generate_input_files, run_model_on_ort -from QEfficient.loader.loader import QEFFAutoModel +from QEfficient.loader.loader import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFFAutoModel from QEfficient.loader.loader_factory import ( - AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFAutoModelForCausalLM, QEFFBaseModel, ) +from QEfficient.utils._utils import load_hf_tokenizer from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants from QEfficient.utils.logging_utils import logger -from QEfficient.utils._utils import load_hf_tokenizer def convert_to_cloud_bertstyle( @@ -442,7 +441,7 @@ def export_lm_model_for_cloud(model_name:str, qeff_model: QEFFAutoModelForCausal def qualcomm_efficient_converter( model_name: str, - model_kv: Optional[Type[QEFFBaseModel]] = None, # type: ignore + model_kv: QEFFBaseModel = None, # type: ignore tokenizer: Optional[Union[PreTrainedTokenizer, PreTrainedTokenizerFast]]=None, onnx_dir_path: Optional[str]=None, hf_token: Optional[str] = None, @@ -480,7 +479,7 @@ def qualcomm_efficient_converter( if model_kv.is_transformed and not kv: raise AttributeError("Transformed model is passed while requsting to convert non-transformed model") - model_kv: Type[QEFFBaseModel] = QEfficient.transform(model_kv) if kv else model_kv + model_kv = model_kv if model_kv.is_transformed else QEfficient.transform(model_kv) if kv else model_kv if onnx_dir_path is None: diff --git 
a/QEfficient/loader/loader.py b/QEfficient/loader/loader.py index 94ae672be..99295555f 100644 --- a/QEfficient/loader/loader.py +++ b/QEfficient/loader/loader.py @@ -6,15 +6,43 @@ # ----------------------------------------------------------------------------- import os -from typing import Any, Type +from typing import Any, Dict, Type -from QEfficient.loader.loader_factory import ( - MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP, - QEFFBaseModel, - get_hf_model_type, -) +from transformers import AutoConfig +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING + +from QEfficient.loader.loader_factory import QEFF_MODEL_TYPE, QEFFAutoModelForCausalLM, QEFFBaseModel from QEfficient.utils._utils import login_and_download_hf_lm +MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP: Dict[QEFF_MODEL_TYPE, Type[QEFFBaseModel]] = { + QEFF_MODEL_TYPE.CAUSALLM: QEFFAutoModelForCausalLM +} + +AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP: Dict[Type[QEFFBaseModel], QEFF_MODEL_TYPE] = {v:k for k,v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items()} + +def get_hf_model_type(hf_model_path: str) -> QEFF_MODEL_TYPE: + """ + Loads model config file and returns the type of the model (i.e. LLMs, SD, quantized etc.) as supported by the library. + """ + assert os.path.isdir(hf_model_path), "Pleae pass local dir path where the model is downloaded; use `QEfficient.utils.login_and_download_hf_lm` for downloading hf model" + config, kwargs = AutoConfig.from_pretrained( + hf_model_path, + return_unused_kwargs=True, + ) + + if config.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING: + # FIXME: Add logic to handle if quantization config is stored in separate quant_config.json outside of config, also create a separate function for this and below lines + quant_config = getattr(config, "quantization_config", getattr(config, "quant_config", None)) + if quant_config is not None: + if quant_config.get("quant_method", None) == "awq": + return QEFF_MODEL_TYPE.AWQ + else: + raise NotImplementedError(f"current model type is not yet supported {type(config)}") + else: + return QEFF_MODEL_TYPE.CAUSALLM + else: + raise NotImplementedError(f"model type {type(config)} is not yet supported") + class QEFFAutoModel: """ @@ -27,7 +55,7 @@ def __init__(self, *args: Any, **kwds: Any) -> None: f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)`") @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> Type[QEFFBaseModel]: + def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> QEFFBaseModel: """ Downloads HuggingFace model if already doesn't exist locally, returns QEffAutoModel object based on type of model. 
""" diff --git a/QEfficient/loader/loader_factory.py b/QEfficient/loader/loader_factory.py index 66ca42154..23d210f30 100644 --- a/QEfficient/loader/loader_factory.py +++ b/QEfficient/loader/loader_factory.py @@ -5,19 +5,24 @@ # # ---------------------------------------------------------------------------- -import os from abc import ABC, abstractmethod from enum import Enum -from typing import Any, Dict, Type +from typing import Any import torch.nn as nn -from transformers import AutoConfig, AutoModelForCausalLM +from transformers import AutoModelForCausalLM from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING -import QEfficient.transformers.modeling_utils +import QEfficient from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict +class QEFF_MODEL_TYPE(Enum): + CAUSALLM = "LLM" + DIFFUSION = "STABLE_DIFFUSION" + AWQ = "AWQ" + + class QEFFBaseModel(ABC): """ This class acts as parent class for all the varieties of model class (i.e. LLMs, SD, quantized etc.). @@ -29,11 +34,11 @@ def __init__(self) -> None: super().__init__() # Users can call generate or execute self.generate = self.execute - + @classmethod def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): raise NotImplementedError("Must implement for child classes") - + @property def is_transformed(self) -> bool: raise NotImplementedError("Must implement for child classes") @@ -49,11 +54,11 @@ def transform_export_compile(self, *args, **kwargs) -> Any: @abstractmethod def execute(self, *args, **kwargs) -> Any: pass - + @abstractmethod def transform(self, *args, **kwargs) -> Any: pass - + @abstractmethod def export(self, *args, **kwargs) -> Any: pass @@ -61,7 +66,7 @@ def export(self, *args, **kwargs) -> Any: @abstractmethod def compile(self, *args, **kwargs) -> Any: pass - + class QEFFAutoModelForCausalLM(QEFFBaseModel): """ @@ -95,7 +100,7 @@ def execute(self, *args, **kwargs): # type: ignore raise NotImplementedError("Reached too far!!") def transform(self): - QEfficient.transformers.modeling_utils.transform_lm(self.model) + QEfficient.transform(self) return self def export(self): @@ -103,40 +108,3 @@ def export(self): def compile(self, *args, **kwargs) -> Any: raise NotImplementedError("Reached too far!!") - - -class QEFF_MODEL_TYPE(Enum): - CAUSALLM = "LLM" - DIFFUSION = "STABLE_DIFFUSION" - AWQ = "AWQ" - - -MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP: Dict[QEFF_MODEL_TYPE, Type[QEFFBaseModel]] = { - QEFF_MODEL_TYPE.CAUSALLM: QEFFAutoModelForCausalLM -} - -AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP: Dict[Type[QEFFBaseModel], QEFF_MODEL_TYPE] = {v:k for k,v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items()} - - -def get_hf_model_type(hf_model_path: str) -> QEFF_MODEL_TYPE: - """ - Loads model config file and returns the type of the model (i.e. LLMs, SD, quantized etc.) as supported by the library. 
- """ - assert os.path.isdir(hf_model_path), "Pleae pass local dir path where the model is downloaded; use `QEfficient.utils.login_and_download_hf_lm` for downloading hf model" - config, kwargs = AutoConfig.from_pretrained( - hf_model_path, - return_unused_kwargs=True, - ) - - if config.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING: - # FIXME: Add logic to handle if quantization config is stored in separate quant_config.json outside of config, also create a separate function for this and below lines - quant_config = getattr(config, "quantization_config", getattr(config, "quant_config", None)) - if quant_config is not None: - if quant_config.get("quant_method", None) == "awq": - return QEFF_MODEL_TYPE.AWQ - else: - raise NotImplementedError(f"current model type is not yet supported {type(config)}") - else: - return QEFF_MODEL_TYPE.CAUSALLM - else: - raise NotImplementedError(f"model type {type(config)} is not yet supported") diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index 592b085ff..753d08204 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -5,12 +5,10 @@ # # ----------------------------------------------------------------------------- -import hashlib from collections import namedtuple from typing import Dict, Type import torch.nn as nn -import transformers from transformers.models.codegen.modeling_codegen import ( CodeGenAttention, CodeGenBlock, @@ -46,26 +44,7 @@ from transformers.models.mpt.modeling_mpt import MptAttention, MptBlock, MptForCausalLM, MptModel from QEfficient.customop import CustomRMSNormAIC -from QEfficient.loader.loader_factory import ( - AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, - QEFF_MODEL_TYPE, - QEFFBaseModel, -) -from QEfficient.utils.logging_utils import logger -from .modeling_attn_mask_utils import ( - QEffAttentionMaskConverter, - _qeff_prepare_4d_attention_mask, - _qeff_prepare_4d_causal_attention_mask, -) -from .modeling_outputs import ( - QEffBaseModelOutputWithPast, - QEffBaseModelOutputWithPastAndCrossAttentions, - QEffCausalLMOutputWithCrossAttentions, - QEffCausalLMOutputWithPast, - QEffMoeCausalLMOutputWithPast, - QEffMoeModelOutputWithPast, -) from .models.codegen.modeling_codegen import ( QEffCodeGenAttention, QEffCodeGenBlock, @@ -153,97 +132,3 @@ MixtralSparseMoeBlock: QEffMixtralSparseMoeBlock, MixtralBLockSparseTop2MLP:QEffMixtralBLockSparseTop2MLP, } - - -def get_params_hash(model: nn.Module) -> str: - """ - Creates a Hash of all the parameters values i.e. weights using SHA256 algo. - -------- - :param model: torch.nn.Module. Base PyTorch model. - :returns: str. Hash string - """ - hasher = hashlib.sha256() - for _, params in model.named_parameters(): - hasher.update(params.data.numpy().tobytes()) - - return hasher.hexdigest() - - -def replace_module_with_qeff_layers(model: nn.Module) -> None: - """ - Replaces the transformers nn.Module classes with optmized QEff classes in place. - ---------- - :param model: torch.nn.Module. Base PyTorch model. - """ - # Replace if module class is registed in TransformersToQEffModulesDict - target_module = TransformersToQEffModulesDict.get(model.__class__) - if target_module is not None: - model.__class__ = target_module - - # Iterate over child modules - for _, module in model.named_children(): - replace_module_with_qeff_layers(module) - - -def transform_lm(model: nn.Module) -> nn.Module: - """ - Replaces some Transformers torch.nn.Module layers for equivalent optimized modules for cloud AI 100. 
- --------- - Args: - param model (torch.nn.Module): PyTorch model. - - Returns: - torch.nn.Module: PyTorch Module with replaced QEff layers. - """ - - # Introducnig qeff_transformed attribue in model to check status of transform - if getattr(model, "qeff_transformed", False): - print("Model is already transformed") - return model - - # Get Hash of all params for checking later - prior_params_hash = get_params_hash(model) - logger.warning(f"The model {model.__class__} layers has been upadted to QEff layers in-place") - # Replace with QEff layers - replace_module_with_qeff_layers(model) - - # Check with new params hash - later_params_hash = get_params_hash(model) - assert ( - prior_params_hash == later_params_hash - ), "Weights were changed in the transform process, please report an issue" - - # Replace the modeling output classes - transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions = ( - QEffBaseModelOutputWithPastAndCrossAttentions - ) - transformers.modeling_outputs.CausalLMOutputWithCrossAttentions = QEffCausalLMOutputWithCrossAttentions - transformers.modeling_outputs.BaseModelOutputWithPast = QEffBaseModelOutputWithPast - transformers.modeling_outputs.CausalLMOutputWithPast = QEffCausalLMOutputWithPast - transformers.modeling_outputs.MoeCausalLMOutputWithPast = QEffMoeCausalLMOutputWithPast - transformers.modeling_outputs.MoeModelOutputWithPast = QEffMoeModelOutputWithPast - - # Replace the modeling attn util classes and functions - transformers.modeling_attn_mask_utils.AttentionMaskConverter = QEffAttentionMaskConverter - transformers.modeling_attn_mask_utils._prepare_4d_attention_mask = _qeff_prepare_4d_attention_mask - transformers.modeling_attn_mask_utils._prepare_4d_causal_attention_mask = _qeff_prepare_4d_causal_attention_mask - - setattr(model,'qeff_transformed',True) - return model.eval() - - - -def transform(model: Type[QEFFBaseModel], form_factor="cloud"): - """ - This function serves for optimizing any kind of model (i.e. LLM, SD, AWQ etc.) for cloud AI 100. - Will replace the torch.nn.Module layers of passed QEffModel with optimized implementation of the same. - - model: object of any instance of class that is child of `QEFFBaseAutoModelFactory` - form_factor(str): form factor configuration for optmizing the model, available options=["cloud", "edge"]. - """ - assert form_factor == "cloud", "Only form_factor='cloud' is supported as of now!" - if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: - transform_lm(model.model, form_factor) # type: ignore - return model - else: - raise NotImplementedError(f"Recieved unsupported class of type {type(model)}") diff --git a/QEfficient/transformers/transform.py b/QEfficient/transformers/transform.py new file mode 100644 index 000000000..413e9f6fe --- /dev/null +++ b/QEfficient/transformers/transform.py @@ -0,0 +1,123 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import hashlib + +import torch.nn as nn +import transformers + +from QEfficient.loader.loader_factory import QEFF_MODEL_TYPE +from QEfficient.loader.loader import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP +from QEfficient.loader.loader_factory import QEFFBaseModel +from QEfficient.transformers.modeling_attn_mask_utils import ( + QEffAttentionMaskConverter, + _qeff_prepare_4d_attention_mask, + _qeff_prepare_4d_causal_attention_mask, +) +from QEfficient.transformers.modeling_outputs import ( + QEffBaseModelOutputWithPast, + QEffBaseModelOutputWithPastAndCrossAttentions, + QEffCausalLMOutputWithCrossAttentions, + QEffCausalLMOutputWithPast, + QEffMoeCausalLMOutputWithPast, + QEffMoeModelOutputWithPast, +) +from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict +from QEfficient.utils.logging_utils import logger + + +def replace_module_with_qeff_layers(model: nn.Module) -> None: + """ + Replaces the transformers nn.Module classes with optmized QEff classes in place. + ---------- + :param model: torch.nn.Module. Base PyTorch model. + """ + # Replace if module class is registed in TransformersToQEffModulesDict + target_module = TransformersToQEffModulesDict.get(model.__class__) + if target_module is not None: + model.__class__ = target_module + + # Iterate over child modules + for _, module in model.named_children(): + replace_module_with_qeff_layers(module) + + +def get_params_hash(model: nn.Module) -> str: + """ + Creates a Hash of all the parameters values i.e. weights using SHA256 algo. + -------- + :param model: torch.nn.Module. Base PyTorch model. + :returns: str. Hash string + """ + hasher = hashlib.sha256() + for _, params in model.named_parameters(): + hasher.update(params.data.numpy().tobytes()) + + return hasher.hexdigest() + + +def transform_lm(model: nn.Module) -> nn.Module: + """ + Replaces some Transformers torch.nn.Module layers for equivalent optimized modules for cloud AI 100. + --------- + Args: + param model (torch.nn.Module): PyTorch model. + + Returns: + torch.nn.Module: PyTorch Module with replaced QEff layers. 
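The helper pair above encodes the safety check used by transform_lm: module classes may change, but parameter bytes must not. A short sketch of that invariant, assuming a small HF model as input:

from transformers import AutoModelForCausalLM

from QEfficient.transformers.transform import get_params_hash, transform_lm

model = AutoModelForCausalLM.from_pretrained("gpt2")    # example checkpoint
before = get_params_hash(model)
model = transform_lm(model)                             # in-place class swap, returns model.eval()
assert get_params_hash(model) == before                 # weights unchanged
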
+ """ + + # Introducnig qeff_transformed attribue in model to check status of transform + if getattr(model, "qeff_transformed", False): + print("Model is already transformed") + return model + + # Get Hash of all params for checking later + prior_params_hash = get_params_hash(model) + logger.warning(f"The model {model.__class__} layers has been upadted to QEff layers in-place") + # Replace with QEff layers + replace_module_with_qeff_layers(model) + + # Check with new params hash + later_params_hash = get_params_hash(model) + assert ( + prior_params_hash == later_params_hash + ), "Weights were changed in the transform process, please report an issue" + + # Replace the modeling output classes + transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions = ( + QEffBaseModelOutputWithPastAndCrossAttentions + ) + transformers.modeling_outputs.CausalLMOutputWithCrossAttentions = QEffCausalLMOutputWithCrossAttentions + transformers.modeling_outputs.BaseModelOutputWithPast = QEffBaseModelOutputWithPast + transformers.modeling_outputs.CausalLMOutputWithPast = QEffCausalLMOutputWithPast + transformers.modeling_outputs.MoeCausalLMOutputWithPast = QEffMoeCausalLMOutputWithPast + transformers.modeling_outputs.MoeModelOutputWithPast = QEffMoeModelOutputWithPast + + # Replace the modeling attn util classes and functions + transformers.modeling_attn_mask_utils.AttentionMaskConverter = QEffAttentionMaskConverter + transformers.modeling_attn_mask_utils._prepare_4d_attention_mask = _qeff_prepare_4d_attention_mask + transformers.modeling_attn_mask_utils._prepare_4d_causal_attention_mask = _qeff_prepare_4d_causal_attention_mask + + setattr(model,'qeff_transformed',True) + return model.eval() + + +def transform(model: QEFFBaseModel, form_factor="cloud"): + """ + This function serves for optimizing any kind of model (i.e. LLM, SD, AWQ etc.) for cloud AI 100. + Will replace the torch.nn.Module layers of passed QEffModel with optimized implementation of the same. + + model: object of any instance of class that is child of `QEFFBaseAutoModelFactory` + form_factor(str): form factor configuration for optmizing the model, available options=["cloud", "edge"]. + """ + assert form_factor == "cloud", "Only form_factor='cloud' is supported as of now!" 
+ if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: + transform_lm(model.model) # type: ignore + return model + else: + raise NotImplementedError(f"Recieved unsupported class of type {type(model)}") \ No newline at end of file diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 049462514..25eb52616 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -14,19 +14,21 @@ from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast from QEfficient.utils.constants import QEFF_MODELS_DIR +from QEfficient.utils.logging_utils import logger -def login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs): +def login_and_download_hf_lm(model_name, *args, **kwargs): + logger.info(f"loading HuggingFace model for {model_name}") hf_token = kwargs.pop("hf_token", None) cache_dir = kwargs.pop("cache_dir", None) if hf_token is not None: login(hf_token) - pretrained_model_name_or_path = hf_download( - repo_id=pretrained_model_name_or_path, + model_name = hf_download( + repo_id=model_name, cache_dir=cache_dir, ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], ) - return pretrained_model_name_or_path + return model_name def hf_download( @@ -118,6 +120,7 @@ def onnx_exists(model_name: str) -> Tuple[bool, str, str]: def load_hf_tokenizer(model_name: str, cache_dir: Optional[str] = None, hf_token: Optional[str] = None) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: + logger.info(f"Loading Tokenizer for {model_name}") if hf_token is not None: login(hf_token) diff --git a/QEfficient/utils/logging_utils.py b/QEfficient/utils/logging_utils.py index fe42d5ed9..044e6e83f 100644 --- a/QEfficient/utils/logging_utils.py +++ b/QEfficient/utils/logging_utils.py @@ -13,19 +13,20 @@ class QEffFormatter(logging.Formatter): Formatter class used to set colors for printing different logging levels of messages on console. 
""" - grey = "\x1b[38;20m" - yellow = "\x1b[33;20m" - red = "\x1b[31;20m" - bold_red = "\x1b[31;1m" - reset = "\x1b[0m" - format = "%(levelname)s - %(name)s - %(message)s (%(filename)s:%(lineno)d)" + cyan: str = "\x1b[38;5;14m" + yellow: str = "\x1b[33;20m" + red: str = "\x1b[31;20m" + bold_red: str = "\x1b[31;1m" + reset: str = "\x1b[0m" + common_format: str = "%(levelname)s - %(name)s - %(message)s" # type: ignore + format_with_line_info = "%(levelname)s - %(name)s - %(message)s (%(filename)s:%(lineno)d)" # type: ignore FORMATS = { - logging.DEBUG: grey + format + reset, - logging.INFO: grey + format + reset, - logging.WARNING: yellow + format + reset, - logging.ERROR: red + format + reset, - logging.CRITICAL: bold_red + format + reset, + logging.DEBUG: cyan + format_with_line_info + reset, + logging.INFO: cyan + common_format + reset, + logging.WARNING: yellow + common_format + reset, + logging.ERROR: red + format_with_line_info + reset, + logging.CRITICAL: bold_red + format_with_line_info + reset, } def format(self, record): @@ -45,7 +46,7 @@ def create_logger() -> logging.Logger: # create console handler and set level to debug ch = logging.StreamHandler() - ch.setLevel(logging.WARNING) + ch.setLevel(logging.INFO) # define formatter ch.setFormatter(QEffFormatter()) diff --git a/tests/test_loader.py b/tests/test_loader.py index 56e81f666..0d3df3a01 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -10,7 +10,7 @@ import pytest from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel -import QEfficient.transformers.modeling_utils +import QEfficient from QEfficient import QEFFAutoModel from QEfficient.loader.loader_factory import QEFFAutoModelForCausalLM diff --git a/tests/utils.py b/tests/utils.py index 37dfd5795..f68dd20fb 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -13,7 +13,7 @@ from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100 from QEfficient.loader.loader_factory import QEFFAutoModelForCausalLM -from QEfficient.transformers.modeling_utils import transform_lm +from QEfficient.transformers.transform import transform_lm from QEfficient.utils import hf_download, load_hf_tokenizer from QEfficient.utils.constants import QEFF_MODELS_DIR, ROOT_DIR, Constants from QEfficient.utils.device_utils import get_available_device_id, is_multi_qranium_setup_available, is_qpc_size_gt_32gb From 9f5ff0a66a818b244c0888d4c4bdee73d66e0ac0 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Fri, 31 May 2024 15:50:28 +0530 Subject: [PATCH 10/20] *Updated README, notebooks *Removed circular import *Added comments on loader files * separated cross-compile script *separated utils funcs Signed-off-by: Onkar Chougule --- QEfficient/__init__.py | 1 + QEfficient/cloud/compile.py | 89 +-------------- QEfficient/cloud/infer.py | 3 +- QEfficient/cross_compile.py | 159 +++++++++++++++++++++++++++ QEfficient/exporter/export_utils.py | 97 +--------------- QEfficient/loader/loader.py | 6 + QEfficient/loader/loader_factory.py | 20 ++++ QEfficient/transformers/transform.py | 3 +- QEfficient/utils/_utils.py | 5 +- README.md | 81 ++++++-------- notebooks/QEfficientGPT2.ipynb | 44 +++----- notebooks/QEfficientMPT.ipynb | 39 +++---- tests/test_loader.py | 2 +- tests/utils.py | 2 +- 14 files changed, 264 insertions(+), 287 deletions(-) create mode 100644 QEfficient/cross_compile.py diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 09a6ae7fa..9804c4ea1 100644 --- 
a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +from QEfficient.cross_compile import compile # noqa: F401 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.loader import QEFFAutoModel # noqa: F401 from QEfficient.transformers.transform import transform # noqa: F401 diff --git a/QEfficient/cloud/compile.py b/QEfficient/cloud/compile.py index 0171b2787..b46282da2 100644 --- a/QEfficient/cloud/compile.py +++ b/QEfficient/cloud/compile.py @@ -6,91 +6,8 @@ # ----------------------------------------------------------------------------- import argparse -import json -import os -from typing import List - -from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100 -from QEfficient.utils.logging_utils import logger - - -def create_and_dump_specializations(batch_size: int, prompt_len: int, ctx_len: int, path: str): - # Create - specializations = { - "specializations": [ - { - "batch_size": str(batch_size), - "seq_len": str(prompt_len), - "ctx_len": str(ctx_len), - }, - {"batch_size": str(batch_size), "seq_len": "1", "ctx_len": str(ctx_len)}, - ] - } - # Dump - with open(path, "w") as file: - json.dump(specializations, file, indent=4) - - -def main( - onnx_path: str, - qpc_path: str, - num_cores: int, - device_group: List[int], - aic_enable_depth_first: bool = False, - mos: int = -1, - batch_size: int = 1, - prompt_len: int = 32, - ctx_len: int = 128, - mxfp6: bool = True, - mxint8: bool = False, -) -> str: - # Dynamically create the specializations JSON - """ - Api() to compile the Onnx Model on Cloud AI 100 Platform with give config. - --------- - :param onnx_path: str. Generated Onnx Model Path. - :base_path: str. Base path for the generated models. - :batch_size: int. Batch size to compile the model for. - :prompt_len: int. prompt len for the model to compile. - :ctx_len: int. Maximum context length to compile the model. - :mxfp6: bool. Enable compilation for MXFP6 precision - :num_cores: int. Number of cores to compile model on. default: 16 available option: [1 to 16] - """ - - os.makedirs(qpc_path, exist_ok=True) - specialization_json_path = os.path.join(qpc_path, "specializations.json") - create_and_dump_specializations( - batch_size=batch_size, prompt_len=prompt_len, ctx_len=ctx_len, path=specialization_json_path - ) - - # Select the customIO config based on the mx flag. - if mxint8: - custom_io_file_name = "custom_io_int8.yaml" - else: - custom_io_file_name = "custom_io_fp16.yaml" - - custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name) - - if not os.path.isfile(custom_io_file_path): - raise FileNotFoundError( - f"file {custom_io_file_path} needs to exist in the same directory as onnx model files. 
Please rerun infer/export Api" - ) - - _, qpc_path = compile_kv_model_on_cloud_ai_100( - onnx_path=onnx_path, - specializations_json=specialization_json_path, - num_cores=num_cores, - custom_io_path=custom_io_file_path, - base_path=qpc_path, - mxfp6=mxfp6, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - ) - - logger.info(f"Compiled QPC files can be found here: {qpc_path}") - return qpc_path +import QEfficient if __name__ == "__main__": parser = argparse.ArgumentParser(description="Compilation script.") @@ -146,5 +63,7 @@ def main( default=-1, help=" Effort level to reduce the on-chip memory", ) + + # FIXME(ochougul): Allow extra compilation arguments args = parser.parse_args() - main(**vars(args)) + QEfficient.compile(**vars(args)) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index f1a56931c..e72a3329b 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -11,7 +11,6 @@ from typing import List, Optional import QEfficient -from QEfficient.cloud.compile import main as compile from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.generation.text_generation_inference import ( check_batch_size_and_num_prompts, @@ -117,7 +116,7 @@ def main( # Compile # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation - generated_qpc_path = compile( + generated_qpc_path = QEfficient.compile( onnx_path=onnx_model_path, qpc_path=os.path.dirname(qpc_dir_path), num_cores=num_cores, diff --git a/QEfficient/cross_compile.py b/QEfficient/cross_compile.py new file mode 100644 index 000000000..771d52f54 --- /dev/null +++ b/QEfficient/cross_compile.py @@ -0,0 +1,159 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import json +import os +import subprocess +from typing import List, Tuple + +from QEfficient.utils.logging_utils import logger + + +def create_and_dump_specializations(batch_size: int, prompt_len: int, ctx_len: int, path: str): + # Create + specializations = { + "specializations": [ + { + "batch_size": str(batch_size), + "seq_len": str(prompt_len), + "ctx_len": str(ctx_len), + }, + {"batch_size": str(batch_size), "seq_len": "1", "ctx_len": str(ctx_len)}, + ] + } + # Dump + with open(path, "w") as file: + json.dump(specializations, file, indent=4) + + +def compile_kv_model_on_cloud_ai_100( + onnx_path: str, + specializations_json: str, + num_cores: int, + base_path: str, + mxfp6: bool, + custom_io_path: str, + aic_enable_depth_first: bool, + mos: int = -1, + device_group: List[int] = [0], + **kwargs, +) -> Tuple[bool, str]: + import shutil + if kwargs: + # FIXME + raise NotImplementedError("Can't handle extra compilation args now!") + aic_binary_dir = os.path.join(base_path, "qpcs") + + if os.path.isdir(aic_binary_dir): + shutil.rmtree(aic_binary_dir) + + assert os.path.isfile( + specializations_json + ), f"Please use 'QEfficient.compile', as {specializations_json} file was not found" + assert os.path.isfile(custom_io_path), f"{custom_io_path} file was not found!" 
+ command = [ + "/opt/qti-aic/exec/qaic-exec", + f"-m={onnx_path}", + "-aic-hw", + "-aic-hw-version=2.0", + f"-network-specialization-config={specializations_json}", + "-convert-to-fp16", + "-retained-state", + f"-aic-num-cores={num_cores}", + f"-custom-IO-list-file={custom_io_path}", + "-compile-only", + f"-aic-binary-dir={aic_binary_dir}", + ] + if mxfp6: + command.append("-mxfp6-matmul") + if mos > 0: + command.append(f"-mos={mos}") + if aic_enable_depth_first: + command.append("-aic-enable-depth-first") + if len(device_group) > 1: + mdp_ts_config = { + "connections": [{"devices": list(range(len(device_group))), "type": "p2p"}], + "partitions": [ + { + "name": "Partition0", + "devices": [{"deviceId": device, "numCores": num_cores} for device in range(len(device_group))], + } + ], + } + mdp_ts_config_path = os.path.join(base_path, "mdp_ts_config.json") + with open(mdp_ts_config_path, "w") as file: + json.dump(mdp_ts_config, file, indent=4) + command.append(f"-mdp-load-partition-config={mdp_ts_config_path}") + print("Running AI 100 compiler:", " ".join(command)) + result = subprocess.run(command, capture_output=True, text=True) + if result.returncode != 0: + raise RuntimeError(f"Compilation Failed!!\n\nSTDOUT\n{result.stdout}\n\nSTDERR\n{result.stderr}") + + print("\n===================== Compilation Done! =====================\n") + return result.returncode == 0, aic_binary_dir + + +def compile( + onnx_path: str, + qpc_path: str, + num_cores: int, + device_group: List[int], + aic_enable_depth_first: bool = False, + mos: int = -1, + batch_size: int = 1, + prompt_len: int = 32, + ctx_len: int = 128, + mxfp6: bool = True, + mxint8: bool = False, + **kwargs +) -> str: + # Dynamically create the specializations JSON + """ + Api() to compile the Onnx Model on Cloud AI 100 Platform with give config. + --------- + :param onnx_path: str. Generated Onnx Model Path. + :base_path: str. Base path for the generated models. + :batch_size: int. Batch size to compile the model for. + :prompt_len: int. prompt len for the model to compile. + :ctx_len: int. Maximum context length to compile the model. + :mxfp6: bool. Enable compilation for MXFP6 precision + :num_cores: int. Number of cores to compile model on. default: 16 available option: [1 to 16] + """ + + os.makedirs(qpc_path, exist_ok=True) + specialization_json_path = os.path.join(qpc_path, "specializations.json") + create_and_dump_specializations( + batch_size=batch_size, prompt_len=prompt_len, ctx_len=ctx_len, path=specialization_json_path + ) + + # Select the customIO config based on the mx flag. + if mxint8: + custom_io_file_name = "custom_io_int8.yaml" + else: + custom_io_file_name = "custom_io_fp16.yaml" + + custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name) + + if not os.path.isfile(custom_io_file_path): + raise FileNotFoundError( + f"file {custom_io_file_path} needs to exist in the same directory as onnx model files. 
Please rerun infer/export Api" + ) + + _, qpc_path = compile_kv_model_on_cloud_ai_100( + onnx_path=onnx_path, + specializations_json=specialization_json_path, + num_cores=num_cores, + custom_io_path=custom_io_file_path, + base_path=qpc_path, + mxfp6=mxfp6, + aic_enable_depth_first=aic_enable_depth_first, + mos=mos, + device_group=device_group, + ) + + logger.info(f"Compiled QPC files can be found here: {qpc_path}") + return qpc_path diff --git a/QEfficient/exporter/export_utils.py b/QEfficient/exporter/export_utils.py index 8ce7f6b26..417c3a214 100644 --- a/QEfficient/exporter/export_utils.py +++ b/QEfficient/exporter/export_utils.py @@ -5,12 +5,10 @@ # # ----------------------------------------------------------------------------- -import json import os import shutil -import subprocess import sys -from logging import error, info +from logging import info from typing import Dict, List, Tuple, Union import numpy as np @@ -285,7 +283,7 @@ def generate_input_files( fp.write(",".join(filenames)) fp.write("\n") - +# FIXME(ochougul/quic-mamta): Remove duplication with APIRunner def run_model_on_ort( onnx_path: str, inputs: Dict[str, torch.Tensor], @@ -331,94 +329,3 @@ def run_model_on_ort( print(f"Failed to run the onnx {onnx_path} model in onnx runtime:%s", e) print("\n=============================================================\n") return input_names, None - - -def run_model_on_cloud_ai_100( - onnx_path: str, - onnx_symbol_defs: Dict[str, int] = {}, - **kwargs, -) -> bool: - args = [ - "/opt/qti-aic/exec/qaic-exec", - f"-m={onnx_path}", - "-aic-hw", - "-aic-hw-version=2.0", - ] - for onnx_symbol, onnx_def in onnx_symbol_defs.items(): - args.append(f"-onnx-define-symbol={onnx_symbol},{onnx_def}") - for k, v in kwargs.items(): - k = k.replace("_", "-") - if isinstance(v, bool): - if v: - args.append(f"-{k}") - continue - args.append(f"-{k}={v}") - - info("Running compiler:", " ".join(args)) - result = subprocess.run(args) - return result.returncode == 0 - - -def compile_kv_model_on_cloud_ai_100( - onnx_path: str, - specializations_json: str, - num_cores: int, - base_path: str, - mxfp6: bool, - custom_io_path: str, - aic_enable_depth_first: bool, - mos: int = -1, - device_group: List[int] = [0], - **kwargs, -) -> bool: - import shutil - - aic_binary_dir = os.path.join(base_path, "qpcs") - - if os.path.isdir(aic_binary_dir): - shutil.rmtree(aic_binary_dir) - - assert os.path.isfile( - specializations_json - ), f"Please use 'from QEfficient.cloud.compile import main as compile', as {specializations_json} file was not found" - assert os.path.isfile(custom_io_path), f"{custom_io_path} file was not found!" 
- command = [ - "/opt/qti-aic/exec/qaic-exec", - f"-m={onnx_path}", - "-aic-hw", - "-aic-hw-version=2.0", - f"-network-specialization-config={specializations_json}", - "-convert-to-fp16", - "-retained-state", - f"-aic-num-cores={num_cores}", - f"-custom-IO-list-file={custom_io_path}", - "-compile-only", - f"-aic-binary-dir={aic_binary_dir}", - ] - if mxfp6: - command.append("-mxfp6-matmul") - if mos > 0: - command.append(f"-mos={mos}") - if aic_enable_depth_first: - command.append("-aic-enable-depth-first") - if len(device_group) > 1: - mdp_ts_config = { - "connections": [{"devices": list(range(len(device_group))), "type": "p2p"}], - "partitions": [ - { - "name": "Partition0", - "devices": [{"deviceId": device, "numCores": num_cores} for device in range(len(device_group))], - } - ], - } - mdp_ts_config_path = os.path.join(base_path, "mdp_ts_config.json") - with open(mdp_ts_config_path, "w") as file: - json.dump(mdp_ts_config, file, indent=4) - command.append(f"-mdp-load-partition-config={mdp_ts_config_path}") - print("Running AI 100 compiler:", " ".join(command)) - result = subprocess.run(command, capture_output=True, text=True) - if result.returncode != 0: - raise RuntimeError(f"Compilation Failed!!\n\nSTDOUT\n{result.stdout}\n\nSTDERR\n{result.stderr}") - - print("\n===================== Compilation Done! =====================\n") - return result.returncode == 0, aic_binary_dir diff --git a/QEfficient/loader/loader.py b/QEfficient/loader/loader.py index 99295555f..c0da35421 100644 --- a/QEfficient/loader/loader.py +++ b/QEfficient/loader/loader.py @@ -5,6 +5,12 @@ # # ----------------------------------------------------------------------------- +""" +MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP dictionary defines the mapping between names of the varities of Transformer model defined in +QEFF_MODEL_TYPE and the classes that implement the methods i.e.(compile, export etc.) for those types. + +QEFFAutoModel provides a common interface for loading the HuggingFace models using either the HF card name of local path of downloaded model. +""" import os from typing import Any, Dict, Type diff --git a/QEfficient/loader/loader_factory.py b/QEfficient/loader/loader_factory.py index 23d210f30..ede3a6c85 100644 --- a/QEfficient/loader/loader_factory.py +++ b/QEfficient/loader/loader_factory.py @@ -5,6 +5,23 @@ # # ---------------------------------------------------------------------------- +""" +** This file for holds the classes that handle main functions +1.load i.e. from_pretrained +2.execute +3.transform +4.export +5.compile +For different varities of Transformer Models + +** Each variety of the Transformer model that has different way of doing any of the above functions will have it's own class i.e. +following models type will have their own class which must inherit QEFFBaseModel abstract class. +1.Causal Language Models +2.Diffusion +3.Quantized models + +** QEFFBASEModel is abstract base class that defines the basic structure of these classes. +""" from abc import ABC, abstractmethod from enum import Enum from typing import Any @@ -18,6 +35,9 @@ class QEFF_MODEL_TYPE(Enum): + """ + Defines Names of the different varities of transformer models. 
+ """ CAUSALLM = "LLM" DIFFUSION = "STABLE_DIFFUSION" AWQ = "AWQ" diff --git a/QEfficient/transformers/transform.py b/QEfficient/transformers/transform.py index 413e9f6fe..aaddd75fa 100644 --- a/QEfficient/transformers/transform.py +++ b/QEfficient/transformers/transform.py @@ -10,9 +10,8 @@ import torch.nn as nn import transformers -from QEfficient.loader.loader_factory import QEFF_MODEL_TYPE from QEfficient.loader.loader import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP -from QEfficient.loader.loader_factory import QEFFBaseModel +from QEfficient.loader.loader_factory import QEFF_MODEL_TYPE, QEFFBaseModel from QEfficient.transformers.modeling_attn_mask_utils import ( QEffAttentionMaskConverter, _qeff_prepare_4d_attention_mask, diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 25eb52616..4c2ad177c 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -119,12 +119,13 @@ def onnx_exists(model_name: str) -> Tuple[bool, str, str]: return onnx_exists_bool, onnx_dir_path, onnx_model_path -def load_hf_tokenizer(model_name: str, cache_dir: Optional[str] = None, hf_token: Optional[str] = None) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: +def load_hf_tokenizer(model_name: str, cache_dir: Optional[str] = None, hf_token: Optional[str] = None, padding_side:str = "left", **kwargs) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: logger.info(f"Loading Tokenizer for {model_name}") if hf_token is not None: login(hf_token) # Download tokenizer along with model if it doesn't exist model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"]) - tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left", trust_remote_code=True) + #FIXME(ochougul): should this always return left padded tokenizer? + tokenizer = AutoTokenizer.from_pretrained(model_hf_path, padding_side=padding_side, trust_remote_code=True, **kwargs) return tokenizer diff --git a/README.md b/README.md index 430a058c0..634e3add3 100644 --- a/README.md +++ b/README.md @@ -118,8 +118,8 @@ In summary: | High Level APIs | Sample use | Arguments | |-----------------|------------|-------------------| -| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • mxint8 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • | -| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexcute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • prompt : Optional [Default-"My name is"]
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • | +| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • mxint8 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • verbose : Optional
  • | +| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexcute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • prompt : Optional [Default-"My name is"]
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • | **One argument, prompt or prompts_txt_file_path must be passed.** @@ -187,46 +187,40 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 | Low Level APIs | Sample use | Arguments | |-----------------|------------|-------------------| -| QEfficient.transform | [click here](#1-model-download-and-transform) |
  • model : $\color{green} {Mandatory}$
  • Type : Optional [Default- "Transformers"]
  • form_factor : Optional [Default-"cloud"]
  • | -| qualcomm_efficient_converter | [click here](#2-onnx-export-of-transformed-model) |
  • mode_name : $\color{green} {Mandatory}$
  • model_kv : $\color{green} {Mandatory}$ [Optional when model_class passed]
  • model_class : $\color{green} {Mandatory}$ [Optional when model_kv passed]
  • tokenizer : Optional
  • onnx_path : Optional
  • hf_token : Optional
  • seq_length : Optional [Default-128]
  • input_str : Optional [Default-"My name is"]
  • kv : Optional [Default-$\color{green} {True}$]
  • return_path : Optional [Default-False]
  • form_factor : Optional [Default-"cloud"]
  • save_fp32_onnx : Optional [Default-False]
  • save_fp16_onnx : Optional [Default-True]
  • *Both save_fp32_onnx and save_fp16_onnx can't be false*
  • | -| compile | [click here](#3-compile-on-cloud-ai-100) |
  • onnx_path : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional [Default-True]
  • | +| QEfficient.transform | [click here](#1-model-download-and-transform) |
  • model : $\color{green} {Mandatory}$
  • form_factor : Optional [Default-"cloud"]
  • | +| QEfficient.export | [click here](#2-onnx-export-of-transformed-model) |
  • model_name : $\color{green} {Mandatory}$
  • model_kv : Optional
  • tokenizer : Optional
  • onnx_path : Optional
  • hf_token : Optional
  • seq_length : Optional [Default-128]
  • kv : Optional [Default-$\color{green} {True}$]
  • return_path : Optional [Default-False]
  • form_factor : Optional [Default-"cloud"]
  • ***save_fp32_onnx : Optional [Default-False]
  • ***save_fp16_onnx : Optional [Default-True]
  • | +| QEfficient.compile | [click here](#3-compile-on-cloud-ai-100) |
  • onnx_path : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • aic_enable_depth_first : Optional [Default-False]
  • mos : Optional [Default= -1]
  • mxint8 : Optional [Default-False]
  • mxfp6 : Optional [Default-True]
  • | |cloud_ai_100_exec_kv | [click here](#4-run-benchmark) |
  • batch_size : $\color{green} {Mandatory}$
  • tokenizer : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • | -**One argument, prompt or prompts_txt_file_path must be passed. - +**One argument, prompt or prompts_txt_file_path must be passed.
    +***Both save_fp32_onnx and save_fp16_onnx can't be false. ### 1. Model download and transform Initialize QEfficient and transform the models, Check the list of supported architectures in the repo. -```bash +```Python # Initiate the Orignal Transformer model import os -from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel + + import QEfficient -from transformers import AutoTokenizer -from QEfficient.utils import hf_download -from QEfficient.utils.constants import Constants +from QEfficient import QEFFAutoModel + # Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir. # os.environ["TRANSFORMERS_CACHE"] = "/local/mnt/workspace/hf_cache" -ROOT_DIR = os.path.dirname(os.path.abspath("")) +#ROOT_DIR = os.path.dirname(os.path.abspath("")) +#CACHE_DIR = os.path.join(ROOT_DIR, "tmp"), you can use a different location for just one model by passing this param as cache_dir in below API. # Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl +model_name = "gpt2" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib. -model_name = "gpt2" - -# Similar, we can change model name and generate corresponding models, if we have added the support in the lib. - -model_hf_path = hf_download(repo_id=model_name, cache_dir=Constants.CACHE_DIR, ignore_pattrens=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"]) -model_hf = GPT2LMHeadModel.from_pretrained(model_hf_path, use_cache=True) -model_hf.eval() -print(f"{model_name} from hugging-face \n", model_hf) +qeff_model = QEFFAutoModel.from_pretrained(model_name, cache_dir=None) +print(f"{model_name} from hugging-face \n", qeff_model) # Easy and minimal api to update the model -model_transformed = QEfficient.transform(model_hf, type="Transformers", form_factor="cloud") - -model_transformed.eval() +model_transformed = QEfficient.transform(qeff_model, form_factor="cloud") print("Model after Optimized transformations \n", model_transformed) ``` @@ -234,31 +228,27 @@ print("Model after Optimized transformations \n", model_transformed) use the qualcomm_efficient_converter API to export the KV transformed Model to ONNX and Verify on Torch. -```bash -from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter - -# We can now export the modified models to ONNX framework -# This will generate single ONNX Model for both Prefill and Decode Variations which are optimized for +```Python +from QEfficient.utils import load_hf_tokenizer +# We can now export the modified models to Onnx framework +# This will generate single Onnx Model for both Prefill and Decode Variations which are optimized for # Cloud AI 100 Platform. -# This will generate ONNX model, clip the overflow constants to fp16 -# Verify the model on ONNXRuntime vs Pytorch -# Then generate inputs and custom_io.yaml file required for compilation. +# This will generate Onnx model, clip the overflow constants to fp16 +# Verify the model on Onnxruntime vs Pytorch +# Then generate inputs and customio yaml file required for compilation. # We can generate the KV Style models with the flag "kv" # Bertstyle models do not have any optimization w.r.t KV cache changes and are unoptimized version. # It is recommended to use kv=True for better performance. - -# For custom models defined on the Hub in their own modeling files. 
We need `trust_remote_code` option -# Should be set to `True` in `AutoTokenizer` for repositories you trust. -tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left") -base_path, onnx_path = qualcomm_efficient_converter( - model_kv=model_transformed, +tokenizer = load_hf_tokenizer(model_name, use_cache=True) +base_path, onnx_path = QEfficient.export( model_name=model_name, + model_kv=model_transformed, + tokenizer=tokenizer, kv=True, form_factor="cloud", return_path=True, - tokenizer=tokenizer, ) ``` @@ -266,29 +256,28 @@ base_path, onnx_path = qualcomm_efficient_converter( Once, the model is exported, Compile the model on Cloud AI 100 and generate QPC. -```bash +```Python # Please use platform SDk to Check num_cores for your card. -from QEfficient.cloud.compile import main as compile -generated_qpc_path = compile( +generated_qpc_path = QEfficient.compile( onnx_path=onnx_path, num_cores=14, - qpc_path=base_path, + qpc_path=os.path.dirname(base_path), + mxfp6=False, device_group=[0], - mxfp6=True, ) ``` ### 4. Run Benchmark Benchmark the model on Cloud AI 100, run the infer API to print tokens and tok/sec -```bash +```Python from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size # post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100 # We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach batch_size = get_compilation_batch_size(generated_qpc_path) -cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt="My name is") +cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=["My name is"]) ``` End to End demo examples for various models are available in **notebooks** directory. Please check them out. 
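For reference, below is a minimal end-to-end sketch that simply chains the README snippets above (load → transform → export → compile → execute) into one script. It assumes the `QEFFAutoModel`, `QEfficient.transform`/`export`/`compile`, and `cloud_ai_100_exec_kv` APIs introduced in this patch series; the `num_cores` and `device_group` values are illustrative placeholders for whatever the target Cloud AI 100 card supports.

```Python
# Minimal sketch chaining the README steps from this patch series into one script.
# num_cores/device_group below are illustrative; check your card with the platform SDK.
import os

import QEfficient
from QEfficient import QEFFAutoModel
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size
from QEfficient.utils import load_hf_tokenizer

model_name = "gpt2"

# 1. Load the HF model through the new loader and apply the Cloud AI 100 transforms.
qeff_model = QEFFAutoModel.from_pretrained(model_name)
qeff_model = QEfficient.transform(qeff_model, form_factor="cloud")

# 2. Export the KV-style model to ONNX.
tokenizer = load_hf_tokenizer(model_name, use_cache=True)
base_path, onnx_path = QEfficient.export(
    model_name=model_name,
    model_kv=qeff_model,
    tokenizer=tokenizer,
    kv=True,
    form_factor="cloud",
    return_path=True,
)

# 3. Compile the exported ONNX model into a QPC.
generated_qpc_path = QEfficient.compile(
    onnx_path=onnx_path,
    qpc_path=os.path.dirname(base_path),
    num_cores=14,
    device_group=[0],
    mxfp6=False,
)

# 4. Run greedy decoding on Cloud AI 100 and print tokens plus latency stats.
batch_size = get_compilation_batch_size(generated_qpc_path)
cloud_ai_100_exec_kv(
    batch_size=batch_size,
    tokenizer=tokenizer,
    qpc_path=generated_qpc_path,
    device_id=[0],
    prompt=["My name is"],
)
```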
diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 668a3b473..37d5ebb40 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -26,28 +26,19 @@ "# Initiate the Orignal Transformer model\n", "import os\n", "\n", - "from transformers import AutoTokenizer\n", - "from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel\n", - "\n", - "from QEfficient.utils import hf_download\n", - "from QEfficient.utils.constants import Constants\n", + "from QEfficient import QEFFAutoModel\n", "\n", "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n", "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n", "\n", - "ROOT_DIR = os.path.dirname(os.path.abspath(\"\"))\n", + "#ROOT_DIR = os.path.dirname(os.path.abspath(\"\"))\n", + "#CACHE_DIR = os.path.join(ROOT_DIR, \"tmp\"), you can use a different location for just one model by passing this param as cache_dir in below API.\n", "\n", "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n", "model_name = \"gpt2\" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n", "\n", - "model_hf_path = hf_download(\n", - " repo_id=model_name,\n", - " cache_dir=Constants.CACHE_DIR,\n", - " ignore_patterns=[\"*.txt\", \"*.onnx\", \"*.ot\", \"*.md\", \"*.tflite\", \"*.pdf\"],\n", - ")\n", - "model_hf = GPT2LMHeadModel.from_pretrained(model_hf_path, use_cache=True)\n", - "model_hf.eval()\n", - "print(f\"{model_name} from hugging-face \\n\", model_hf)" + "qeff_model = QEFFAutoModel.from_pretrained(model_name, cache_dir=None)\n", + "print(f\"{model_name} from hugging-face \\n\", qeff_model)" ] }, { @@ -75,9 +66,8 @@ "import QEfficient\n", "\n", "# Easy and minimal api to update the model\n", - "model_transformed = QEfficient.transform(model_hf, type=\"Transformers\", form_factor=\"cloud\")\n", + "model_transformed = QEfficient.transform(qeff_model, form_factor=\"cloud\")\n", "\n", - "model_transformed.eval()\n", "print(\"Model after Optimized transformations \\n\", model_transformed)" ] }, @@ -96,8 +86,7 @@ "metadata": {}, "outputs": [], "source": [ - "from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter\n", - "\n", + "from QEfficient.utils import load_hf_tokenizer\n", "# We can now export the modified models to Onnx framework\n", "# This will generate single Onnx Model for both Prefill and Decode Variations which are optimized for\n", "# Cloud AI 100 Platform.\n", @@ -109,14 +98,14 @@ "# We can generate the KV Style models with the flag \"kv\"\n", "# Bertstyle models do not have any optimization w.r.t KV cache changes and are unoptimized version.\n", "# It is recommended to use kv=True for better performance.\n", - "tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side=\"left\")\n", - "base_path, onnx_path = qualcomm_efficient_converter(\n", - " model_kv=model_transformed,\n", + "tokenizer = load_hf_tokenizer(model_name, use_cache=True)\n", + "base_path, onnx_path = QEfficient.export(\n", " model_name=model_name,\n", + " model_kv=model_transformed,\n", + " tokenizer=tokenizer,\n", " kv=True,\n", " form_factor=\"cloud\",\n", " return_path=True,\n", - " tokenizer=tokenizer,\n", ")" ] }, @@ -136,13 +125,12 @@ "outputs": [], "source": [ "# Please use platform SDk to Check num_cores for your card.\n", - "from QEfficient.cloud.compile import main as 
compile\n", "\n", - "generated_qpc_path = compile(\n", + "generated_qpc_path = QEfficient.compile(\n", " onnx_path=onnx_path,\n", " num_cores=14,\n", - " qpc_path=base_path,\n", - " mxfp6=True,\n", + " qpc_path=os.path.dirname(base_path),\n", + " mxfp6=False,\n", " device_group=[0],\n", ")" ] @@ -166,8 +154,8 @@ "\n", "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", - "batch_size = get_compilation_batch_size(generated_qpc_path)\n" - "cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")" + "batch_size = get_compilation_batch_size(generated_qpc_path)\n", + "cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])" ] } ], diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index 8533eedcc..9ca0c389a 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -26,27 +26,18 @@ "# Initiate the Orignal Transformer model\n", "import os\n", "\n", - "from transformers import AutoTokenizer\n", - "from transformers.models.mpt.modeling_mpt import MptForCausalLM\n", - "\n", - "from QEfficient.utils import hf_download\n", - "from QEfficient.utils.constants import Constants\n", + "from QEfficient import QEFFAutoModel\n", "\n", "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n", "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n", "\n", - "ROOT_DIR = os.path.dirname(os.path.abspath(\"\"))\n", + "#ROOT_DIR = os.path.dirname(os.path.abspath(\"\"))\n", + "#CACHE_DIR = os.path.join(ROOT_DIR, \"tmp\"), you can use a different location for just one model by passing this param as cache_dir in below API.\n", "\n", "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n", "model_name = \"mosaicml/mpt-7b\" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n", - "model_hf_path = hf_download(\n", - " repo_id=model_name,\n", - " cache_dir=Constants.CACHE_DIR,\n", - " ignore_patterns=[\"*.txt\", \"*.onnx\", \"*.ot\", \"*.md\", \"*.tflite\", \"*.pdf\"],\n", - ")\n", - "model_hf = MptForCausalLM.from_pretrained(model_hf_path, use_cache=True)\n", - "model_hf.eval()\n", - "print(f\"{model_name} from hugging-face \\n\", model_hf)" + "qeff_model = QEFFAutoModel.from_pretrained(model_name)\n", + "print(f\"{qeff_model} from hugging-face \\n\", qeff_model)" ] }, { @@ -74,7 +65,7 @@ "import QEfficient\n", "\n", "# Easy and minimal api to update the model\n", - "model_transformed = QEfficient.transform(model_hf, type=\"Transformers\", form_factor=\"cloud\")\n", + "model_transformed = QEfficient.transform(qeff_model, form_factor=\"cloud\")\n", "\n", "model_transformed.eval()\n", "print(\"Model after Optimized transformations \\n\", model_transformed)" @@ -95,7 +86,7 @@ "metadata": {}, "outputs": [], "source": [ - "from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter\n", + "from QEfficient.utils import load_hf_tokenizer\n", "\n", "# We have the utils to export the modified models to Onnx framework\n", "# This will generate single Onnx Model for both Prefill and Decode Variations which are 
optimized for\n", @@ -107,14 +98,14 @@ "\n", "# We can generate both bertstyle and KV Style models with the flag \"kv\"\n", "# Bertstyle models do not have any optimization w.r.t KV cache changes and are unoptimized version.\n", - "tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side=\"left\")\n", - "base_path, onnx_path = qualcomm_efficient_converter(\n", - " model_kv=model_transformed,\n", + "tokenizer = load_hf_tokenizer(model_hf_path, use_cache=True, padding_side=\"left\")\n", + "base_path, onnx_path = QEfficient.export(\n", " model_name=model_name,\n", + " model_kv=model_transformed,\n", + " tokenizer=tokenizer,\n", " kv=True,\n", " form_factor=\"cloud\",\n", " return_path=True,\n", - " tokenizer=tokenizer,\n", ")" ] }, @@ -134,12 +125,11 @@ "outputs": [], "source": [ "# Please use platform SDk to Check num_cores for your card.\n", - "from QEfficient.cloud.compile import main as compile\n", "\n", - "generated_qpc_path = compile(\n", + "generated_qpc_path = QEfficient.compile(\n", " onnx_path=onnx_path,\n", " num_cores=14,\n", - " qpc_path=base_path,\n", + " qpc_path=os.path.dirname(base_path),\n", " mxfp6=True,\n", " device_group=[0],\n", ")" @@ -165,8 +155,7 @@ "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "\n", - "batch_size = get_compilation_batch_size(generated_qpc_path)" - "cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")" + "batch_size = get_compilation_batch_size(generated_qpc_path)cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])" ] } ], diff --git a/tests/test_loader.py b/tests/test_loader.py index 0d3df3a01..cc6aa050d 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -24,7 +24,7 @@ } model_names = model_name_to_params_dict.keys() - +#FIXME: Add test cases for passing cache_dir, pretrained_model_path instead of card name, etc., Passing other kwargs @pytest.mark.parametrize("model_name", model_names) def test_qeff_auto_model_for_causal_lm(model_name: str): model = QEFFAutoModel.from_pretrained(model_name) diff --git a/tests/utils.py b/tests/utils.py index f68dd20fb..0eeb94cd8 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -11,7 +11,7 @@ import unittest from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100 +from QEfficient.cross_compile import compile_kv_model_on_cloud_ai_100 from QEfficient.loader.loader_factory import QEFFAutoModelForCausalLM from QEfficient.transformers.transform import transform_lm from QEfficient.utils import hf_download, load_hf_tokenizer From f00226103de85b7007da7fd9fb27b94cd7e97b13 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Fri, 31 May 2024 16:18:32 +0530 Subject: [PATCH 11/20] bug-fix infer Signed-off-by: Onkar Chougule --- QEfficient/cloud/infer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index e72a3329b..537ce2663 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -216,4 +216,5 @@ def main( args = parser.parse_args() if args.verbose: logger.setLevel(logging.INFO) + del args.verbose # type: ignore main(**args.__dict__) From 
9eed62feb0ec3a43a851b7d254667ef1f60e20c7 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Fri, 31 May 2024 16:20:52 +0530 Subject: [PATCH 12/20] using QEfficient.export, compile in cloud APIs Signed-off-by: Onkar Chougule --- QEfficient/cloud/export.py | 3 +-- QEfficient/cloud/infer.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index 2b7201c8e..30f93680b 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -10,7 +10,6 @@ from typing import Optional import QEfficient -from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.loader import QEFFAutoModel from QEfficient.utils import load_hf_tokenizer, onnx_exists from QEfficient.utils.constants import Constants @@ -46,7 +45,7 @@ def main( # Export to the Onnx print(f"Exporting to Pytorch {model_name} to Onnx") - base_path, onnx_path = qualcomm_efficient_converter( + base_path, onnx_path = QEfficient.export( model_kv=qeff_model, model_name=model_name, tokenizer=tokenizer, diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 537ce2663..ce05e39fa 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -69,7 +69,7 @@ def main( logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation") # Compile -> execute # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation - generated_qpc_path = compile( + generated_qpc_path = QEfficient.compile( onnx_path=onnx_model_path, qpc_path=os.path.dirname(qpc_dir_path), num_cores=num_cores, From 605ff865f528f29a3ec4b886320cee642fdb51fc Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Mon, 3 Jun 2024 23:56:17 +0530 Subject: [PATCH 13/20] cleaner infer,epxport APIs Signed-off-by: Onkar Chougule --- QEfficient/cloud/export.py | 63 ++++++----- QEfficient/cloud/infer.py | 107 +++++------------- .../exporter/export_hf_to_cloud_ai_100.py | 26 ++--- QEfficient/utils/__init__.py | 1 + QEfficient/utils/_utils.py | 11 ++ 5 files changed, 86 insertions(+), 122 deletions(-) diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index 30f93680b..51d66570c 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -7,11 +7,12 @@ import argparse import os -from typing import Optional +from typing import Optional, Union -import QEfficient -from QEfficient.loader import QEFFAutoModel -from QEfficient.utils import load_hf_tokenizer, onnx_exists +from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast + +from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter +from QEfficient.utils import onnx_exists from QEfficient.utils.constants import Constants from QEfficient.utils.logging_utils import logger @@ -19,6 +20,36 @@ ROOT_DIR = os.path.dirname(os.path.abspath("")) +def get_onnx_model_path(model_name: str, cache_dir: str, tokenizer: Optional[Union[PreTrainedTokenizerFast, PreTrainedTokenizer]]=None, hf_token: Optional[str] = None): + """ + exports the model to onnx if pre-exported file is not found and returns onnx_model_path + """ + onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name) + if onnx_path_exists: + logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! 
Jumping to Compilation") + else: + ################### + # hf model -> export + #################### + # Export to the Onnx + logger.info(f"Exporting Pytorch {model_name} model to ONNX...") + _, generated_onnx_model_path = qualcomm_efficient_converter( + model_name=model_name, + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + kv=True, + form_factor="cloud", + return_path=True, + hf_token=hf_token, + cache_dir=cache_dir + ) # type: ignore + logger.info(f"Generated Onnx_path {generated_onnx_model_path} \nOnnx_model_path {onnx_model_path} \nand Onnx_dir_path is {onnx_dir_path}") + assert ( + generated_onnx_model_path == onnx_model_path + ), f"ONNX files were generated at an unusual location, expected {onnx_model_path}, got {generated_onnx_model_path}" + return onnx_model_path + + def main( model_name: str, cache_dir: str, @@ -31,29 +62,7 @@ def main( :cache_dir: str. Cache dir to store the downloaded huggingface files. :hf_token: str. HuggingFace login token to access private repos. """ - onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name) - if onnx_path_exists: - logger.warning(f"Generated Onnx files found {onnx_model_path}! Please use Infer/Compile Apis()") - return - - tokenizer = load_hf_tokenizer(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token) - qeff_model = QEFFAutoModel.from_pretrained(pretrained_model_name_or_path=model_name, cache_dir=cache_dir, hf_token=hf_token) - - # Easy and minimal api to update the model to QEff. - QEfficient.transform(qeff_model, form_factor="cloud") - print(f"Model after Optimized transformations {qeff_model}") - - # Export to the Onnx - print(f"Exporting to Pytorch {model_name} to Onnx") - base_path, onnx_path = QEfficient.export( - model_kv=qeff_model, - model_name=model_name, - tokenizer=tokenizer, - kv=True, - form_factor="cloud", - return_path=True, - ) # type: ignore - print(f"Base Path is {base_path} and Onnx Model Path is : {onnx_path}") + get_onnx_model_path(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token) if __name__ == "__main__": diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index ce05e39fa..c3ce86a98 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -11,13 +11,12 @@ from typing import List, Optional import QEfficient -from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter +from QEfficient.cloud.export import get_onnx_model_path from QEfficient.generation.text_generation_inference import ( check_batch_size_and_num_prompts, cloud_ai_100_exec_kv, ) -from QEfficient.loader import QEFFAutoModel -from QEfficient.utils import load_hf_tokenizer, onnx_exists, qpc_exists +from QEfficient.utils import get_qpc_dir_name_infer, load_hf_tokenizer, qpc_exists from QEfficient.utils.constants import Constants from QEfficient.utils.logging_utils import logger @@ -47,95 +46,43 @@ def main( 0, ], ) -> None: - qpc_base_dir_name = ( - f"qpc_{num_cores}cores_{batch_size}BS_{prompt_len}PL_{ctx_len}CL_{mos}MOS_" - + f"{len(device_group)}" - + "devices" - + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else "_fp16_mxint8" if mxint8 else "_fp16") - ) - + qpc_base_dir_name = get_qpc_dir_name_infer(num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group) prompt: List[str] = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) - - # Get tokenizer tokenizer = load_hf_tokenizer(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token) qpc_path_exists, qpc_dir_path = 
qpc_exists(model_name, qpc_base_dir_name) - onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name) - + # Handle qpc generation if qpc_path_exists: - # execute logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! Executing with given prompt") - elif onnx_path_exists: - logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation") - # Compile -> execute - # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation - generated_qpc_path = QEfficient.compile( - onnx_path=onnx_model_path, - qpc_path=os.path.dirname(qpc_dir_path), - num_cores=num_cores, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, - mxfp6=mxfp6, - mxint8=mxint8, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - ) - assert ( - generated_qpc_path == qpc_dir_path - ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" else: - ############################################# - # hf model -> export -> compile -> execute - ############################################# - # Load hf model - qeff_model = QEFFAutoModel.from_pretrained(pretrained_model_name_or_path=model_name, cache_dir=cache_dir, hf_token=hf_token) - - # Easy and minimal api to update the model to QEff. - qeff_opt_model = QEfficient.transform(qeff_model, form_factor="cloud") - logger.info(f"Model after Optimized transformations {qeff_opt_model}") - - # Export to the Onnx - logger.info(f"Exporting Pytorch {model_name} model to ONNX...") - # Need to split below function into two functions one which always takes QEFFAutoModel and other with same interface as below - base_path, generated_onnx_path = qualcomm_efficient_converter( - model_name=model_name, - model_kv=qeff_opt_model, # type: ignore - tokenizer=tokenizer, - onnx_dir_path=onnx_dir_path, - kv=True, - form_factor="cloud", - return_path=True, - ) # type: ignore - print(f"Generated Onnx_path {generated_onnx_path} and Onnx_model_path {onnx_model_path} and Onnx_dir_path is {onnx_dir_path}") - assert ( - generated_onnx_path == onnx_model_path - ), f"ONNX files were generated at an unusual location, expected {onnx_model_path}, got {generated_onnx_path}" - logger.info(f"Base Path is {base_path} and Onnx Model Path is : {generated_onnx_path}") + # ################## + # HF model -> export + #################### + onnx_model_path = get_onnx_model_path(model_name, cache_dir, tokenizer, hf_token) + ######### # Compile - # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation + ######### generated_qpc_path = QEfficient.compile( - onnx_path=onnx_model_path, - qpc_path=os.path.dirname(qpc_dir_path), - num_cores=num_cores, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, - mxfp6=mxfp6, - mxint8=mxint8, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - ) + onnx_path=onnx_model_path, + qpc_path=os.path.dirname(qpc_dir_path), # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation + num_cores=num_cores, + batch_size=batch_size, + prompt_len=prompt_len, + ctx_len=ctx_len, + mxfp6=mxfp6, + mxint8=mxint8, + aic_enable_depth_first=aic_enable_depth_first, + mos=mos, + device_group=device_group, + ) assert ( - qpc_dir_path == generated_qpc_path - ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" - 
logger.info(f"Compiled qpc files can be found at : {generated_qpc_path}") - - + generated_qpc_path == qpc_dir_path + ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" + + ######### # Execute + ######### cloud_ai_100_exec_kv( batch_size, tokenizer=tokenizer, diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index d5da3f422..38e0e34d5 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -365,19 +365,12 @@ def export_kvstyle_transformed_model_to_onnx(model_name: str, transformed_model: return fp32_model_name, fp16_model_name -def export_for_edge() -> None: - # [TODO]: Apply the class transformation to make changes for the KV models in edge use cases - # model = QEfficient.transform(model_hf, type="Transformers", form_factor="edge") - # model.eval() - raise NotImplementedError("Oops...reached too far!!") - - def export_for_cloud(model_name: str, qeff_model: QEFFBaseModel, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], onnx_dir_path: str, seq_length: int = Constants.seq_length, return_path: bool = True, save_fp32_onnx: bool = False, - save_fp16_onnx: bool = True): + save_fp16_onnx: bool = True)-> Tuple[str, str]: if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: # type: ignore return export_lm_model_for_cloud(model_name=model_name, qeff_model=qeff_model, # type: ignore @@ -443,6 +436,7 @@ def qualcomm_efficient_converter( model_name: str, model_kv: QEFFBaseModel = None, # type: ignore tokenizer: Optional[Union[PreTrainedTokenizer, PreTrainedTokenizerFast]]=None, + cache_dir: Optional[str] = None, onnx_dir_path: Optional[str]=None, hf_token: Optional[str] = None, seq_length: int = Constants.seq_length, @@ -451,17 +445,17 @@ def qualcomm_efficient_converter( form_factor: str="cloud", save_fp32_onnx: bool = False, save_fp16_onnx: bool = True, -) -> Union[Tuple[str, str], None]: +) -> Tuple[str, str]: """ Function to convert the input string using the specified model and returns the result. Args: model_name (str): The name of the model to be used. - model_class (type): The class of the model. model_kv (torch.nn.Module): Transformed KV torch model to be used tokenizer (HF AutoTokenizer): Tokenzier to prepare inputs. + cache_dir (str): Path to cache dir if not specified, default HF cache_dir will be used. onnx_dir_path (str, optional): The path where the model is stored. If None, the model is loaded from the default location. - token (bool): If True, an authentication token will be used. Default is False. + hf_token (bool): If True, an authentication token will be used. Default is False. seq_len (int, optional): The length of the sequence. Default is 128. kv (bool): If True, key-value pairs will be used. Default is True. 
return_path (bool): If True, return the base path for models and exported onnx model path @@ -473,7 +467,7 @@ def qualcomm_efficient_converter( """ # Get model_kv first - model_kv = model_kv if model_kv else QEFFAutoModel.from_pretrained(pretrained_model_name_or_path=model_name, hf_token=hf_token) + model_kv = model_kv if model_kv else QEFFAutoModel.from_pretrained(pretrained_model_name_or_path=model_name, hf_token=hf_token, cache_dir=cache_dir) # Transform if required if model_kv.is_transformed and not kv: @@ -481,13 +475,12 @@ def qualcomm_efficient_converter( model_kv = model_kv if model_kv.is_transformed else QEfficient.transform(model_kv) if kv else model_kv - if onnx_dir_path is None: model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) onnx_dir_path = os.path.join(model_card_dir, "onnx") # Load tokenizer if not passed - tokenizer = load_hf_tokenizer(model_name=model_name, hf_token=hf_token) if tokenizer is None else tokenizer + tokenizer = tokenizer if tokenizer else load_hf_tokenizer(model_name=model_name, hf_token=hf_token, cache_dir=cache_dir) if form_factor == "cloud": return export_for_cloud( @@ -500,4 +493,7 @@ def qualcomm_efficient_converter( save_fp16_onnx=save_fp16_onnx, save_fp32_onnx=save_fp32_onnx) else: - return export_for_edge() + # [TODO]: Apply the class transformation to make changes for the KV models in edge use cases + # model = QEfficient.transform(model_hf, type="Transformers", form_factor="edge") + # model.eval() + raise NotImplementedError("Oops! Reached too far!!") diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 7a3cd4959..bd6b59120 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -6,6 +6,7 @@ # ----------------------------------------------------------------------------- from QEfficient.utils._utils import ( # noqa: F401 + get_qpc_dir_name_infer, hf_download, load_hf_tokenizer, login_and_download_hf_lm, diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 4c2ad177c..7a0d85828 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -129,3 +129,14 @@ def load_hf_tokenizer(model_name: str, cache_dir: Optional[str] = None, hf_token #FIXME(ochougul): should this always return left padded tokenizer? 
tokenizer = AutoTokenizer.from_pretrained(model_hf_path, padding_side=padding_side, trust_remote_code=True, **kwargs) return tokenizer + + +def get_qpc_dir_name_infer(num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group): + qpc_base_dir_name = ( + f"qpc_{num_cores}cores_{batch_size}BS_{prompt_len}PL_{ctx_len}CL_{mos}MOS_" + + f"{len(device_group)}" + + "devices" + + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else "_fp16_mxint8" if mxint8 else "_fp16") + ) + + return qpc_base_dir_name From b2c5fc7275b5efed3ba1c149fd281b48c96f631f Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 4 Jun 2024 19:37:53 +0530 Subject: [PATCH 14/20] addressed review comments Signed-off-by: Onkar Chougule --- QEfficient/__init__.py | 4 +- QEfficient/cloud/infer.py | 4 +- QEfficient/{ => compile}/cross_compile.py | 0 .../exporter/export_hf_to_cloud_ai_100.py | 14 +- QEfficient/loader/loader_factory.py | 130 ------------------ QEfficient/{loader => src}/__init__.py | 3 +- QEfficient/src/_transformers/__init__.py | 0 QEfficient/src/_transformers/auto.py | 111 +++++++++++++++ QEfficient/src/base.py | 47 +++++++ .../{loader/loader.py => src/common.py} | 16 ++- QEfficient/transformers/transform.py | 5 +- tests/test_loader.py | 5 +- tests/utils.py | 4 +- 13 files changed, 190 insertions(+), 153 deletions(-) rename QEfficient/{ => compile}/cross_compile.py (100%) delete mode 100644 QEfficient/loader/loader_factory.py rename QEfficient/{loader => src}/__init__.py (63%) create mode 100644 QEfficient/src/_transformers/__init__.py create mode 100644 QEfficient/src/_transformers/auto.py create mode 100644 QEfficient/src/base.py rename QEfficient/{loader/loader.py => src/common.py} (91%) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 9804c4ea1..aac0bcd29 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -5,9 +5,9 @@ # # ----------------------------------------------------------------------------- -from QEfficient.cross_compile import compile # noqa: F401 +from QEfficient.compile.cross_compile import compile # noqa: F401 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.loader import QEFFAutoModel # noqa: F401 +from QEfficient.src import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader # noqa: F401 from QEfficient.transformers.transform import transform # noqa: F401 # Users can use QEfficient.export for exporting models to ONNX diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index c3ce86a98..58fe96660 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -55,9 +55,7 @@ def main( if qpc_path_exists: logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! 
Executing with given prompt") else: - # ################## - # HF model -> export - #################### + # Handle onnx model generation onnx_model_path = get_onnx_model_path(model_name, cache_dir, tokenizer, hf_token) ######### diff --git a/QEfficient/cross_compile.py b/QEfficient/compile/cross_compile.py similarity index 100% rename from QEfficient/cross_compile.py rename to QEfficient/compile/cross_compile.py diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index 38e0e34d5..d55fedffc 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -7,19 +7,16 @@ import os import shutil -from typing import Optional, Tuple, Type, Union +from typing import Optional, Tuple, Union import torch from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast import QEfficient from QEfficient.exporter.export_utils import export_onnx, fix_onnx_fp16, generate_input_files, run_model_on_ort -from QEfficient.loader.loader import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFFAutoModel -from QEfficient.loader.loader_factory import ( - QEFF_MODEL_TYPE, - QEFFAutoModelForCausalLM, - QEFFBaseModel, -) +from QEfficient.src._transformers.auto import QEFFAutoModelForCausalLM +from QEfficient.src.base import QEFFBaseModel +from QEfficient.src.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFCommonLoader from QEfficient.utils._utils import load_hf_tokenizer from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants from QEfficient.utils.logging_utils import logger @@ -371,6 +368,7 @@ def export_for_cloud(model_name: str, qeff_model: QEFFBaseModel, return_path: bool = True, save_fp32_onnx: bool = False, save_fp16_onnx: bool = True)-> Tuple[str, str]: + # FIXME: move all this to class instead of here, and just call qeff_model.export here. if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: # type: ignore return export_lm_model_for_cloud(model_name=model_name, qeff_model=qeff_model, # type: ignore @@ -467,7 +465,7 @@ def qualcomm_efficient_converter( """ # Get model_kv first - model_kv = model_kv if model_kv else QEFFAutoModel.from_pretrained(pretrained_model_name_or_path=model_name, hf_token=hf_token, cache_dir=cache_dir) + model_kv = model_kv if model_kv else QEFFCommonLoader.from_pretrained(pretrained_model_name_or_path=model_name, hf_token=hf_token, cache_dir=cache_dir) # Transform if required if model_kv.is_transformed and not kv: diff --git a/QEfficient/loader/loader_factory.py b/QEfficient/loader/loader_factory.py deleted file mode 100644 index ede3a6c85..000000000 --- a/QEfficient/loader/loader_factory.py +++ /dev/null @@ -1,130 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# ---------------------------------------------------------------------------- - -""" -** This file for holds the classes that handle main functions -1.load i.e. from_pretrained -2.execute -3.transform -4.export -5.compile -For different varities of Transformer Models - -** Each variety of the Transformer model that has different way of doing any of the above functions will have it's own class i.e. -following models type will have their own class which must inherit QEFFBaseModel abstract class. 
-1.Causal Language Models -2.Diffusion -3.Quantized models - -** QEFFBASEModel is abstract base class that defines the basic structure of these classes. -""" -from abc import ABC, abstractmethod -from enum import Enum -from typing import Any - -import torch.nn as nn -from transformers import AutoModelForCausalLM -from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING - -import QEfficient -from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict - - -class QEFF_MODEL_TYPE(Enum): - """ - Defines Names of the different varities of transformer models. - """ - CAUSALLM = "LLM" - DIFFUSION = "STABLE_DIFFUSION" - AWQ = "AWQ" - - -class QEFFBaseModel(ABC): - """ - This class acts as parent class for all the varieties of model class (i.e. LLMs, SD, quantized etc.). - Enforces certain methods to be implemented by child classes. - - All the child classes must provide way to load, transform(optimize), exoprt to ONNX etc. capabilities. - """ - def __init__(self) -> None: - super().__init__() - # Users can call generate or execute - self.generate = self.execute - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): - raise NotImplementedError("Must implement for child classes") - - @property - def is_transformed(self) -> bool: - raise NotImplementedError("Must implement for child classes") - - @abstractmethod - def transform_export(self, *args, **kwargs) -> Any: - pass - - @abstractmethod - def transform_export_compile(self, *args, **kwargs) -> Any: - pass - - @abstractmethod - def execute(self, *args, **kwargs) -> Any: - pass - - @abstractmethod - def transform(self, *args, **kwargs) -> Any: - pass - - @abstractmethod - def export(self, *args, **kwargs) -> Any: - pass - - @abstractmethod - def compile(self, *args, **kwargs) -> Any: - pass - - -class QEFFAutoModelForCausalLM(QEFFBaseModel): - """ - QEFF class for manipulating any causal language model from HuggingFace hub. - """ - def __init__(self, model: nn.Module, pretrained_model_name_or_path: str) -> None: - assert (model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values() or - model.__class__ in TransformersToQEffModulesDict.values()), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. 
{MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore - self.model: nn.Module = model - self.model_files_path = pretrained_model_name_or_path - - def __repr__(self) -> str: - return self.model.__repr__() - - @property - def is_transformed(self) -> bool: - return getattr(self.model, "qeff_transformed", False) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): - model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) - return cls(model=model, pretrained_model_name_or_path=pretrained_model_name_or_path) - - def transform_export(self, *args, **kwargs) -> Any: - raise NotImplementedError("Reached too far!!") - - def transform_export_compile(self, *args, **kwargs) -> Any: - raise NotImplementedError("Reached too far!!") - - def execute(self, *args, **kwargs): # type: ignore - raise NotImplementedError("Reached too far!!") - - def transform(self): - QEfficient.transform(self) - return self - - def export(self): - raise NotImplementedError("Reached too far!!") - - def compile(self, *args, **kwargs) -> Any: - raise NotImplementedError("Reached too far!!") diff --git a/QEfficient/loader/__init__.py b/QEfficient/src/__init__.py similarity index 63% rename from QEfficient/loader/__init__.py rename to QEfficient/src/__init__.py index a17f497b5..854686567 100644 --- a/QEfficient/loader/__init__.py +++ b/QEfficient/src/__init__.py @@ -5,4 +5,5 @@ # # ----------------------------------------------------------------------------- -from QEfficient.loader.loader import QEFFAutoModel # noqa: F401 +from QEfficient.src._transformers.auto import QEffAutoModel, QEFFAutoModelForCausalLM # noqa: F401 +from QEfficient.src.common import QEFFCommonLoader # noqa: F401 diff --git a/QEfficient/src/_transformers/__init__.py b/QEfficient/src/_transformers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/QEfficient/src/_transformers/auto.py b/QEfficient/src/_transformers/auto.py new file mode 100644 index 000000000..68f039060 --- /dev/null +++ b/QEfficient/src/_transformers/auto.py @@ -0,0 +1,111 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- + +""" +** This file for holds the classes that handle main functions +1.load i.e. from_pretrained +2.execute +3.transform +4.export +5.compile +For different varities of Transformer Models + +Representation of class inheritence followed keeping in line with transformers/diffusers repos -> + + QEFFBaseModel + ________________________________________________|________________________________________________________________ + | | + QEFFTransformersBase QEFFDiffusersBase + | | + ____________|________________________________________________________ ________________ _________________|______________ + _____ | | | | | | + | QEFFAutoModel QEFFAutoModelForCausalLM QEFFAWQModelForCausalLM ... ... ... +QEFFCommonLoader -| [Provides way to [Provides way to do 1-5 on [Supports 1-5 for +[Provides | do steps 1-5 on transformers.AutoModelForCausalLM] AWQ Models] +interface to |_____ transformers.AutoModel] +Load any of +These models +by automatically +detecting the type +of the model] + +** QEFFBASEModel is abstract base class that defines the basic structure of these classes. 
+** QEFFPipeline classes will stay at the same level as QEFFAutoModel in this hierarchy in future. +""" +from typing import Any + +import torch.nn as nn +from transformers.models.auto import AutoModel, AutoModelForCausalLM +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING + +import QEfficient +from QEfficient.src.base import QEFFBaseModel +from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict + +# Dictionary that defines the interface from transformers to be used underneath the QEFF interface +QEFFAutoModelToTransformersAutoModelMap = { + "QEFFAutoModelForCausalLM": AutoModelForCausalLM, + "QEFFAutoModel": AutoModel, +} + + +class QEFFTransformersBase(QEFFBaseModel): + """ + Parent class for models QEFF provides from transformers i.e. (AutoModel, AutoModelForCausalLM, AutoModelForAudioClassification etc.) from src/transformers/models/auto/modeling_auto.py file. + """ + def __init__(self, model: nn.Module) -> None: + assert (model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values() or + # FIXME: Use model architectures here instead of complete dictionary TransformersToQEffModulesDict + model.__class__ in TransformersToQEffModulesDict.values()), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. {MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore + self.model: nn.Module = model + + def __repr__(self) -> str: + return self.model.__repr__() + + @property + def is_transformed(self) -> bool: + return getattr(self.model, "qeff_transformed", False) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): + model = QEFFAutoModelToTransformersAutoModelMap[cls.__name__].from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + return cls(model) + + def transform_export(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") + + def transform_export_compile(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") + + def transform(self): + QEfficient.transform(self) + return self + + +class QEFFAutoModelForCausalLM(QEFFTransformersBase): + """ + QEFF class for manipulating any causal language model from HuggingFace hub. + """ + def execute(self, *args, **kwargs): # type: ignore + raise NotImplementedError("Reached too far!!") + + def export(self): + raise NotImplementedError("Reached too far!!") + + def compile(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") + + +class QEffAutoModel(QEFFTransformersBase): + def execute(self, *args, **kwargs): # type: ignore + raise NotImplementedError("Reached too far!!") + + def export(self): + raise NotImplementedError("Reached too far!!") + + def compile(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") diff --git a/QEfficient/src/base.py b/QEfficient/src/base.py new file mode 100644 index 000000000..bb96fc075 --- /dev/null +++ b/QEfficient/src/base.py @@ -0,0 +1,47 @@ +from abc import ABC, abstractmethod +from typing import Any + + +class QEFFBaseModel(ABC): + """ + This class acts as parent class for all the varieties of model class (i.e. LLMs, SD, quantized etc.). + Enforces certain methods to be implemented by child classes. + + All the child classes must provide way to load, transform(optimize), exoprt to ONNX etc. capabilities. 
+ """ + def __init__(self) -> None: + super().__init__() + # Users can call generate or execute + self.generate = self.execute + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): + raise NotImplementedError("Must implement for child classes") + + @property + def is_transformed(self) -> bool: + raise NotImplementedError("Must implement for child classes") + + @abstractmethod + def transform_export(self, *args, **kwargs) -> Any: + pass + + @abstractmethod + def transform_export_compile(self, *args, **kwargs) -> Any: + pass + + @abstractmethod + def execute(self, *args, **kwargs) -> Any: + pass + + @abstractmethod + def transform(self, *args, **kwargs) -> Any: + pass + + @abstractmethod + def export(self, *args, **kwargs) -> Any: + pass + + @abstractmethod + def compile(self, *args, **kwargs) -> Any: + pass \ No newline at end of file diff --git a/QEfficient/loader/loader.py b/QEfficient/src/common.py similarity index 91% rename from QEfficient/loader/loader.py rename to QEfficient/src/common.py index c0da35421..bca391097 100644 --- a/QEfficient/loader/loader.py +++ b/QEfficient/src/common.py @@ -12,14 +12,26 @@ QEFFAutoModel provides a common interface for loading the HuggingFace models using either the HF card name of local path of downloaded model. """ import os +from enum import Enum from typing import Any, Dict, Type from transformers import AutoConfig from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING -from QEfficient.loader.loader_factory import QEFF_MODEL_TYPE, QEFFAutoModelForCausalLM, QEFFBaseModel +from QEfficient.src._transformers.auto import QEFFAutoModelForCausalLM +from QEfficient.src.base import QEFFBaseModel from QEfficient.utils._utils import login_and_download_hf_lm + +class QEFF_MODEL_TYPE(Enum): + """ + Defines Names of the different varities of transformer models. + """ + CAUSALLM = "LLM" + DIFFUSION = "STABLE_DIFFUSION" + AWQ = "AWQ" + + MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP: Dict[QEFF_MODEL_TYPE, Type[QEFFBaseModel]] = { QEFF_MODEL_TYPE.CAUSALLM: QEFFAutoModelForCausalLM } @@ -50,7 +62,7 @@ def get_hf_model_type(hf_model_path: str) -> QEFF_MODEL_TYPE: raise NotImplementedError(f"model type {type(config)} is not yet supported") -class QEFFAutoModel: +class QEFFCommonLoader: """ Provides HuggingFace model loading interface same as transformers APIs. Supports loading any model on HuggingFace. diff --git a/QEfficient/transformers/transform.py b/QEfficient/transformers/transform.py index aaddd75fa..3a520d0ed 100644 --- a/QEfficient/transformers/transform.py +++ b/QEfficient/transformers/transform.py @@ -10,8 +10,8 @@ import torch.nn as nn import transformers -from QEfficient.loader.loader import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP -from QEfficient.loader.loader_factory import QEFF_MODEL_TYPE, QEFFBaseModel +from QEfficient.src.base import QEFFBaseModel +from QEfficient.src.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFCommonLoader from QEfficient.transformers.modeling_attn_mask_utils import ( QEffAttentionMaskConverter, _qeff_prepare_4d_attention_mask, @@ -115,6 +115,7 @@ def transform(model: QEFFBaseModel, form_factor="cloud"): form_factor(str): form factor configuration for optmizing the model, available options=["cloud", "edge"]. """ assert form_factor == "cloud", "Only form_factor='cloud' is supported as of now!" 
+ #FIXME: move this to class and use model.transform() if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: transform_lm(model.model) # type: ignore return model diff --git a/tests/test_loader.py b/tests/test_loader.py index cc6aa050d..5c626361b 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -11,8 +11,7 @@ from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel import QEfficient -from QEfficient import QEFFAutoModel -from QEfficient.loader.loader_factory import QEFFAutoModelForCausalLM +from QEfficient import QEFFAutoModelForCausalLM, QEFFCommonLoader model_name_to_params_dict : Dict[str, Dict[str, Any]] = { "gpt2": { @@ -27,7 +26,7 @@ #FIXME: Add test cases for passing cache_dir, pretrained_model_path instead of card name, etc., Passing other kwargs @pytest.mark.parametrize("model_name", model_names) def test_qeff_auto_model_for_causal_lm(model_name: str): - model = QEFFAutoModel.from_pretrained(model_name) + model = QEFFCommonLoader.from_pretrained(model_name) assert isinstance(model, model_name_to_params_dict[model_name]['qeff_class']) assert isinstance(model.model, model_name_to_params_dict[model_name]['hf_class']) # type: ignore diff --git a/tests/utils.py b/tests/utils.py index 0eeb94cd8..18de39622 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -10,9 +10,9 @@ import shutil import unittest +from QEfficient import QEFFAutoModelForCausalLM +from QEfficient.compile.cross_compile import compile_kv_model_on_cloud_ai_100 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.cross_compile import compile_kv_model_on_cloud_ai_100 -from QEfficient.loader.loader_factory import QEFFAutoModelForCausalLM from QEfficient.transformers.transform import transform_lm from QEfficient.utils import hf_download, load_hf_tokenizer from QEfficient.utils.constants import QEFF_MODELS_DIR, ROOT_DIR, Constants From 0e1bb53cd281a3d86f3903eec8c7ec8f71543673 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 4 Jun 2024 20:01:02 +0530 Subject: [PATCH 15/20] *updated notebooks, readme *moved class desc to base.py *Added Runtime Enum Signed-off-by: Onkar Chougule --- QEfficient/src/_transformers/auto.py | 31 ----------------- QEfficient/src/base.py | 52 ++++++++++++++++++++++++++++ README.md | 4 +-- notebooks/QEfficientGPT2.ipynb | 8 ++--- notebooks/QEfficientMPT.ipynb | 4 +-- 5 files changed, 60 insertions(+), 39 deletions(-) diff --git a/QEfficient/src/_transformers/auto.py b/QEfficient/src/_transformers/auto.py index 68f039060..40877ea99 100644 --- a/QEfficient/src/_transformers/auto.py +++ b/QEfficient/src/_transformers/auto.py @@ -5,37 +5,6 @@ # # ---------------------------------------------------------------------------- -""" -** This file for holds the classes that handle main functions -1.load i.e. from_pretrained -2.execute -3.transform -4.export -5.compile -For different varities of Transformer Models - -Representation of class inheritence followed keeping in line with transformers/diffusers repos -> - - QEFFBaseModel - ________________________________________________|________________________________________________________________ - | | - QEFFTransformersBase QEFFDiffusersBase - | | - ____________|________________________________________________________ ________________ _________________|______________ - _____ | | | | | | - | QEFFAutoModel QEFFAutoModelForCausalLM QEFFAWQModelForCausalLM ... ... ... 
-QEFFCommonLoader -| [Provides way to [Provides way to do 1-5 on [Supports 1-5 for -[Provides | do steps 1-5 on transformers.AutoModelForCausalLM] AWQ Models] -interface to |_____ transformers.AutoModel] -Load any of -These models -by automatically -detecting the type -of the model] - -** QEFFBASEModel is abstract base class that defines the basic structure of these classes. -** QEFFPipeline classes will stay at the same level as QEFFAutoModel in this hierarchy in future. -""" from typing import Any import torch.nn as nn diff --git a/QEfficient/src/base.py b/QEfficient/src/base.py index bb96fc075..ddc23fc87 100644 --- a/QEfficient/src/base.py +++ b/QEfficient/src/base.py @@ -1,7 +1,54 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- + +""" +** This file for holds the classes that handle main functions +1.load i.e. from_pretrained +2.execute +3.transform +4.export +5.compile +For different varities of Transformer Models + +Representation of class inheritence followed keeping in line with transformers/diffusers repos -> + + QEFFBaseModel + ________________________________________________|________________________________________________________________ + | | + QEFFTransformersBase QEFFDiffusersBase + | | + ____________|________________________________________________________ ________________ _________________|______________ + _____ | | | | | | + | QEFFAutoModel QEFFAutoModelForCausalLM QEFFAWQModelForCausalLM ... ... ... +QEFFCommonLoader -| [Provides way to [Provides way to do 1-5 on [Supports 1-5 for +[Provides | do steps 1-5 on transformers.AutoModelForCausalLM] AWQ Models] +interface to |_____ transformers.AutoModel] +Load any of +These models +by automatically +detecting the type +of the model] + +** QEFFBASEModel is abstract base class that defines the basic structure of these classes. +** QEFFPipeline classes will stay at the same level as QEFFAutoModel in this hierarchy in future. +""" + from abc import ABC, abstractmethod +from enum import Enum from typing import Any +#Defining placeholder ENUM for execute function +class Runtime(Enum): + CPU_ORT = "CPU ONNX Runtime" + CPU_PT = "CPU PyTorch Runtime" + AI_100 = "AI_100" + + class QEFFBaseModel(ABC): """ This class acts as parent class for all the varieties of model class (i.e. LLMs, SD, quantized etc.). @@ -13,6 +60,11 @@ def __init__(self) -> None: super().__init__() # Users can call generate or execute self.generate = self.execute + self._runtime = Runtime.CPU_PT + + @property + def runtime(self) -> Runtime: + return self._runtime @classmethod def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): diff --git a/README.md b/README.md index 634e3add3..1d830f0b8 100644 --- a/README.md +++ b/README.md @@ -205,7 +205,7 @@ import os import QEfficient -from QEfficient import QEFFAutoModel +from QEfficient import QEFFAutoModelForCausalLM # Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir. 
# os.environ["TRANSFORMERS_CACHE"] = "/local/mnt/workspace/hf_cache" @@ -216,7 +216,7 @@ from QEfficient import QEFFAutoModel # Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl model_name = "gpt2" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib. -qeff_model = QEFFAutoModel.from_pretrained(model_name, cache_dir=None) +qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, cache_dir=None) print(f"{model_name} from hugging-face \n", qeff_model) # Easy and minimal api to update the model diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 37d5ebb40..bedb14049 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -26,18 +26,18 @@ "# Initiate the Orignal Transformer model\n", "import os\n", "\n", - "from QEfficient import QEFFAutoModel\n", + "from QEfficient import QEFFAutoModelForCausalLM\n", "\n", "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n", "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n", "\n", - "#ROOT_DIR = os.path.dirname(os.path.abspath(\"\"))\n", - "#CACHE_DIR = os.path.join(ROOT_DIR, \"tmp\"), you can use a different location for just one model by passing this param as cache_dir in below API.\n", + "# ROOT_DIR = os.path.dirname(os.path.abspath(\"\"))\n", + "# CACHE_DIR = os.path.join(ROOT_DIR, \"tmp\") #, you can use a different location for just one model by passing this param as cache_dir in below API.\n", "\n", "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n", "model_name = \"gpt2\" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n", "\n", - "qeff_model = QEFFAutoModel.from_pretrained(model_name, cache_dir=None)\n", + "qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, cache_dir=CACHE_DIR)\n", "print(f\"{model_name} from hugging-face \\n\", qeff_model)" ] }, diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index 9ca0c389a..6d5204f55 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -26,7 +26,7 @@ "# Initiate the Orignal Transformer model\n", "import os\n", "\n", - "from QEfficient import QEFFAutoModel\n", + "from QEfficient import QEFFAutoModelForCausalLM\n", "\n", "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n", "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n", @@ -36,7 +36,7 @@ "\n", "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n", "model_name = \"mosaicml/mpt-7b\" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n", - "qeff_model = QEFFAutoModel.from_pretrained(model_name)\n", + "qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)\n", "print(f\"{qeff_model} from hugging-face \\n\", qeff_model)" ] }, From 5f751f57b15c079936af48679eb962e099aa2a05 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 4 Jun 2024 20:16:04 +0530 Subject: [PATCH 16/20] updated cloud_ai_100_exec_kv to be callable from QEfficient package Signed-off-by: Onkar Chougule --- QEfficient/__init__.py | 1 + README.md | 2 +- notebooks/QEfficientGPT2.ipynb | 212 +++++++++++++++++++++++++++++++-- 
notebooks/QEfficientMPT.ipynb | 5 +- 4 files changed, 204 insertions(+), 16 deletions(-) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index aac0bcd29..04e0f825c 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -7,6 +7,7 @@ from QEfficient.compile.cross_compile import compile # noqa: F401 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter +from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv # noqa: F401 from QEfficient.src import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader # noqa: F401 from QEfficient.transformers.transform import transform # noqa: F401 diff --git a/README.md b/README.md index 1d830f0b8..89a02859c 100644 --- a/README.md +++ b/README.md @@ -190,7 +190,7 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 | QEfficient.transform | [click here](#1-model-download-and-transform) |
  • model : $\color{green} {Mandatory}$
  • form_factor : Optional [Default-"cloud"]
  • | | QEfficient.export | [click here](#2-onnx-export-of-transformed-model) |
  • model_name : $\color{green} {Mandatory}$
  • model_kv : Optional
  • tokenizer : Optional
  • onnx_path : Optional
  • hf_token : Optional
  • seq_length : Optional [Default-128]
  • kv : Optional [Default-$\color{green} {True}$]
  • return_path : Optional [Default-False]
  • form_factor : Optional [Default-"cloud"]
  • ***save_fp32_onnx : Optional [Default-False]
  • ***save_fp16_onnx : Optional [Default-True]
  • | | QEfficient.compile | [click here](#3-compile-on-cloud-ai-100) |
  • onnx_path : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • aic_enable_depth_first : Optional [Default-False]
  • mos : Optional [Default= -1]
  • mxint8 : Optional [Default-False]
  • mxfp6 : Optional [Default-True]
  • | -|cloud_ai_100_exec_kv | [click here](#4-run-benchmark) |
  • batch_size : $\color{green} {Mandatory}$
  • tokenizer : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • | +|QEfficient.cloud_ai_100_exec_kv | [click here](#4-run-benchmark) |
  • batch_size : $\color{green} {Mandatory}$
  • tokenizer : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • | **One argument, prompt or prompts_txt_file_path must be passed.
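For reference, a minimal sketch of calling the benchmark API from the table above with the file-based prompt input — the model card, QPC path, and prompt file below are placeholders, not values from this patch:

```Python
import QEfficient
from QEfficient.utils import load_hf_tokenizer

tokenizer = load_hf_tokenizer(model_name="gpt2")  # placeholder model card
QEfficient.cloud_ai_100_exec_kv(
    batch_size=1,
    tokenizer=tokenizer,
    qpc_path="qeff_models/gpt2/qpcs",        # placeholder path to the compiled QPC
    device_id=[0],
    prompts_txt_file_path="prompts.txt",     # or pass prompt=["My name is"] instead
)
```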
    ***Both save_fp32_onnx and save_fp16_onnx can't be false. diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index bedb14049..40e7b6a21 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -18,10 +18,53 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "c21f82d5-17df-4fc9-a180-05edd032f02d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr2/ochougul/.pyenv/versions/3.8.19/envs/py38/lib/python3.8/site-packages/transformers/utils/hub.py:123: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "gpt2 from hugging-face \n", + " GPT2LMHeadModel(\n", + " (transformer): GPT2Model(\n", + " (wte): Embedding(50257, 768)\n", + " (wpe): Embedding(1024, 768)\n", + " (drop): Dropout(p=0.1, inplace=False)\n", + " (h): ModuleList(\n", + " (0-11): 12 x GPT2Block(\n", + " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (attn): GPT2Attention(\n", + " (c_attn): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (attn_dropout): Dropout(p=0.1, inplace=False)\n", + " (resid_dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): GPT2MLP(\n", + " (c_fc): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (act): NewGELUActivation()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " (lm_head): Linear(in_features=768, out_features=50257, bias=False)\n", + ")\n" + ] + } + ], "source": [ "# Initiate the Orignal Transformer model\n", "import os\n", @@ -37,7 +80,7 @@ "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n", "model_name = \"gpt2\" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n", "\n", - "qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, cache_dir=CACHE_DIR)\n", + "qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)\n", "print(f\"{model_name} from hugging-face \\n\", qeff_model)" ] }, @@ -58,10 +101,52 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "a4543b94-9b50-4bcc-90c6-484ab694c9a6", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[33;20mWARNING - QEfficient - The model layers has been upadted to QEff layers in-place\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model after Optimized transformations \n", + " QEffGPT2LMHeadModel(\n", + " (transformer): QEffGPT2Model(\n", + " (wte): Embedding(50257, 768)\n", + " (wpe): Embedding(1024, 768)\n", + " (drop): Dropout(p=0.1, inplace=False)\n", + " (h): ModuleList(\n", + " (0-11): 12 x QEffGPT2Block(\n", + " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (attn): QEffGPT2Attention(\n", + " (c_attn): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (attn_dropout): Dropout(p=0.1, inplace=False)\n", + " (resid_dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): GPT2MLP(\n", + " (c_fc): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (act): 
NewGELUActivation()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " (lm_head): Linear(in_features=768, out_features=50257, bias=False)\n", + ")\n" + ] + } + ], "source": [ "import QEfficient\n", "\n", @@ -81,10 +166,69 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "3fb4d6dd-9973-4608-b68b-ec6825cfef0e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5e7fe36d84a24006ba52887588e9935a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fetching 11 files: 0%| | 0/11 [00:00 0 else None,\n", + "[W export.cpp:565] Warning: Custom opset domain: 'com.qti.aisw.onnx' provided is not used in the model. Please verify custom opset domain names. (function GraphEncoder)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============== Diagnostic Run torch.onnx.export version 2.0.0+cpu ==============\n", + "verbose: False, log level: Level.ERROR\n", + "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n", + "\n", + "\n", + "=============== PyTorch vs. fp32 ONNXRT (MAD) ===============\n", + "\n", + "logits \t\t 7.62939453125e-05\n", + "attention_mask_RetainedState \t\t 0.0\n", + "past_keys (mean) \t\t 2.635022004445394e-06\n", + "past_value (mean) \t\t 5.5730342864990234e-06\n", + "\n", + "=============================================================\n", + "\n", + "\n", + "=============== PyTorch vs. fp16 ONNXRT (MAD) ===============\n", + "\n", + "logits \t\t 7.62939453125e-05\n", + "attention_mask_RetainedState \t\t 0.0\n", + "past_keys (mean) \t\t 2.635022004445394e-06\n", + "past_value (mean) \t\t 5.5730342864990234e-06\n", + "\n", + "=============================================================\n", + "\n" + ] + } + ], "source": [ "from QEfficient.utils import load_hf_tokenizer\n", "# We can now export the modified models to Onnx framework\n", @@ -119,10 +263,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "e48be5da-02a1-4d7e-9b5f-a6dcca141d4b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running AI 100 compiler: /opt/qti-aic/exec/qaic-exec -m=/local/mnt/workspace/open-source/myown/efficient-transformers/qeff_models/gpt2/onnx/gpt2_kv_clipped_fp16.onnx -aic-hw -aic-hw-version=2.0 -network-specialization-config=/local/mnt/workspace/open-source/myown/efficient-transformers/qeff_models/gpt2/specializations.json -convert-to-fp16 -retained-state -aic-num-cores=14 -custom-IO-list-file=/local/mnt/workspace/open-source/myown/efficient-transformers/qeff_models/gpt2/onnx/custom_io_fp16.yaml -compile-only -aic-binary-dir=/local/mnt/workspace/open-source/myown/efficient-transformers/qeff_models/gpt2/qpcs\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "===================== Compilation Done! 
=====================\n", + "\n" + ] + } + ], "source": [ "# Please use platform SDk to Check num_cores for your card.\n", "\n", @@ -145,17 +316,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "4711fc74-aa5d-4e20-af0e-0d461d2e19bb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 My name is John . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man \n", + "\n", + "===================== Performance Stats =====================\n", + "Prefill time a.k.a TTFT is= 0.01 s\n", + "Decode: 220.31 tok/s\n", + "E2E: 216.88 tok/s\n", + "Total (E2E) inference time is= 0.44 s\n", + "=============================================================\n" + ] + } + ], "source": [ - "from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size\n", + "from QEfficient.generation.text_generation_inference import get_compilation_batch_size\n", "\n", "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "batch_size = get_compilation_batch_size(generated_qpc_path)\n", - "cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])" + "QEfficient.cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])" ] } ], diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index 6d5204f55..023369a0f 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -150,12 +150,13 @@ "metadata": {}, "outputs": [], "source": [ - "from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size\n", + "from QEfficient.generation.text_generation_inference import get_compilation_batch_size\n", "\n", "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "\n", - "batch_size = get_compilation_batch_size(generated_qpc_path)cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])" + "batch_size = get_compilation_batch_size(generated_qpc_path)\n", + "QEfficient.cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])" ] } ], From 0c9dc74970f472c3f1f81255304c099e747edc80 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 4 Jun 2024 20:20:02 +0530 Subject: [PATCH 17/20] fixed tests Signed-off-by: Onkar Chougule --- tests/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils.py b/tests/utils.py index 18de39622..b7fb8a2a9 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -109,7 +109,7 @@ def export_onnx(model_kv, tokenizer, model_name, model_class): onnx_dir_path = os.path.join(QEFF_MODELS_DIR, model_name) base_path, onnx_model_path = 
qualcomm_efficient_converter( model_name=model_name, - model_kv=QEFFAutoModelForCausalLM(model=model_kv, pretrained_model_name_or_path=None), # type: ignore + model_kv=QEFFAutoModelForCausalLM(model=model_kv), # type: ignore tokenizer=tokenizer, onnx_dir_path=onnx_dir_path, kv=True, From df303fb34029a753cc5cf9ef9b87331ad13cf280 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 4 Jun 2024 20:43:12 +0530 Subject: [PATCH 18/20] clenaed notebook Signed-off-by: Onkar Chougule --- notebooks/QEfficientGPT2.ipynb | 206 ++------------------------------- 1 file changed, 10 insertions(+), 196 deletions(-) diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 40e7b6a21..a8661844a 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -18,53 +18,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "c21f82d5-17df-4fc9-a180-05edd032f02d", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr2/ochougul/.pyenv/versions/3.8.19/envs/py38/lib/python3.8/site-packages/transformers/utils/hub.py:123: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "gpt2 from hugging-face \n", - " GPT2LMHeadModel(\n", - " (transformer): GPT2Model(\n", - " (wte): Embedding(50257, 768)\n", - " (wpe): Embedding(1024, 768)\n", - " (drop): Dropout(p=0.1, inplace=False)\n", - " (h): ModuleList(\n", - " (0-11): 12 x GPT2Block(\n", - " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (attn): GPT2Attention(\n", - " (c_attn): Conv1D()\n", - " (c_proj): Conv1D()\n", - " (attn_dropout): Dropout(p=0.1, inplace=False)\n", - " (resid_dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (mlp): GPT2MLP(\n", - " (c_fc): Conv1D()\n", - " (c_proj): Conv1D()\n", - " (act): NewGELUActivation()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " )\n", - " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " )\n", - " (lm_head): Linear(in_features=768, out_features=50257, bias=False)\n", - ")\n" - ] - } - ], + "outputs": [], "source": [ "# Initiate the Orignal Transformer model\n", "import os\n", @@ -101,52 +58,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "a4543b94-9b50-4bcc-90c6-484ab694c9a6", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[33;20mWARNING - QEfficient - The model layers has been upadted to QEff layers in-place\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model after Optimized transformations \n", - " QEffGPT2LMHeadModel(\n", - " (transformer): QEffGPT2Model(\n", - " (wte): Embedding(50257, 768)\n", - " (wpe): Embedding(1024, 768)\n", - " (drop): Dropout(p=0.1, inplace=False)\n", - " (h): ModuleList(\n", - " (0-11): 12 x QEffGPT2Block(\n", - " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (attn): QEffGPT2Attention(\n", - " (c_attn): Conv1D()\n", - " (c_proj): Conv1D()\n", - " (attn_dropout): Dropout(p=0.1, inplace=False)\n", - " (resid_dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (mlp): GPT2MLP(\n", - " (c_fc): Conv1D()\n", - " (c_proj): 
Conv1D()\n", - " (act): NewGELUActivation()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " )\n", - " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " )\n", - " (lm_head): Linear(in_features=768, out_features=50257, bias=False)\n", - ")\n" - ] - } - ], + "outputs": [], "source": [ "import QEfficient\n", "\n", @@ -166,69 +81,10 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "3fb4d6dd-9973-4608-b68b-ec6825cfef0e", "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5e7fe36d84a24006ba52887588e9935a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Fetching 11 files: 0%| | 0/11 [00:00 0 else None,\n", - "[W export.cpp:565] Warning: Custom opset domain: 'com.qti.aisw.onnx' provided is not used in the model. Please verify custom opset domain names. (function GraphEncoder)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "============== Diagnostic Run torch.onnx.export version 2.0.0+cpu ==============\n", - "verbose: False, log level: Level.ERROR\n", - "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n", - "\n", - "\n", - "=============== PyTorch vs. fp32 ONNXRT (MAD) ===============\n", - "\n", - "logits \t\t 7.62939453125e-05\n", - "attention_mask_RetainedState \t\t 0.0\n", - "past_keys (mean) \t\t 2.635022004445394e-06\n", - "past_value (mean) \t\t 5.5730342864990234e-06\n", - "\n", - "=============================================================\n", - "\n", - "\n", - "=============== PyTorch vs. fp16 ONNXRT (MAD) ===============\n", - "\n", - "logits \t\t 7.62939453125e-05\n", - "attention_mask_RetainedState \t\t 0.0\n", - "past_keys (mean) \t\t 2.635022004445394e-06\n", - "past_value (mean) \t\t 5.5730342864990234e-06\n", - "\n", - "=============================================================\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "from QEfficient.utils import load_hf_tokenizer\n", "# We can now export the modified models to Onnx framework\n", @@ -263,37 +119,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "e48be5da-02a1-4d7e-9b5f-a6dcca141d4b", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Running AI 100 compiler: /opt/qti-aic/exec/qaic-exec -m=/local/mnt/workspace/open-source/myown/efficient-transformers/qeff_models/gpt2/onnx/gpt2_kv_clipped_fp16.onnx -aic-hw -aic-hw-version=2.0 -network-specialization-config=/local/mnt/workspace/open-source/myown/efficient-transformers/qeff_models/gpt2/specializations.json -convert-to-fp16 -retained-state -aic-num-cores=14 -custom-IO-list-file=/local/mnt/workspace/open-source/myown/efficient-transformers/qeff_models/gpt2/onnx/custom_io_fp16.yaml -compile-only -aic-binary-dir=/local/mnt/workspace/open-source/myown/efficient-transformers/qeff_models/gpt2/qpcs\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "===================== Compilation Done! 
=====================\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# Please use platform SDk to Check num_cores for your card.\n", "\n", @@ -316,25 +145,10 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "4711fc74-aa5d-4e20-af0e-0d461d2e19bb", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 My name is John . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man \n", - "\n", - "===================== Performance Stats =====================\n", - "Prefill time a.k.a TTFT is= 0.01 s\n", - "Decode: 220.31 tok/s\n", - "E2E: 216.88 tok/s\n", - "Total (E2E) inference time is= 0.44 s\n", - "=============================================================\n" - ] - } - ], + "outputs": [], "source": [ "from QEfficient.generation.text_generation_inference import get_compilation_batch_size\n", "\n", From abea97dafb62bc8c16216b67f68facc9ee845694 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Wed, 5 Jun 2024 15:27:22 +0530 Subject: [PATCH 19/20] *Added transfrom call within init *reanmed cross_compile *updated notebooks *updated README Signed-off-by: Onkar Chougule --- QEfficient/__init__.py | 2 +- .../{cross_compile.py => compile_helper.py} | 0 QEfficient/src/_transformers/auto.py | 16 +++++- QEfficient/transformers/transform.py | 2 +- README.md | 27 ++++------ notebooks/QEfficientGPT2.ipynb | 50 ++++++------------ notebooks/QEfficientMPT.ipynb | 52 ++++++------------- tests/utils.py | 2 +- 8 files changed, 58 insertions(+), 93 deletions(-) rename QEfficient/compile/{cross_compile.py => compile_helper.py} (100%) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 04e0f825c..ac6c1b629 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from QEfficient.compile.cross_compile import compile # noqa: F401 +from QEfficient.compile.compile_helper import compile # noqa: F401 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv # noqa: F401 from QEfficient.src import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader # noqa: F401 diff --git a/QEfficient/compile/cross_compile.py b/QEfficient/compile/compile_helper.py similarity index 100% rename from QEfficient/compile/cross_compile.py rename to QEfficient/compile/compile_helper.py diff --git a/QEfficient/src/_transformers/auto.py b/QEfficient/src/_transformers/auto.py index 40877ea99..de01a0840 100644 --- a/QEfficient/src/_transformers/auto.py +++ b/QEfficient/src/_transformers/auto.py @@ -26,11 +26,13 @@ class QEFFTransformersBase(QEFFBaseModel): """ Parent class for models QEFF provides from transformers i.e. (AutoModel, AutoModelForCausalLM, AutoModelForAudioClassification etc.) from src/transformers/models/auto/modeling_auto.py file. 
""" - def __init__(self, model: nn.Module) -> None: + def __init__(self, model: nn.Module, transform:bool = True) -> None: assert (model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values() or # FIXME: Use model architectures here instead of complete dictionary TransformersToQEffModulesDict model.__class__ in TransformersToQEffModulesDict.values()), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. {MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore self.model: nn.Module = model + if transform: + self.transform() def __repr__(self) -> str: return self.model.__repr__() @@ -41,8 +43,17 @@ def is_transformed(self) -> bool: @classmethod def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): + """ + This method accepts All the parameters that are acceptable by transformers.AutoModelForCausalLM. + There are few additional parameters that this method can take: + :param transform:bool. Whether to optimize model for KV retention; default is True. Pass False to get BertStyle model. + """ + transform: bool = kwargs.get("transform", True) + kwargs.update({"use_cache": True}) # Always pass use_cache = True, to get KV values as output during ONNX export + model = QEFFAutoModelToTransformersAutoModelMap[cls.__name__].from_pretrained(pretrained_model_name_or_path, *args, **kwargs) - return cls(model) + return cls(model, transform=transform) + def transform_export(self, *args, **kwargs) -> Any: raise NotImplementedError("Reached too far!!") @@ -51,6 +62,7 @@ def transform_export_compile(self, *args, **kwargs) -> Any: raise NotImplementedError("Reached too far!!") def transform(self): + # FIXME: break down transform into optmization passes i.e. HW specific optimization(RMSNorm), KV retention pass etc. QEfficient.transform(self) return self diff --git a/QEfficient/transformers/transform.py b/QEfficient/transformers/transform.py index 3a520d0ed..dfd0de5c7 100644 --- a/QEfficient/transformers/transform.py +++ b/QEfficient/transformers/transform.py @@ -11,7 +11,7 @@ import transformers from QEfficient.src.base import QEFFBaseModel -from QEfficient.src.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFCommonLoader +from QEfficient.src.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE from QEfficient.transformers.modeling_attn_mask_utils import ( QEffAttentionMaskConverter, _qeff_prepare_4d_attention_mask, diff --git a/README.md b/README.md index 89a02859c..fc429f88e 100644 --- a/README.md +++ b/README.md @@ -195,7 +195,7 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 **One argument, prompt or prompts_txt_file_path must be passed.
    ***Both save_fp32_onnx and save_fp16_onnx can't be false.
 
-### 1. Model download and transform
+### 1. Model download and optimization for Cloud AI 100
 
 Initialize QEfficient and transform the models, Check the list of supported architectures in the repo.
 
@@ -203,25 +203,19 @@ Initialize QEfficient and transform the models, Check the list of supported arch
 # Initiate the Orignal Transformer model
 import os
 
-
-import QEfficient
-from QEfficient import QEFFAutoModelForCausalLM
+from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM
 
 # Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.
 # os.environ["TRANSFORMERS_CACHE"] = "/local/mnt/workspace/hf_cache"
 
-#ROOT_DIR = os.path.dirname(os.path.abspath(""))
-#CACHE_DIR = os.path.join(ROOT_DIR, "tmp"), you can use a different location for just one model by passing this param as cache_dir in below API.
+# ROOT_DIR = os.path.dirname(os.path.abspath(""))
+# CACHE_DIR = os.path.join(ROOT_DIR, "tmp")  # You can use a different location for just one model by passing this param as cache_dir in the below API.
 
 # Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl
 model_name = "gpt2" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.
 
-qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, cache_dir=None)
-print(f"{model_name} from hugging-face \n", qeff_model)
-
-# Easy and minimal api to update the model
-model_transformed = QEfficient.transform(qeff_model, form_factor="cloud")
-print("Model after Optimized transformations \n", model_transformed)
+qeff_model = AutoModelForCausalLM.from_pretrained(model_name)
+print(f"{model_name} optimized for AI 100 \n", qeff_model)
 ```
 
 ### 2. ONNX export of transformed model
@@ -229,6 +223,7 @@ print("Model after Optimized transformations \n", model_transformed)
 use the qualcomm_efficient_converter API to export the KV transformed Model to ONNX and Verify on Torch.
 
 ```Python
+import QEfficient
 from QEfficient.utils import load_hf_tokenizer
 # We can now export the modified models to Onnx framework
 # This will generate single Onnx Model for both Prefill and Decode Variations which are optimized for
@@ -244,7 +239,7 @@ from QEfficient.utils import load_hf_tokenizer
 tokenizer = load_hf_tokenizer(model_name, use_cache=True)
 base_path, onnx_path = QEfficient.export(
     model_name=model_name,
-    model_kv=model_transformed,
+    model_kv=qeff_model,
     tokenizer=tokenizer,
     kv=True,
     form_factor="cloud",
@@ -261,7 +256,7 @@ Once, the model is exported, Compile the model on Cloud AI 100 and generate QPC.
 
 generated_qpc_path = QEfficient.compile(
     onnx_path=onnx_path,
-    num_cores=14,
+    num_cores=14,  # You can use `/opt/qti-aic/tools/qaic-util | grep "Nsp Total"` from the Apps SDK to check this.
    qpc_path=os.path.dirname(base_path),
     mxfp6=False,
     device_group=[0],
@@ -272,12 +267,12 @@ generated_qpc_path = QEfficient.compile(
 Benchmark the model on Cloud AI 100, run the infer API to print tokens and tok/sec
 
 ```Python
-from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size
+from QEfficient.generation.text_generation_inference import get_compilation_batch_size
 
 # post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100
 # We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach
 
 batch_size = get_compilation_batch_size(generated_qpc_path)
-cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=["My name is"])
+QEfficient.cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=["My name is"])
 ```
 
 End to End demo examples for various models are available in **notebooks** directory. Please check them out.
diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb
index a8661844a..3095c7044 100644
--- a/notebooks/QEfficientGPT2.ipynb
+++ b/notebooks/QEfficientGPT2.ipynb
@@ -13,7 +13,16 @@
    "id": "88eef7ea-3488-414c-9e36-e960abba30c9",
    "metadata": {},
    "source": [
-    "##### Download the OpenSource GPT2 based HuggingFace Model and Save in local *Cache* directory"
+    "##### Download the OpenSource GPT2 based HuggingFace Model and Save in local *Cache* directory\n",
+    "###### We Modify the GPT2 Classes using the Optimized Software Library to generate model for Cloud AI 100.\n",
+    "###### User can disable this optimization by passing `transform=False` in the `from_pretrained` call.\n",
+    "###### Here we generate models with below Optimizations:\n",
+    "\n",
+    "* RMS Norm Fixes for FP16 Overflows and Underflow\n",
+    "* Causal Mask Fix\n",
+    "* Handling FP16 Overflows.\n",
+    "* KV Cache (Retention Changes).\n",
+    "* Triu/Tril Ops support."
    ]
   },
   {
@@ -26,7 +35,7 @@
     "# Initiate the Orignal Transformer model\n",
     "import os\n",
     "\n",
-    "from QEfficient import QEFFAutoModelForCausalLM\n",
+    "from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n",
     "\n",
     "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n",
     "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n",
@@ -37,38 +46,8 @@
     "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n",
     "model_name = \"gpt2\" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n",
     "\n",
-    "qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)\n",
-    "print(f\"{model_name} from hugging-face \\n\", qeff_model)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a89dfa0a-d8fe-4472-bf00-55e563ae9058",
-   "metadata": {},
-   "source": [
-    "##### Now we Modify the GPT2 Classes using the Optimized Software Library to generate model for Cloud AI 100.\n",
-    "##### Here we generate models with below Optimizations:\n",
-    "\n",
-    "* RMS Norm Fixes for FP16 Overflows and Underflow\n",
-    "* Causal Mask Fix\n",
-    "* Handling FP16 Overflows.\n",
-    "* KV Cache (Retention Changes).\n",
-    "* Triu/Tril Ops support."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a4543b94-9b50-4bcc-90c6-484ab694c9a6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import QEfficient\n",
-    "\n",
-    "# Easy and minimal api to update the model\n",
-    "model_transformed = QEfficient.transform(qeff_model, form_factor=\"cloud\")\n",
-    "\n",
-    "print(\"Model after Optimized transformations \\n\", model_transformed)"
+    "qeff_model = AutoModelForCausalLM.from_pretrained(model_name)\n",
+    "print(f\"{model_name} optimized for AI 100 \\n\", qeff_model)"
    ]
   },
   {
@@ -86,6 +65,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import QEfficient\n",
     "from QEfficient.utils import load_hf_tokenizer\n",
     "# We can now export the modified models to Onnx framework\n",
     "# This will generate single Onnx Model for both Prefill and Decode Variations which are optimized for\n",
@@ -101,7 +81,7 @@
     "tokenizer = load_hf_tokenizer(model_name, use_cache=True)\n",
     "base_path, onnx_path = QEfficient.export(\n",
     "    model_name=model_name,\n",
-    "    model_kv=model_transformed,\n",
+    "    model_kv=qeff_model,\n",
     "    tokenizer=tokenizer,\n",
     "    kv=True,\n",
     "    form_factor=\"cloud\",\n",
diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb
index 023369a0f..15e84399a 100644
--- a/notebooks/QEfficientMPT.ipynb
+++ b/notebooks/QEfficientMPT.ipynb
@@ -13,7 +13,15 @@
    "id": "88eef7ea-3488-414c-9e36-e960abba30c9",
    "metadata": {},
    "source": [
-    "##### Download the OpenSource MPT based HuggingFace Model and Save in local *Cache* directory"
+    "##### Download the OpenSource MPT based HuggingFace Model and Save in local *Cache* directory\n",
+    "###### Now we Modify the MPT Classes using the Optimized Software Library to generate model for Cloud AI 100.\n",
+    "###### Here we generate models with below Optimizations:\n",
+    "\n",
+    "* RMS Norm Fixes for FP16 Overflows and Underflow\n",
+    "* Causal Mask Fix\n",
+    "* Handling FP16 Overflows.\n",
+    "* KV Cache (Retention Changes).\n",
+    "* Triu/Tril Ops support."
    ]
   },
   {
@@ -26,7 +34,7 @@
     "# Initiate the Orignal Transformer model\n",
     "import os\n",
     "\n",
-    "from QEfficient import QEFFAutoModelForCausalLM\n",
+    "from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n",
     "\n",
     "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n",
     "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n",
@@ -36,39 +44,8 @@
     "\n",
     "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n",
     "model_name = \"mosaicml/mpt-7b\" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n",
-    "qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)\n",
-    "print(f\"{qeff_model} from hugging-face \\n\", qeff_model)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a89dfa0a-d8fe-4472-bf00-55e563ae9058",
-   "metadata": {},
-   "source": [
-    "##### Now we Modify the MPT Classes using the Optimized Software Library to generate model for Cloud AI 100.\n",
-    "##### Here we generate models with below Optimizations:\n",
-    "\n",
-    "* RMS Norm Fixes for FP16 Overflows and Underflow\n",
-    "* Causal Mask Fix\n",
-    "* Handling FP16 Overflows.\n",
-    "* KV Cache (Retention Changes).\n",
-    "* Triu/Tril Ops support."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a4543b94-9b50-4bcc-90c6-484ab694c9a6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import QEfficient\n",
-    "\n",
-    "# Easy and minimal api to update the model\n",
-    "model_transformed = QEfficient.transform(qeff_model, form_factor=\"cloud\")\n",
-    "\n",
-    "model_transformed.eval()\n",
-    "print(\"Model after Optimized transformations \\n\", model_transformed)"
+    "qeff_model = AutoModelForCausalLM.from_pretrained(model_name)\n",
+    "print(f\"{model_name} optimized for AI 100 \\n\", qeff_model)"
    ]
   },
   {
@@ -86,6 +63,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import QEfficient\n",
     "from QEfficient.utils import load_hf_tokenizer\n",
     "\n",
     "# We have the utils to export the modified models to Onnx framework\n",
@@ -98,10 +76,10 @@
     "\n",
     "# We can generate both bertstyle and KV Style models with the flag \"kv\"\n",
     "# Bertstyle models do not have any optimization w.r.t KV cache changes and are unoptimized version.\n",
-    "tokenizer = load_hf_tokenizer(model_hf_path, use_cache=True, padding_side=\"left\")\n",
+    "tokenizer = load_hf_tokenizer(model_name, use_cache=True, padding_side=\"left\")\n",
     "base_path, onnx_path = QEfficient.export(\n",
     "    model_name=model_name,\n",
-    "    model_kv=model_transformed,\n",
+    "    model_kv=qeff_model,\n",
     "    tokenizer=tokenizer,\n",
     "    kv=True,\n",
     "    form_factor=\"cloud\",\n",
diff --git a/tests/utils.py b/tests/utils.py
index b7fb8a2a9..ace803f8f 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -11,7 +11,7 @@
 import unittest
 
 from QEfficient import QEFFAutoModelForCausalLM
-from QEfficient.compile.cross_compile import compile_kv_model_on_cloud_ai_100
+from QEfficient.compile.compile_helper import compile_kv_model_on_cloud_ai_100
 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
 from QEfficient.transformers.transform import transform_lm
 from QEfficient.utils import hf_download, load_hf_tokenizer

From 4ae5825a36d5116197632a434ffcc2eaa6605c23 Mon Sep 17 00:00:00 2001
From: Onkar Chougule
Date: Wed, 5 Jun 2024 23:31:54 +0530
Subject: [PATCH 20/20] addressed review comments

Signed-off-by: Onkar Chougule
---
 QEfficient/compile/compile_helper.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/QEfficient/compile/compile_helper.py b/QEfficient/compile/compile_helper.py
index 771d52f54..8b5272e8d 100644
--- a/QEfficient/compile/compile_helper.py
+++ b/QEfficient/compile/compile_helper.py
@@ -7,6 +7,7 @@
 
 import json
 import os
+import shutil
 import subprocess
 from typing import List, Tuple
 
@@ -42,7 +43,6 @@ def compile_kv_model_on_cloud_ai_100(
     device_group: List[int] = [0],
     **kwargs,
 ) -> Tuple[bool, str]:
-    import shutil
     if kwargs:
         # FIXME
         raise NotImplementedError("Can't handle extra compilation args now!")
@@ -101,7 +101,7 @@ def compile(
     onnx_path: str,
     qpc_path: str,
     num_cores: int,
-    device_group: List[int],
+    device_group: List[int],  # FIXME: use num_devices instead
     aic_enable_depth_first: bool = False,
     mos: int = -1,
     batch_size: int = 1,
@@ -116,12 +116,16 @@ def compile(
     Api() to compile the Onnx Model on Cloud AI 100 Platform with give config.
     ---------
     :param onnx_path: str. Generated Onnx Model Path.
-    :base_path: str. Base path for the generated models.
+    :param qpc_path: str. Path for saving compiled qpc binaries.
+    :num_cores: int. Number of cores to compile model on.
+    :device_group: List[int]. Used for finding number of devices to compile for.
+    :aic_enable_depth_first: bool.
Enables DFS with default memory size, disabled by default. + :mos: int. Effort level to reduce the on-chip memory. :batch_size: int. Batch size to compile the model for. :prompt_len: int. prompt len for the model to compile. :ctx_len: int. Maximum context length to compile the model. :mxfp6: bool. Enable compilation for MXFP6 precision - :num_cores: int. Number of cores to compile model on. default: 16 available option: [1 to 16] + :mxint8: Compress Present/Past KV to MXINT8 using CustomIO config, default is False. """ os.makedirs(qpc_path, exist_ok=True)
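
For quick reference, a call that exercises the parameters documented in the updated `compile()` docstring might look like the sketch below. The ONNX path, QPC directory, and parameter values are illustrative assumptions only, not part of the patch; substitute the paths produced by `QEfficient.export` on your setup.

```Python
import QEfficient

# Illustrative values only; replace the paths with the ones generated by QEfficient.export.
generated_qpc_path = QEfficient.compile(
    onnx_path="qeff_models/gpt2/onnx/gpt2_kv_clipped_fp16.onnx",  # assumed example path
    qpc_path="qeff_models/gpt2",     # compiled qpc binaries are written under this directory
    num_cores=14,                    # check available cores with qaic-util from the Apps SDK
    device_group=[0],                # compile for a single device
    batch_size=1,
    prompt_len=32,                   # assumed prefill length
    ctx_len=128,                     # assumed maximum context length
    mxfp6=True,                      # enable MXFP6 precision for weights
    mxint8=False,                    # keep present/past KV in FP16 rather than MXINT8
    aic_enable_depth_first=False,    # DFS with default memory size stays disabled
    mos=-1,                          # default: no extra effort to reduce on-chip memory
)
print(f"Compiled QPC available at: {generated_qpc_path}")
```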