diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 63ab9da92..ac6c1b629 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -5,20 +5,11 @@ # # ----------------------------------------------------------------------------- -import torch.nn as nn - -from QEfficient.transformers.modeling_utils import transform as transform_hf - - -def transform(model: nn.Module, type="Transformers", form_factor="cloud"): - """ - Low level apis in library - --------- - :param model: nn.Module. instance of nn.Module. - :type: str. Transformers | Diffusers, default : Transformers. - :form_factor: str. - """ - if type == "Transformers": - return transform_hf(model, form_factor) - else: - raise NotImplementedError +from QEfficient.compile.compile_helper import compile # noqa: F401 +from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter +from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv # noqa: F401 +from QEfficient.src import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader # noqa: F401 +from QEfficient.transformers.transform import transform # noqa: F401 + +# Users can use QEfficient.export for exporting models to ONNX +export = qualcomm_efficient_converter diff --git a/QEfficient/cloud/compile.py b/QEfficient/cloud/compile.py index d6003c35c..4e446de05 100644 --- a/QEfficient/cloud/compile.py +++ b/QEfficient/cloud/compile.py @@ -6,97 +6,7 @@ # ----------------------------------------------------------------------------- import argparse -import json -import os -from typing import List - -from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100 -from QEfficient.utils.logging_utils import logger - - -def create_and_dump_specializations(batch_size: int, prompt_len: int, ctx_len: int, path: str): - # Create - specializations = { - "specializations": [ - { - "batch_size": str(batch_size), - "seq_len": str(prompt_len), - "ctx_len": str(ctx_len), - }, - {"batch_size": str(batch_size), "seq_len": "1", "ctx_len": str(ctx_len)}, - ] - } - # Dump - with open(path, "w") as file: - json.dump(specializations, file, indent=4) - - -def main( - onnx_path: str, - qpc_path: str, - num_cores: int, - device_group: List[int], - aic_enable_depth_first: bool = False, - mos: int = -1, - batch_size: int = 1, - prompt_len: int = 32, - ctx_len: int = 128, - mxfp6: bool = True, - mxint8: bool = False, -) -> str: - """ - API to compile the ONNX model on Cloud AI 100 platform with given config. - --------- - :param onnx_path: str. Generated ONNX model path. - :qpc_path: str. Path of store compiled qpc binaries file - :num_cores: int. Number of cores to compile model on. Default: 16, available option: [1 to 16]. - :device_group: List[int]. Cloud AI 100 device ids (comma-separated) e.g. [0,1]. if devices > 1, it enable multiple card setup. - :aic_enable_depth_first: bool. If passed, this option will be enabled during compilation. Default=False. - :mos: int. Effort level to reduce the on-chip memory. Default=-1. - :batch_size: int. Batch size for model to compile. - :prompt_len: int. prompt len for the model to compile. - :ctx_len: int. Maximum context length for the model to compile. - :mxfp6: bool. Compress constant MatMul weights to MXFP6 E2M3, default is no compression. - :mxint8: bool. Compress Present/Past KV to MXINT8 using CustomIO config, default is False. - - Return: - Path of the QPC files. 
- """ - - os.makedirs(qpc_path, exist_ok=True) - specialization_json_path = os.path.join(qpc_path, "specializations.json") - create_and_dump_specializations( - batch_size=batch_size, prompt_len=prompt_len, ctx_len=ctx_len, path=specialization_json_path - ) - - # Select the customIO config based on the mx flag. - if mxint8: - custom_io_file_name = "custom_io_int8.yaml" - else: - custom_io_file_name = "custom_io_fp16.yaml" - - custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name) - - if not os.path.isfile(custom_io_file_path): - raise FileNotFoundError( - f"file {custom_io_file_path} needs to exist in the same directory as onnx model files. Please rerun infer/export Api" - ) - - _, qpc_path = compile_kv_model_on_cloud_ai_100( - onnx_path=onnx_path, - specializations_json=specialization_json_path, - num_cores=num_cores, - custom_io_path=custom_io_file_path, - base_path=qpc_path, - mxfp6=mxfp6, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - ) - - logger.info(f"Compiled QPC files can be found here: {qpc_path}") - return qpc_path - +import QEfficient if __name__ == "__main__": parser = argparse.ArgumentParser(description="Compilation script.") @@ -152,5 +62,7 @@ def main( default=-1, help=" Effort level to reduce the on-chip memory", ) + + # FIXME(ochougul): Allow extra compilation arguments args = parser.parse_args() - main(**vars(args)) + QEfficient.compile(**vars(args)) diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 2eb8386d3..4660a951a 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -6,15 +6,14 @@ # ----------------------------------------------------------------------------- import argparse -from typing import List -from huggingface_hub import login -from transformers import AutoTokenizer +from typing import List, Optional + from QEfficient.generation.text_generation_inference import ( check_batch_size_and_num_prompts, cloud_ai_100_exec_kv, get_compilation_batch_size, - ) -from QEfficient.utils import hf_download +) +from QEfficient.utils import load_hf_tokenizer from QEfficient.utils.constants import Constants @@ -22,10 +21,10 @@ def main( model_name: str, qpc_path: str, device_group: List[int], - prompt: str = None, - prompts_txt_file_path: str = None, - cache_dir: str = Constants.CACHE_DIR, - hf_token: str = None, + prompt: Optional[str] = None, # type: ignore + prompts_txt_file_path: Optional[str] = None, + cache_dir: Optional[str] = Constants.CACHE_DIR, + hf_token: Optional[str] = None, ): """ API to run the model on Cloud AI 100 platform. @@ -35,22 +34,14 @@ def main( :qpc_path: str. Path to the save generated binary file after compilation. :cache_dir: str. Cache dir to store the downloaded huggingface files. :hf_token: Huggingface token to access gated models. -<<<<<<< HEAD :device_group: List[int]. Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled. :prompts_txt_file_path: str. 
Path to txt file for multiple input prompts -======= ->>>>>>> bb46e21 (Added sphinx files in docs) """ - if hf_token is not None: - login(hf_token) - - # Download tokenizer along with model if it doesn't exist - model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"]) - tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left") + tokenizer = load_hf_tokenizer(model_name, cache_dir, hf_token) batch_size = get_compilation_batch_size(qpc_path) - prompt = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) + prompt: List[str] = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) # Execute cloud_ai_100_exec_kv( diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index ce49f49d6..25cf2700f 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -7,13 +7,12 @@ import argparse import os +from typing import Optional, Union -from huggingface_hub import login -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast -import QEfficient from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.utils import hf_download, onnx_exists +from QEfficient.utils import onnx_exists from QEfficient.utils.constants import Constants from QEfficient.utils.logging_utils import logger @@ -21,10 +20,40 @@ ROOT_DIR = os.path.dirname(os.path.abspath("")) +def get_onnx_model_path(model_name: str, cache_dir: str, tokenizer: Optional[Union[PreTrainedTokenizerFast, PreTrainedTokenizer]]=None, hf_token: Optional[str] = None): + """ + exports the model to onnx if pre-exported file is not found and returns onnx_model_path + """ + onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name) + if onnx_path_exists: + logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation") + else: + ################### + # hf model -> export + #################### + # Export to the Onnx + logger.info(f"Exporting Pytorch {model_name} model to ONNX...") + _, generated_onnx_model_path = qualcomm_efficient_converter( + model_name=model_name, + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + kv=True, + form_factor="cloud", + return_path=True, + hf_token=hf_token, + cache_dir=cache_dir + ) # type: ignore + logger.info(f"Generated Onnx_path {generated_onnx_model_path} \nOnnx_model_path {onnx_model_path} \nand Onnx_dir_path is {onnx_dir_path}") + assert ( + generated_onnx_model_path == onnx_model_path + ), f"ONNX files were generated at an unusual location, expected {onnx_model_path}, got {generated_onnx_model_path}" + return onnx_model_path + + def main( model_name: str, cache_dir: str, - hf_token: str = None, + hf_token: Optional[str] = None, ) -> None: """ ApI for exporting to ONNX Model. @@ -33,38 +62,7 @@ def main( :cache_dir: str. Cache dir to store the downloaded huggingface files. :hf_token: str. HuggingFace login token to access private repos. """ - onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name) - if onnx_path_exists: - logger.warning(f"Generated Onnx files found {onnx_model_path}! 
Please use Infer/Compile Apis()") - return - - if hf_token is not None: - login(hf_token) - model_hf_path = hf_download( - repo_id=model_name, - cache_dir=cache_dir, - ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], - ) - tokenizer = AutoTokenizer.from_pretrained( - model_hf_path, use_cache=True, padding_side="left", trust_remote_code=True - ) - model = AutoModelForCausalLM.from_pretrained(model_hf_path, use_cache=True) - - # Easy and minimal api to update the model to QEff. - QEfficient.transform(model, type="Transformers", form_factor="cloud") - print(f"Model after Optimized transformations {model}") - - # Export to the Onnx - print(f"Exporting to Pytorch {model_name} to Onnx") - base_path, onnx_path = qualcomm_efficient_converter( - model_kv=model, - model_name=model_name, - tokenizer=tokenizer, - kv=True, - form_factor="cloud", - return_path=True, - ) - print(f"Base Path is {base_path} and Onnx Model Path is : {onnx_path}") + get_onnx_model_path(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token) if __name__ == "__main__": diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 09c6ff3be..457678a7a 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -6,20 +6,17 @@ # ----------------------------------------------------------------------------- import argparse +import logging import os -from typing import List - -from huggingface_hub import login -from transformers import AutoModelForCausalLM, AutoTokenizer +from typing import List, Optional import QEfficient -from QEfficient.cloud.compile import main as compile -from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter +from QEfficient.cloud.export import get_onnx_model_path from QEfficient.generation.text_generation_inference import ( check_batch_size_and_num_prompts, cloud_ai_100_exec_kv, ) -from QEfficient.utils import hf_download, onnx_exists, qpc_exists +from QEfficient.utils import get_qpc_dir_name_infer, load_hf_tokenizer, qpc_exists from QEfficient.utils.constants import Constants from QEfficient.utils.logging_utils import logger @@ -34,19 +31,19 @@ def main( model_name: str, num_cores: int, - prompt: str = None, - prompts_txt_file_path: str = None, + prompt: Optional[str] = None, # type: ignore + prompts_txt_file_path: Optional[str] = None, aic_enable_depth_first: bool = False, mos: int = -1, cache_dir: str = Constants.CACHE_DIR, - hf_token: str = None, + hf_token: Optional[str] = None, batch_size: int = 1, prompt_len: int = 32, ctx_len: int = 128, mxfp6: bool = False, mxint8: bool = False, device_group: List[int] = [0], -): +) -> None: """ Inference command, the model will be downloaded from HF, optimized, compiled, executed on AIC. --------- @@ -63,120 +60,41 @@ def main( :mxfp6: bool. Enable compilation for MXFP6 precision. :device_group: List[int]. Cloud AI 100 device ids (comma-separated) e.g. [0,1]. if devices > 1, it enable multiple card setup. 
""" - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - os.makedirs(model_card_dir, exist_ok=True) - - qpc_base_dir_name = ( - f"qpc_{num_cores}cores_{batch_size}BS_{prompt_len}PL_{ctx_len}CL_{mos}MOS_" - + f"{len(device_group)}" - + "devices" - + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else "_fp16_mxint8" if mxint8 else "_fp16") - ) - - prompt = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) - - # Get tokenizer - if hf_token is not None: - login(hf_token) - model_hf_path = hf_download( - repo_id=model_name, - cache_dir=cache_dir, - ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], - ) - tokenizer = AutoTokenizer.from_pretrained( - model_hf_path, use_cache=True, padding_side="left", trust_remote_code=True - ) + qpc_base_dir_name = get_qpc_dir_name_infer(num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group) + prompt: List[str] = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) + tokenizer = load_hf_tokenizer(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token) qpc_path_exists, qpc_dir_path = qpc_exists(model_name, qpc_base_dir_name) + # Handle qpc generation if qpc_path_exists: - # execute - logger.info("Pre-compiled qpc found! Trying to execute with given prompt") - cloud_ai_100_exec_kv( - batch_size, - tokenizer=tokenizer, - qpc_path=qpc_dir_path, - device_id=device_group, - prompt=prompt, - ) - return - - onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name) - if onnx_path_exists: - # Compile -> execute - # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation - generated_qpc_path = compile( - onnx_path=onnx_model_path, - qpc_path=os.path.dirname(qpc_dir_path), - num_cores=num_cores, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, - mxfp6=mxfp6, - mxint8=mxint8, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - ) + logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! Executing with given prompt") + else: + # Handle onnx model generation + onnx_model_path = get_onnx_model_path(model_name, cache_dir, tokenizer, hf_token) + + ######### + # Compile + ######### + generated_qpc_path = QEfficient.compile( + onnx_path=onnx_model_path, + qpc_path=os.path.dirname(qpc_dir_path), # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation + num_cores=num_cores, + batch_size=batch_size, + prompt_len=prompt_len, + ctx_len=ctx_len, + mxfp6=mxfp6, + mxint8=mxint8, + aic_enable_depth_first=aic_enable_depth_first, + mos=mos, + device_group=device_group, + ) assert ( - generated_qpc_path == qpc_dir_path - ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" - cloud_ai_100_exec_kv( - batch_size, - tokenizer=tokenizer, - qpc_path=qpc_dir_path, - device_id=device_group, - prompt=prompt, - ) - return - - ############################################# - # hf model -> export -> compile -> execute - ############################################# - model_hf = AutoModelForCausalLM.from_pretrained(model_hf_path, use_cache=True) - # Easy and minimal api to update the model to QEff. 
- model_transformed = QEfficient.transform(model_hf, type="Transformers", form_factor="cloud") - logger.info(f"Model after Optimized transformations {model_transformed}") - - # Export to the Onnx - logger.info(f"Exporting to Pytorch {model_name} to ONNX...") - base_path, generated_onnx_path = qualcomm_efficient_converter( - model_kv=model_transformed, - onnx_dir_path=onnx_dir_path, - model_name=model_name, - kv=True, - form_factor="cloud", - return_path=True, - tokenizer=tokenizer, - ) - print( - f"Generated Onnx_path {generated_onnx_path} and Onnx_model_path {onnx_model_path} and Onnx_dir_path is {onnx_dir_path}" - ) - assert ( - generated_onnx_path == onnx_model_path - ), f"ONNX files were generated at an unusual location, expected {onnx_model_path}, got {generated_onnx_path}" - logger.info(f"Base Path is {base_path} and Onnx Model Path is : {generated_onnx_path}") - - # Compile - # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation - generated_qpc_path = compile( - onnx_path=onnx_model_path, - qpc_path=os.path.dirname(qpc_dir_path), - num_cores=num_cores, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, - mxfp6=mxfp6, - mxint8=mxint8, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - ) - assert ( - qpc_dir_path == generated_qpc_path - ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" - logger.info(f"Compiled qpc files can be found at : {generated_qpc_path}") - + generated_qpc_path == qpc_dir_path + ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" + + ######### # Execute + ######### cloud_ai_100_exec_kv( batch_size, tokenizer=tokenizer, @@ -247,6 +165,15 @@ def main( default=-1, help="Effort level to reduce the on-chip memory", ) + #FIXME: Add verbose feature + parser.add_argument( + "--verbose","-v", + action="store_true", + help="pass to print info logs", + ) args = parser.parse_args() + if args.verbose: + logger.setLevel(logging.INFO) + del args.verbose # type: ignore main(**args.__dict__) diff --git a/QEfficient/compile/compile_helper.py b/QEfficient/compile/compile_helper.py new file mode 100644 index 000000000..8b5272e8d --- /dev/null +++ b/QEfficient/compile/compile_helper.py @@ -0,0 +1,163 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import json +import os +import shutil +import subprocess +from typing import List, Tuple + +from QEfficient.utils.logging_utils import logger + + +def create_and_dump_specializations(batch_size: int, prompt_len: int, ctx_len: int, path: str): + # Create + specializations = { + "specializations": [ + { + "batch_size": str(batch_size), + "seq_len": str(prompt_len), + "ctx_len": str(ctx_len), + }, + {"batch_size": str(batch_size), "seq_len": "1", "ctx_len": str(ctx_len)}, + ] + } + # Dump + with open(path, "w") as file: + json.dump(specializations, file, indent=4) + + +def compile_kv_model_on_cloud_ai_100( + onnx_path: str, + specializations_json: str, + num_cores: int, + base_path: str, + mxfp6: bool, + custom_io_path: str, + aic_enable_depth_first: bool, + mos: int = -1, + device_group: List[int] = [0], + **kwargs, +) -> Tuple[bool, str]: + if kwargs: + # FIXME + raise NotImplementedError("Can't handle extra compilation args now!") + aic_binary_dir = os.path.join(base_path, "qpcs") + + if os.path.isdir(aic_binary_dir): + shutil.rmtree(aic_binary_dir) + + assert os.path.isfile( + specializations_json + ), f"Please use 'QEfficient.compile', as {specializations_json} file was not found" + assert os.path.isfile(custom_io_path), f"{custom_io_path} file was not found!" + command = [ + "/opt/qti-aic/exec/qaic-exec", + f"-m={onnx_path}", + "-aic-hw", + "-aic-hw-version=2.0", + f"-network-specialization-config={specializations_json}", + "-convert-to-fp16", + "-retained-state", + f"-aic-num-cores={num_cores}", + f"-custom-IO-list-file={custom_io_path}", + "-compile-only", + f"-aic-binary-dir={aic_binary_dir}", + ] + if mxfp6: + command.append("-mxfp6-matmul") + if mos > 0: + command.append(f"-mos={mos}") + if aic_enable_depth_first: + command.append("-aic-enable-depth-first") + if len(device_group) > 1: + mdp_ts_config = { + "connections": [{"devices": list(range(len(device_group))), "type": "p2p"}], + "partitions": [ + { + "name": "Partition0", + "devices": [{"deviceId": device, "numCores": num_cores} for device in range(len(device_group))], + } + ], + } + mdp_ts_config_path = os.path.join(base_path, "mdp_ts_config.json") + with open(mdp_ts_config_path, "w") as file: + json.dump(mdp_ts_config, file, indent=4) + command.append(f"-mdp-load-partition-config={mdp_ts_config_path}") + print("Running AI 100 compiler:", " ".join(command)) + result = subprocess.run(command, capture_output=True, text=True) + if result.returncode != 0: + raise RuntimeError(f"Compilation Failed!!\n\nSTDOUT\n{result.stdout}\n\nSTDERR\n{result.stderr}") + + print("\n===================== Compilation Done! =====================\n") + return result.returncode == 0, aic_binary_dir + + +def compile( + onnx_path: str, + qpc_path: str, + num_cores: int, + device_group: List[int], # FIXME: use num_devices instead + aic_enable_depth_first: bool = False, + mos: int = -1, + batch_size: int = 1, + prompt_len: int = 32, + ctx_len: int = 128, + mxfp6: bool = True, + mxint8: bool = False, + **kwargs +) -> str: + # Dynamically create the specializations JSON + """ + Api() to compile the Onnx Model on Cloud AI 100 Platform with give config. + --------- + :param onnx_path: str. Generated Onnx Model Path. + :param qpc_path: str. Path for saving compiled qpc binaries. + :num_cores: int. Number of cores to compile model on. + :device_group: List[int]. Used for finding number of devices to compile for. 
+ :aic_enable_depth_first: bool. Enables DFS with default memory size, disabled by default. + :mos: int. Effort level to reduce the on-chip memory. + :batch_size: int. Batch size to compile the model for. + :prompt_len: int. prompt len for the model to compile. + :ctx_len: int. Maximum context length to compile the model. + :mxfp6: bool. Enable compilation for MXFP6 precision + :mxint8: Compress Present/Past KV to MXINT8 using CustomIO config, default is False. + """ + + os.makedirs(qpc_path, exist_ok=True) + specialization_json_path = os.path.join(qpc_path, "specializations.json") + create_and_dump_specializations( + batch_size=batch_size, prompt_len=prompt_len, ctx_len=ctx_len, path=specialization_json_path + ) + + # Select the customIO config based on the mx flag. + if mxint8: + custom_io_file_name = "custom_io_int8.yaml" + else: + custom_io_file_name = "custom_io_fp16.yaml" + + custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name) + + if not os.path.isfile(custom_io_file_path): + raise FileNotFoundError( + f"file {custom_io_file_path} needs to exist in the same directory as onnx model files. Please rerun infer/export Api" + ) + + _, qpc_path = compile_kv_model_on_cloud_ai_100( + onnx_path=onnx_path, + specializations_json=specialization_json_path, + num_cores=num_cores, + custom_io_path=custom_io_file_path, + base_path=qpc_path, + mxfp6=mxfp6, + aic_enable_depth_first=aic_enable_depth_first, + mos=mos, + device_group=device_group, + ) + + logger.info(f"Compiled QPC files can be found here: {qpc_path}") + return qpc_path diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index b598b3108..535fa8095 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -7,31 +7,31 @@ import os import shutil +from typing import Optional, Tuple, Union import torch -from huggingface_hub import login -from transformers import AutoTokenizer +from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast +import QEfficient from QEfficient.exporter.export_utils import export_onnx, fix_onnx_fp16, generate_input_files, run_model_on_ort -from QEfficient.transformers.modeling_utils import transform -from QEfficient.utils import hf_download +from QEfficient.src._transformers.auto import QEFFAutoModelForCausalLM +from QEfficient.src.base import QEFFBaseModel +from QEfficient.src.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFCommonLoader +from QEfficient.utils._utils import load_hf_tokenizer from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants from QEfficient.utils.logging_utils import logger def convert_to_cloud_bertstyle( model_name: str, - model_class: type = None, - tokenizer=None, - onnx_dir_path=None, - hf_token: str = None, - seq_len: int = Constants.seq_length, - input_str: str = Constants.input_str, - return_path: bool = False, - save_fp32_onnx: bool = False, - save_fp16_onnx: bool = True, -) -> str: - + qeff_model: QEFFAutoModelForCausalLM, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + onnx_dir_path: str, + seq_len: int, + return_path: bool, + save_fp32_onnx: bool, + save_fp16_onnx: bool, +): """ API to convert model to Bertstyle approach. Bertstyle Approach: @@ -56,12 +56,6 @@ def convert_to_cloud_bertstyle( Return: Path of exported ONNX file. 
""" - # todo (amitraj) Optimize the onnx export - if onnx_dir_path is None: - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - onnx_dir_path = os.path.join(model_card_dir, "onnx_bertstyle") - - # Check if ONNX already exist if os.path.exists(onnx_dir_path): logger.warning(f"Overriding {onnx_dir_path}") shutil.rmtree(onnx_dir_path) @@ -69,37 +63,29 @@ def convert_to_cloud_bertstyle( if not (save_fp32_onnx or save_fp16_onnx): raise AttributeError("save_fp32_onnx and save_fp16_onnx can't be false") - seq_len = Constants.seq_length - input_str = Constants.input_str - - # Load tokenizer - if tokenizer is None: - tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", trust_remote_code=True) - else: - if tokenizer.padding_side != "left": - logger.warning("Please use padding_side='left' while initializing the tokenizer") - tokenizer.padding_side = "left" + if tokenizer.padding_side != "left": + logger.warning("Please use padding_side='left' while initializing the tokenizer") + tokenizer.padding_side = "left" - if tokenizer.pad_token_id is None: + if tokenizer.pad_token_id is None: tokenizer.pad_token_id = tokenizer.eos_token_id - try: - if hf_token: - login(hf_token) - model_hf_path = hf_download( - repo_id=model_name, - cache_dir=Constants.CACHE_DIR, - ignore_pattrens=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"], - ) - model = model_class.from_pretrained(model_hf_path, cache_dir=Constants.CACHE_DIR, use_cache=True) - except Exception as e: - print(f"Failed to download the {model_name} model from Huggingface:%s", e) - model.eval() - # Decide path for saving exported ONNX files. + fp32_model_name, fp16_model_name = export_bertstyle_model_to_onnx(model_name, qeff_model.model, tokenizer, onnx_dir_path, seq_len, save_fp32_onnx, save_fp16_onnx) # type: ignore + + # return the model path for automation. + if return_path: + if save_fp16_onnx: + return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp16_model_name}.onnx") + else: + return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp32_model_name}.onnx") + + +def export_bertstyle_model_to_onnx(model_name, model, tokenizer, onnx_dir_path, seq_len, save_fp32_onnx, save_fp16_onnx): model_base_name = model_name.replace("/", "_") + "_bertstyle" os.makedirs(onnx_dir_path, exist_ok=True) + input_str = Constants.input_str # Preprocess inputs if seq_len > 0: if tokenizer.pad_token_id is None: @@ -178,30 +164,20 @@ def convert_to_cloud_bertstyle( inputs=inputs, input_list_file=input_list_file, ) - - # return the model path for automation. - if return_path: - if save_fp16_onnx: - return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp16_model_name}.onnx") - else: - return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp32_model_name}.onnx") - else: - return + + return fp32_model_name,fp16_model_name def convert_to_cloud_kvstyle( model_name: str, - model_class: type = None, - model_kv: torch.nn.Module = None, - tokenizer=None, - onnx_dir_path=None, - hf_token: str = None, - seq_len: int = Constants.seq_length, - input_str: str = Constants.input_str, - return_path: bool = False, - save_fp32_onnx: bool = False, - save_fp16_onnx: bool = True, -) -> str: + qeff_model: QEFFAutoModelForCausalLM, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + onnx_dir_path: str, + seq_len: int, + return_path: bool, + save_fp32_onnx: bool, + save_fp16_onnx: bool, +): """ API change model for kv retention and export to ONNX. 
KV Style Approach- @@ -226,58 +202,45 @@ def convert_to_cloud_kvstyle( Returns: Path of exported ONNX file. """ - if onnx_dir_path is None: - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - onnx_dir_path = os.path.join(model_card_dir, "onnx") - if os.path.exists(onnx_dir_path): logger.warning(f"Overriding {onnx_dir_path}") shutil.rmtree(onnx_dir_path) if not (save_fp32_onnx or save_fp16_onnx): raise AttributeError("save_fp32_onnx and save_fp16_onnx can't be false") + - if model_class is None and model_kv is None: - raise AttributeError("model_class and model_kv both can't be None") + if tokenizer.padding_side != "left": + logger.warning("Please use padding_side='left' while initializing the tokenizer") + tokenizer.padding_side = "left" - if model_kv is not None: - if not getattr(model_kv, "qeff_transformed", False): - raise AttributeError( - "Model is not transformed, Please first use QEfficient.transform to transform the model." - ) - model = model_kv - else: - try: - if hf_token: - login(hf_token) - model_hf_path = hf_download( - repo_id=model_name, - cache_dir=Constants.CACHE_DIR, - ignore_pattrens=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"], - ) - model = model_class.from_pretrained(model_hf_path, cache_dir=Constants.CACHE_DIR, use_cache=True) - except Exception as e: - print(f"Failed to download the {model_name} model from Huggingface:%s", e) - transform(model, form_factor="cloud") + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id + + assert qeff_model.is_transformed, f"please pass the {qeff_model.__class__.__name__} after transform API" # Decide path for saving exported ONNX files. - model_base_name = model_name.replace("/", "_") + "_kv" - os.makedirs(onnx_dir_path, exist_ok=True) + fp32_model_name, fp16_model_name = export_kvstyle_transformed_model_to_onnx(model_name, qeff_model.model, tokenizer, onnx_dir_path, seq_len, save_fp32_onnx, save_fp16_onnx) # type: ignore - # Load tokenizer - if tokenizer is None: - # todo(ochougul): use cache dir from snapshot download - tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left") - else: - if tokenizer.padding_side != "left": - logger.warning("Please use padding_side='left' while initializing the tokenizer") - tokenizer.padding_side = "left" + # return the model path for automation. 
+ if return_path: + if save_fp16_onnx: + return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp16_model_name}.onnx") + else: + return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp32_model_name}.onnx") - if tokenizer.pad_token_id is None: - tokenizer.pad_token_id = tokenizer.eos_token_id + +def export_kvstyle_transformed_model_to_onnx(model_name: str, transformed_model: torch.nn.Module, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + onnx_dir_path: str, seq_len: int, save_fp32_onnx: Optional[bool] = False, save_fp16_onnx: Optional[bool] = True): + + if tokenizer.padding_side != "left": + logger.warning("Please use padding_side='left' while initializing the tokenizer") + tokenizer.padding_side = "left" + + tokenizer.pad_token_id = tokenizer.eos_token_id if tokenizer.pad_token_id is None else tokenizer.pad_token_id # Disabling requires_grad on all parameters - for j, p in enumerate(model.parameters()): + for j, p in enumerate(transformed_model.parameters()): p.requires_grad_(False) # Preprocess inputs @@ -303,11 +266,10 @@ def convert_to_cloud_kvstyle( else: inputs = tokenizer(input_str, return_tensors="pt") - try: - pt_outputs = model(**inputs) - output_names = list(pt_outputs.keys()) - except Exception as e: - print(f"Model {model_name} Execution failed in pytorch:%s", e) + + pt_outputs = transformed_model(**inputs) + output_names = list(pt_outputs.keys()) + # Raise error if expected outputs are not present assert "logits" in output_names, "logits not found in output" @@ -325,11 +287,9 @@ def convert_to_cloud_kvstyle( inputs["past_key_values"] = tuple([(key.detach(), value.detach()) for key, value in pt_outputs.past_key_values]) # Run PyTorch inference with past - try: - pt_outputs = model(**inputs) - output_names = list(pt_outputs.keys()) - except Exception as e: - print(f"Model {model_name} Execution failed in pytorch:%s", e) + pt_outputs = transformed_model(**inputs) + output_names = list(pt_outputs.keys()) + # Add pkv into output_names pkv = tuple([(key.detach(), value.detach()) for key, value in pt_outputs.past_key_values]) @@ -344,9 +304,12 @@ def convert_to_cloud_kvstyle( pt_outputs[f"past_key.{i}_RetainedState"] = key pt_outputs[f"past_value.{i}_RetainedState"] = value + + model_base_name = model_name.replace("/", "_") + "_kv" + os.makedirs(onnx_dir_path, exist_ok=True) # Export and simplify ONNX model fp32_model_name = export_onnx( - pt_model=model, + pt_model=transformed_model, inputs=inputs, output_names=output_names, gen_models_path=onnx_dir_path, @@ -405,39 +368,93 @@ def convert_to_cloud_kvstyle( inputs=inputs, input_list_file=input_list_file, ) + + return fp32_model_name, fp16_model_name + + +def export_for_cloud(model_name: str, qeff_model: QEFFBaseModel, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + onnx_dir_path: str, seq_length: int = Constants.seq_length, + return_path: bool = True, + save_fp32_onnx: bool = False, + save_fp16_onnx: bool = True)-> Tuple[str, str]: + # FIXME: move all this to class instead of here, and just call qeff_model.export here. 
+ if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: # type: ignore + return export_lm_model_for_cloud(model_name=model_name, + qeff_model=qeff_model, # type: ignore + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + seq_length=seq_length, + return_path=return_path, + save_fp16_onnx=save_fp16_onnx, + save_fp32_onnx=save_fp32_onnx) + else: + raise NotImplementedError(f"Only model type {QEFFAutoModelForCausalLM.__class__.__name__} is supported for export, got {type(qeff_model)}") + + +def export_lm_model_for_cloud(model_name:str, qeff_model: QEFFAutoModelForCausalLM, + tokenizer:Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + onnx_dir_path: str, seq_length: int, return_path:bool, + save_fp32_onnx:bool, save_fp16_onnx: bool): + if os.path.exists(onnx_dir_path): + logger.warning(f"Overriding {onnx_dir_path}") + shutil.rmtree(onnx_dir_path) + if not (save_fp32_onnx or save_fp16_onnx): + raise AttributeError("save_fp32_onnx and save_fp16_onnx can't be false") + + if tokenizer.padding_side != "left": + logger.warning("Please use padding_side='left' while initializing the tokenizer") + tokenizer.padding_side = "left" + + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id + + + if qeff_model.is_transformed: + fp32_model_name, fp16_model_name = export_kvstyle_transformed_model_to_onnx( + model_name=model_name, + transformed_model=qeff_model.model, + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + seq_len=seq_length, + save_fp32_onnx=save_fp32_onnx, + save_fp16_onnx=save_fp16_onnx) # type: ignore + + else: + fp32_model_name, fp16_model_name = export_bertstyle_model_to_onnx( + model_name=model_name, + model=qeff_model.model, + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + seq_len=seq_length, + save_fp32_onnx=save_fp32_onnx, + save_fp16_onnx=save_fp16_onnx) # type: ignore + + # return the model path for automation. if return_path: if save_fp16_onnx: return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp16_model_name}.onnx") else: return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp32_model_name}.onnx") - else: - return - - -def convert_to_edge(self) -> None: - # [TODO]: Apply the class transformation to make changes for the KV models in edge use cases - # model = QEfficient.transform(model_hf, type="Transformers", form_factor="edge") - # model.eval() - raise NotImplementedError("Oops...reached too far!!") def qualcomm_efficient_converter( model_name: str, - model_class: type = None, - model_kv: torch.nn.Module = None, - tokenizer=None, - onnx_dir_path=None, - hf_token: str = "", + model_kv: QEFFBaseModel = None, # type: ignore + tokenizer: Optional[Union[PreTrainedTokenizer, PreTrainedTokenizerFast]]=None, + cache_dir: Optional[str] = None, + onnx_dir_path: Optional[str]=None, + hf_token: Optional[str] = None, seq_length: int = Constants.seq_length, - input_str: str = Constants.input_str, kv: bool = True, - return_path: bool = False, - form_factor="cloud", + return_path: bool = True, + form_factor: str="cloud", save_fp32_onnx: bool = False, save_fp16_onnx: bool = True, -) -> str: +) -> Tuple[str, str]: + """ API to convert torch Bert style and KV style model to ONNX. --------- @@ -458,36 +475,34 @@ def qualcomm_efficient_converter( Returns: Path of exported ONNX file. 
""" - if model_kv is not None and not kv: - raise AttributeError("For transformed model kv must be True") + # Get model_kv first + model_kv = model_kv if model_kv else QEFFCommonLoader.from_pretrained(pretrained_model_name_or_path=model_name, hf_token=hf_token, cache_dir=cache_dir) + + # Transform if required + if model_kv.is_transformed and not kv: + raise AttributeError("Transformed model is passed while requsting to convert non-transformed model") + + model_kv = model_kv if model_kv.is_transformed else QEfficient.transform(model_kv) if kv else model_kv + if onnx_dir_path is None: + model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) + onnx_dir_path = os.path.join(model_card_dir, "onnx") + + # Load tokenizer if not passed + tokenizer = tokenizer if tokenizer else load_hf_tokenizer(model_name=model_name, hf_token=hf_token, cache_dir=cache_dir) + if form_factor == "cloud": - if kv: - return convert_to_cloud_kvstyle( - model_name=model_name, - model_class=model_class, - model_kv=model_kv, - onnx_dir_path=onnx_dir_path, - tokenizer=tokenizer, - hf_token=hf_token, - seq_len=seq_length, - input_str=input_str, - return_path=return_path, - save_fp32_onnx=save_fp32_onnx, - save_fp16_onnx=save_fp16_onnx, - ) - else: - return convert_to_cloud_bertstyle( - model_name=model_name, - model_class=model_class, - tokenizer=tokenizer, - onnx_dir_path=onnx_dir_path, - hf_token=hf_token, - seq_len=seq_length, - input_str=input_str, - return_path=return_path, - save_fp32_onnx=save_fp32_onnx, - save_fp16_onnx=save_fp16_onnx, - ) + return export_for_cloud( + model_name=model_name, + qeff_model=model_kv, + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + seq_length=seq_length, + return_path=return_path, + save_fp16_onnx=save_fp16_onnx, + save_fp32_onnx=save_fp32_onnx) else: - return convert_to_edge() + # [TODO]: Apply the class transformation to make changes for the KV models in edge use cases + # model = QEfficient.transform(model_hf, type="Transformers", form_factor="edge") + # model.eval() + raise NotImplementedError("Oops! Reached too far!!") diff --git a/QEfficient/exporter/export_utils.py b/QEfficient/exporter/export_utils.py index 324bcc092..fc71c9747 100644 --- a/QEfficient/exporter/export_utils.py +++ b/QEfficient/exporter/export_utils.py @@ -5,12 +5,10 @@ # # ----------------------------------------------------------------------------- -import json import os import shutil -import subprocess import sys -from logging import error, info +from logging import info from typing import Dict, List, Tuple, Union import numpy as np @@ -95,8 +93,8 @@ def export_onnx( custom_opsets={"com.qti.aisw.onnx": 1}, ) except Exception as e: - error("Exporting to ONNX failed. {}".format(e)) - return + raise RuntimeError("Exporting to ONNX failed. {}".format(e)) + onnx.checker.check_model(f"{gen_models_path}_tmp/{model_base_name}.onnx") loaded_model = onnx.load(f"{gen_models_path}_tmp/{model_base_name}.onnx") @@ -337,7 +335,7 @@ def generate_input_files( fp.write(",".join(filenames)) fp.write("\n") - +# FIXME(ochougul/quic-mamta): Remove duplication with APIRunner def run_model_on_ort( onnx_path: str, inputs: Dict[str, torch.Tensor], @@ -396,121 +394,3 @@ def run_model_on_ort( print(f"Failed to run the onnx {onnx_path} model in onnx runtime:%s", e) print("\n=============================================================\n") return input_names, None - - -def run_model_on_cloud_ai_100( - onnx_path: str, - onnx_symbol_defs: Dict[str, int] = {}, - **kwargs, -) -> bool: - - """ - API to run model on Cloud AI 100. 
- --------- - :param onnx_path: str. Path of the ONNX file. - :ONNX_symbol_defs: Dict[str, int] = {}. Custom ONNX symbols definition. - - Return: - True if model run successfully on Cloud AI 100. - """ - - args = [ - "/opt/qti-aic/exec/qaic-exec", - f"-m={onnx_path}", - "-aic-hw", - "-aic-hw-version=2.0", - ] - for onnx_symbol, onnx_def in onnx_symbol_defs.items(): - args.append(f"-onnx-define-symbol={onnx_symbol},{onnx_def}") - for k, v in kwargs.items(): - k = k.replace("_", "-") - if isinstance(v, bool): - if v: - args.append(f"-{k}") - continue - args.append(f"-{k}={v}") - - info("Running compiler:", " ".join(args)) - result = subprocess.run(args) - return result.returncode == 0 - - -def compile_kv_model_on_cloud_ai_100( - onnx_path: str, - specializations_json: str, - num_cores: int, - base_path: str, - mxfp6: bool, - custom_io_path: str, - aic_enable_depth_first: bool, - mos: int = -1, - device_group: List[int] = [0], - **kwargs, -) -> bool: - - """ - API to compile model Cloud AI 100. - --------- - :param onnx_path: str. Path of the ONNX file. - :specializations_json: str. Path of specializations.json file. - :num_cores: int. Number of cores to use during compilation. - :base_path: str. Path where intermediate files and compiled artifacts will be stored. - :mxfp6: bool. If true, it enables MXFP6 (Mixed Precision Floating Point 6) mode during compilation. - :custom_io_path: Path to a custom I/O configuration file. - :aic_enable_depth_first: bool. If true, it enables during compilation. - :mos: int. Effort level to reduce the on-chip memory. - :device_group: List[int]. List of device group IDs. - - Return: - True if model compiled successfully. - """ - - aic_binary_dir = os.path.join(base_path, "qpcs") - - if os.path.isdir(aic_binary_dir): - shutil.rmtree(aic_binary_dir) - - assert os.path.isfile( - specializations_json - ), f"Please use 'from QEfficient.cloud.compile import main as compile', as {specializations_json} file was not found" - assert os.path.isfile(custom_io_path), f"{custom_io_path} file was not found!" - command = [ - "/opt/qti-aic/exec/qaic-exec", - f"-m={onnx_path}", - "-aic-hw", - "-aic-hw-version=2.0", - f"-network-specialization-config={specializations_json}", - "-convert-to-fp16", - "-retained-state", - f"-aic-num-cores={num_cores}", - f"-custom-IO-list-file={custom_io_path}", - "-compile-only", - f"-aic-binary-dir={aic_binary_dir}", - ] - if mxfp6: - command.append("-mxfp6-matmul") - if mos > 0: - command.append(f"-mos={mos}") - if aic_enable_depth_first: - command.append("-aic-enable-depth-first") - if len(device_group) > 1: - mdp_ts_config = { - "connections": [{"devices": list(range(len(device_group))), "type": "p2p"}], - "partitions": [ - { - "name": "Partition0", - "devices": [{"deviceId": device, "numCores": num_cores} for device in range(len(device_group))], - } - ], - } - mdp_ts_config_path = os.path.join(base_path, "mdp_ts_config.json") - with open(mdp_ts_config_path, "w") as file: - json.dump(mdp_ts_config, file, indent=4) - command.append(f"-mdp-load-partition-config={mdp_ts_config_path}") - print("Running AI 100 compiler:", " ".join(command)) - result = subprocess.run(command, capture_output=True, text=True) - if result.returncode != 0: - raise RuntimeError(f"Compilation Failed!!\n\nSTDOUT\n{result.stdout}\n\nSTDERR\n{result.stderr}") - - print("\n===================== Compilation Done! 
=====================\n") - return result.returncode == 0, aic_binary_dir diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 97200dfca..2225750a3 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -117,7 +117,7 @@ def get_compilation_batch_size(qpc_path: str): return compilation_batch_size -def check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size): +def check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) -> List[str]: assert ( prompt is not None or prompts_txt_file_path is not None ), "Please pass atleast one argument either using --prompt or --prompts_txt_file_path" diff --git a/QEfficient/src/__init__.py b/QEfficient/src/__init__.py new file mode 100644 index 000000000..854686567 --- /dev/null +++ b/QEfficient/src/__init__.py @@ -0,0 +1,9 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +from QEfficient.src._transformers.auto import QEffAutoModel, QEFFAutoModelForCausalLM # noqa: F401 +from QEfficient.src.common import QEFFCommonLoader # noqa: F401 diff --git a/QEfficient/src/_transformers/__init__.py b/QEfficient/src/_transformers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/QEfficient/src/_transformers/auto.py b/QEfficient/src/_transformers/auto.py new file mode 100644 index 000000000..de01a0840 --- /dev/null +++ b/QEfficient/src/_transformers/auto.py @@ -0,0 +1,92 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- + +from typing import Any + +import torch.nn as nn +from transformers.models.auto import AutoModel, AutoModelForCausalLM +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING + +import QEfficient +from QEfficient.src.base import QEFFBaseModel +from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict + +# Dictionary that defines the interface from transformers to be used underneath the QEFF interface +QEFFAutoModelToTransformersAutoModelMap = { + "QEFFAutoModelForCausalLM": AutoModelForCausalLM, + "QEFFAutoModel": AutoModel, +} + + +class QEFFTransformersBase(QEFFBaseModel): + """ + Parent class for models QEFF provides from transformers i.e. (AutoModel, AutoModelForCausalLM, AutoModelForAudioClassification etc.) from src/transformers/models/auto/modeling_auto.py file. + """ + def __init__(self, model: nn.Module, transform:bool = True) -> None: + assert (model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values() or + # FIXME: Use model architectures here instead of complete dictionary TransformersToQEffModulesDict + model.__class__ in TransformersToQEffModulesDict.values()), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. 
{MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore + self.model: nn.Module = model + if transform: + self.transform() + + def __repr__(self) -> str: + return self.model.__repr__() + + @property + def is_transformed(self) -> bool: + return getattr(self.model, "qeff_transformed", False) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): + """ + This method accepts All the parameters that are acceptable by transformers.AutoModelForCausalLM. + There are few additional parameters that this method can take: + :param transform:bool. Whether to optimize model for KV retention; default is True. Pass False to get BertStyle model. + """ + transform: bool = kwargs.get("transform", True) + kwargs.update({"use_cache": True}) # Always pass use_cache = True, to get KV values as output during ONNX export + + model = QEFFAutoModelToTransformersAutoModelMap[cls.__name__].from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + return cls(model, transform=transform) + + + def transform_export(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") + + def transform_export_compile(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") + + def transform(self): + # FIXME: break down transform into optmization passes i.e. HW specific optimization(RMSNorm), KV retention pass etc. + QEfficient.transform(self) + return self + + +class QEFFAutoModelForCausalLM(QEFFTransformersBase): + """ + QEFF class for manipulating any causal language model from HuggingFace hub. + """ + def execute(self, *args, **kwargs): # type: ignore + raise NotImplementedError("Reached too far!!") + + def export(self): + raise NotImplementedError("Reached too far!!") + + def compile(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") + + +class QEffAutoModel(QEFFTransformersBase): + def execute(self, *args, **kwargs): # type: ignore + raise NotImplementedError("Reached too far!!") + + def export(self): + raise NotImplementedError("Reached too far!!") + + def compile(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") diff --git a/QEfficient/src/base.py b/QEfficient/src/base.py new file mode 100644 index 000000000..ddc23fc87 --- /dev/null +++ b/QEfficient/src/base.py @@ -0,0 +1,99 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- + +""" +** This file for holds the classes that handle main functions +1.load i.e. from_pretrained +2.execute +3.transform +4.export +5.compile +For different varities of Transformer Models + +Representation of class inheritence followed keeping in line with transformers/diffusers repos -> + + QEFFBaseModel + ________________________________________________|________________________________________________________________ + | | + QEFFTransformersBase QEFFDiffusersBase + | | + ____________|________________________________________________________ ________________ _________________|______________ + _____ | | | | | | + | QEFFAutoModel QEFFAutoModelForCausalLM QEFFAWQModelForCausalLM ... ... ... 
+QEFFCommonLoader -| [Provides way to       [Provides way to do 1-5 on           [Supports 1-5 for
+[Provides        |  do steps 1-5 on         transformers.AutoModelForCausalLM]   AWQ Models]
+interface to     |_____ transformers.AutoModel]
+Load any of
+These models
+by automatically
+detecting the type
+of the model]
+
+** QEFFBaseModel is an abstract base class that defines the basic structure of these classes.
+** QEFFPipeline classes will stay at the same level as QEFFAutoModel in this hierarchy in the future.
+"""
+
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import Any
+
+
+# Defining placeholder ENUM for execute function
+class Runtime(Enum):
+    CPU_ORT = "CPU ONNX Runtime"
+    CPU_PT = "CPU PyTorch Runtime"
+    AI_100 = "AI_100"
+
+
+class QEFFBaseModel(ABC):
+    """
+    This class acts as the parent class for all the varieties of model classes (i.e. LLMs, SD, quantized etc.).
+    Enforces certain methods to be implemented by child classes.
+
+    All the child classes must provide capabilities to load, transform (optimize), export to ONNX, etc.
+    """
+    def __init__(self) -> None:
+        super().__init__()
+        # Users can call generate or execute
+        self.generate = self.execute
+        self._runtime = Runtime.CPU_PT
+
+    @property
+    def runtime(self) -> Runtime:
+        return self._runtime
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs):
+        raise NotImplementedError("Must implement for child classes")
+
+    @property
+    def is_transformed(self) -> bool:
+        raise NotImplementedError("Must implement for child classes")
+
+    @abstractmethod
+    def transform_export(self, *args, **kwargs) -> Any:
+        pass
+
+    @abstractmethod
+    def transform_export_compile(self, *args, **kwargs) -> Any:
+        pass
+
+    @abstractmethod
+    def execute(self, *args, **kwargs) -> Any:
+        pass
+
+    @abstractmethod
+    def transform(self, *args, **kwargs) -> Any:
+        pass
+
+    @abstractmethod
+    def export(self, *args, **kwargs) -> Any:
+        pass
+
+    @abstractmethod
+    def compile(self, *args, **kwargs) -> Any:
+        pass
\ No newline at end of file
diff --git a/QEfficient/src/common.py b/QEfficient/src/common.py
new file mode 100644
index 000000000..bca391097
--- /dev/null
+++ b/QEfficient/src/common.py
@@ -0,0 +1,86 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+"""
+MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP dictionary defines the mapping between the names of the varieties of Transformer models defined in
+QEFF_MODEL_TYPE and the classes that implement the methods (i.e. compile, export etc.) for those types.
+
+QEFFAutoModel provides a common interface for loading the HuggingFace models using either the HF card name or the local path of a downloaded model.
+"""
+import os
+from enum import Enum
+from typing import Any, Dict, Type
+
+from transformers import AutoConfig
+from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING
+
+from QEfficient.src._transformers.auto import QEFFAutoModelForCausalLM
+from QEfficient.src.base import QEFFBaseModel
+from QEfficient.utils._utils import login_and_download_hf_lm
+
+
+class QEFF_MODEL_TYPE(Enum):
+    """
+    Defines the names of the different varieties of transformer models.
+    """
+    CAUSALLM = "LLM"
+    DIFFUSION = "STABLE_DIFFUSION"
+    AWQ = "AWQ"
+
+
+MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP: Dict[QEFF_MODEL_TYPE, Type[QEFFBaseModel]] = {
+    QEFF_MODEL_TYPE.CAUSALLM: QEFFAutoModelForCausalLM
+}
+
+AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP: Dict[Type[QEFFBaseModel], QEFF_MODEL_TYPE] = {v:k for k,v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items()}
+
+def get_hf_model_type(hf_model_path: str) -> QEFF_MODEL_TYPE:
+    """
+    Loads the model config file and returns the type of the model (i.e. LLM, SD, quantized etc.) as supported by the library.
+    """
+    assert os.path.isdir(hf_model_path), "Please pass a local dir path where the model is downloaded; use `QEfficient.utils.login_and_download_hf_lm` for downloading the hf model"
+    config, kwargs = AutoConfig.from_pretrained(
+        hf_model_path,
+        return_unused_kwargs=True,
+    )
+
+    if config.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING:
+        # FIXME: Add logic to handle if quantization config is stored in separate quant_config.json outside of config, also create a separate function for this and below lines
+        quant_config = getattr(config, "quantization_config", getattr(config, "quant_config", None))
+        if quant_config is not None:
+            if quant_config.get("quant_method", None) == "awq":
+                return QEFF_MODEL_TYPE.AWQ
+            else:
+                raise NotImplementedError(f"current model type is not yet supported {type(config)}")
+        else:
+            return QEFF_MODEL_TYPE.CAUSALLM
+    else:
+        raise NotImplementedError(f"model type {type(config)} is not yet supported")
+
+
+class QEFFCommonLoader:
+    """
+    Provides the same model loading interface as the HuggingFace transformers APIs.
+    Supports loading any model on HuggingFace.
+    """
+    def __init__(self, *args: Any, **kwds: Any) -> None:
+        raise EnvironmentError(
+            f"{self.__class__.__name__} is designed to be instantiated "
+            f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)`")
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> QEFFBaseModel:
+        """
+        Downloads the HuggingFace model if it doesn't already exist locally and returns a QEffAutoModel object based on the type of the model.
+ """ + pretrained_model_name_or_path = pretrained_model_name_or_path if os.path.isdir(pretrained_model_name_or_path) \ + else login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs) + model_type = get_hf_model_type(hf_model_path=pretrained_model_name_or_path) + qeff_auto_model_class = MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP[model_type] + assert issubclass(qeff_auto_model_class, QEFFBaseModel), f"Expected class that inherits {QEFFBaseModel}, got {type(qeff_auto_model_class)}" + + return qeff_auto_model_class.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path) diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index 5ad29ef3d..753d08204 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -5,11 +5,10 @@ # # ----------------------------------------------------------------------------- -import hashlib from collections import namedtuple +from typing import Dict, Type import torch.nn as nn -import transformers from transformers.models.codegen.modeling_codegen import ( CodeGenAttention, CodeGenBlock, @@ -34,32 +33,18 @@ ) from transformers.models.mixtral.modeling_mixtral import ( MixtralAttention, + MixtralBLockSparseTop2MLP, + MixtralDecoderLayer, MixtralForCausalLM, MixtralModel, - MixtralDecoderLayer, - MixtralSparseMoeBlock, - MixtralBLockSparseTop2MLP, - MixtralRotaryEmbedding, MixtralRMSNorm, + MixtralRotaryEmbedding, + MixtralSparseMoeBlock, ) from transformers.models.mpt.modeling_mpt import MptAttention, MptBlock, MptForCausalLM, MptModel from QEfficient.customop import CustomRMSNormAIC -from QEfficient.utils.logging_utils import logger -from .modeling_attn_mask_utils import ( - QEffAttentionMaskConverter, - _qeff_prepare_4d_attention_mask, - _qeff_prepare_4d_causal_attention_mask, -) -from .modeling_outputs import ( - QEffBaseModelOutputWithPast, - QEffBaseModelOutputWithPastAndCrossAttentions, - QEffCausalLMOutputWithCrossAttentions, - QEffCausalLMOutputWithPast, - QEffMoeCausalLMOutputWithPast, - QEffMoeModelOutputWithPast, -) from .models.codegen.modeling_codegen import ( QEffCodeGenAttention, QEffCodeGenBlock, @@ -81,13 +66,13 @@ QEffMistralRotaryEmbedding, ) from .models.mixtral_moe.modeling_mixtral import ( - QEffMixtralModel, - QEffMixtralRotaryEmbedding, QEffMixtralAttention, - QEffMixtralForCausalLM, + QEffMixtralBLockSparseTop2MLP, QEffMixtralDecoderLayer, + QEffMixtralForCausalLM, + QEffMixtralModel, + QEffMixtralRotaryEmbedding, QEffMixtralSparseMoeBlock, - QEffMixtralBLockSparseTop2MLP, ) from .models.mpt.modeling_mpt import QEffMptAttention, QEffMptBlock, QEffMptForCausalLM, QEFfMptModel @@ -108,7 +93,7 @@ # Define a transformers layers to QEff layers dictionary # While onboarding new models make sure to add the new layer maps to this dictionary. -TransformersToQEffModulesDict = { +TransformersToQEffModulesDict: Dict[Type[nn.Module], Type[nn.Module]] = { # GPT model layers GPT2Model: QEffGPT2Model, GPT2Block: QEffGPT2Block, @@ -147,87 +132,3 @@ MixtralSparseMoeBlock: QEffMixtralSparseMoeBlock, MixtralBLockSparseTop2MLP:QEffMixtralBLockSparseTop2MLP, } - - -def get_params_hash(model: nn.Module) -> str: - """ - Creates a Hash of all the parameters values i.e. weights using SHA256 algo. - -------- - :param model: torch.nn.Module. Base PyTorch model. - :returns: str. 
Hash string - """ - hasher = hashlib.sha256() - for _, params in model.named_parameters(): - hasher.update(params.data.numpy().tobytes()) - - return hasher.hexdigest() - - -def replace_module_with_qeff_layers(model: nn.Module) -> None: - """ - Replaces the transformers nn.Module classes with optmized QEff classes in place. - ---------- - :param model: torch.nn.Module. Base PyTorch model. - """ - # Replace if module class is registed in TransformersToQEffModulesDict - target_module = TransformersToQEffModulesDict.get(model.__class__) - if target_module is not None: - model.__class__ = target_module - - # Iterate over child modules - for _, module in model.named_children(): - replace_module_with_qeff_layers(module) - - -def transform(model: nn.Module, form_factor: str = "cloud") -> nn.Module: - """ - Replaces some Transformers' methods for equivalent methods optimized for AI 100. - --------- - Args: - param model (torch.nn.Module): PyTorch model. - form_factor(str): form factor configuration for optmizing the model, available options=["cloud", "edge"]. - - Returns: - torch.nn.Module: PyTorch Module with replaced QEff layers. - """ - - # Introducnig qeff_transformed attribue in model to check status of transform - if getattr(model, "qeff_transformed", False): - print("Model is already transformed") - return model - - - if form_factor == "cloud": - # Get Hash of all params for checking later - prior_params_hash = get_params_hash(model) - logger.warning(f"The model {model.__class__} layers has been upadted to QEff layers in-place") - # Replace with QEff layers - replace_module_with_qeff_layers(model) - - # Check with new params hash - later_params_hash = get_params_hash(model) - assert ( - prior_params_hash == later_params_hash - ), "Weights were changed in the transform process, please report an issue" - - # Replace the modeling output classes - transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions = ( - QEffBaseModelOutputWithPastAndCrossAttentions - ) - transformers.modeling_outputs.CausalLMOutputWithCrossAttentions = QEffCausalLMOutputWithCrossAttentions - transformers.modeling_outputs.BaseModelOutputWithPast = QEffBaseModelOutputWithPast - transformers.modeling_outputs.CausalLMOutputWithPast = QEffCausalLMOutputWithPast - transformers.modeling_outputs.MoeCausalLMOutputWithPast = QEffMoeCausalLMOutputWithPast - transformers.modeling_outputs.MoeModelOutputWithPast = QEffMoeModelOutputWithPast - - # Replace the modeling attn util classes and functions - transformers.modeling_attn_mask_utils.AttentionMaskConverter = QEffAttentionMaskConverter - transformers.modeling_attn_mask_utils._prepare_4d_attention_mask = _qeff_prepare_4d_attention_mask - transformers.modeling_attn_mask_utils._prepare_4d_causal_attention_mask = _qeff_prepare_4d_causal_attention_mask - - setattr(model,'qeff_transformed',True) - return model.eval() - - elif form_factor == "edge": - # Add changes for the edge usecase - raise NotImplementedError("We currently only support cloud form factor!") diff --git a/QEfficient/transformers/transform.py b/QEfficient/transformers/transform.py new file mode 100644 index 000000000..dfd0de5c7 --- /dev/null +++ b/QEfficient/transformers/transform.py @@ -0,0 +1,123 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import hashlib + +import torch.nn as nn +import transformers + +from QEfficient.src.base import QEFFBaseModel +from QEfficient.src.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE +from QEfficient.transformers.modeling_attn_mask_utils import ( + QEffAttentionMaskConverter, + _qeff_prepare_4d_attention_mask, + _qeff_prepare_4d_causal_attention_mask, +) +from QEfficient.transformers.modeling_outputs import ( + QEffBaseModelOutputWithPast, + QEffBaseModelOutputWithPastAndCrossAttentions, + QEffCausalLMOutputWithCrossAttentions, + QEffCausalLMOutputWithPast, + QEffMoeCausalLMOutputWithPast, + QEffMoeModelOutputWithPast, +) +from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict +from QEfficient.utils.logging_utils import logger + + +def replace_module_with_qeff_layers(model: nn.Module) -> None: + """ + Replaces the transformers nn.Module classes with optmized QEff classes in place. + ---------- + :param model: torch.nn.Module. Base PyTorch model. + """ + # Replace if module class is registed in TransformersToQEffModulesDict + target_module = TransformersToQEffModulesDict.get(model.__class__) + if target_module is not None: + model.__class__ = target_module + + # Iterate over child modules + for _, module in model.named_children(): + replace_module_with_qeff_layers(module) + + +def get_params_hash(model: nn.Module) -> str: + """ + Creates a Hash of all the parameters values i.e. weights using SHA256 algo. + -------- + :param model: torch.nn.Module. Base PyTorch model. + :returns: str. Hash string + """ + hasher = hashlib.sha256() + for _, params in model.named_parameters(): + hasher.update(params.data.numpy().tobytes()) + + return hasher.hexdigest() + + +def transform_lm(model: nn.Module) -> nn.Module: + """ + Replaces some Transformers torch.nn.Module layers for equivalent optimized modules for cloud AI 100. + --------- + Args: + param model (torch.nn.Module): PyTorch model. + + Returns: + torch.nn.Module: PyTorch Module with replaced QEff layers. 
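The in-place module swap above works by reassigning `__class__`, which changes an instance's behaviour without touching its registered parameters. A self-contained toy illustration of that mechanism (toy classes, not QEfficient code):

import torch
import torch.nn as nn


class Base(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(4, 4)

    def forward(self, x):
        return self.fc(x)


class Optimized(Base):  # same attributes, different forward
    def forward(self, x):
        return self.fc(x) * 2.0


m = Base()
before = [p.clone() for p in m.parameters()]
m.__class__ = Optimized  # same trick replace_module_with_qeff_layers relies on
assert all(torch.equal(a, b) for a, b in zip(before, m.parameters()))  # weights untouched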
+ """ + + # Introducnig qeff_transformed attribue in model to check status of transform + if getattr(model, "qeff_transformed", False): + print("Model is already transformed") + return model + + # Get Hash of all params for checking later + prior_params_hash = get_params_hash(model) + logger.warning(f"The model {model.__class__} layers has been upadted to QEff layers in-place") + # Replace with QEff layers + replace_module_with_qeff_layers(model) + + # Check with new params hash + later_params_hash = get_params_hash(model) + assert ( + prior_params_hash == later_params_hash + ), "Weights were changed in the transform process, please report an issue" + + # Replace the modeling output classes + transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions = ( + QEffBaseModelOutputWithPastAndCrossAttentions + ) + transformers.modeling_outputs.CausalLMOutputWithCrossAttentions = QEffCausalLMOutputWithCrossAttentions + transformers.modeling_outputs.BaseModelOutputWithPast = QEffBaseModelOutputWithPast + transformers.modeling_outputs.CausalLMOutputWithPast = QEffCausalLMOutputWithPast + transformers.modeling_outputs.MoeCausalLMOutputWithPast = QEffMoeCausalLMOutputWithPast + transformers.modeling_outputs.MoeModelOutputWithPast = QEffMoeModelOutputWithPast + + # Replace the modeling attn util classes and functions + transformers.modeling_attn_mask_utils.AttentionMaskConverter = QEffAttentionMaskConverter + transformers.modeling_attn_mask_utils._prepare_4d_attention_mask = _qeff_prepare_4d_attention_mask + transformers.modeling_attn_mask_utils._prepare_4d_causal_attention_mask = _qeff_prepare_4d_causal_attention_mask + + setattr(model,'qeff_transformed',True) + return model.eval() + + +def transform(model: QEFFBaseModel, form_factor="cloud"): + """ + This function serves for optimizing any kind of model (i.e. LLM, SD, AWQ etc.) for cloud AI 100. + Will replace the torch.nn.Module layers of passed QEffModel with optimized implementation of the same. + + model: object of any instance of class that is child of `QEFFBaseAutoModelFactory` + form_factor(str): form factor configuration for optmizing the model, available options=["cloud", "edge"]. + """ + assert form_factor == "cloud", "Only form_factor='cloud' is supported as of now!" 
+ #FIXME: move this to class and use model.transform() + if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: + transform_lm(model.model) # type: ignore + return model + else: + raise NotImplementedError(f"Recieved unsupported class of type {type(model)}") \ No newline at end of file diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 486bae664..bd6b59120 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -5,96 +5,11 @@ # # ----------------------------------------------------------------------------- -import os -from typing import List, Optional, Tuple, Union - -import requests -from huggingface_hub import snapshot_download -from requests.exceptions import HTTPError - -from QEfficient.utils.constants import QEFF_MODELS_DIR - - -def hf_download( - repo_id: Optional[str] = None, - cache_dir: Optional[str] = None, - hf_token: Optional[str] = None, - allow_patterns: Optional[List[str]] = None, - ignore_patterns: Optional[List[str]] = None, -): - # Setup cache and local dir - local_dir = None - if cache_dir is not None: - cache_dir = f"{cache_dir}" - local_dir = f"{cache_dir}/{repo_id}" - - os.makedirs(f"{cache_dir}/{repo_id}", exist_ok=True) - max_retries = 5 - retry_count = 0 - while retry_count < max_retries: - try: - model_path = snapshot_download( - repo_id, - cache_dir=cache_dir, - local_dir=local_dir, - local_dir_use_symlinks=True, - revision="main", - resume_download=True, - token=hf_token, - allow_patterns=allow_patterns, - ignore_patterns=ignore_patterns, - ) - break - except requests.ReadTimeout as e: - print(f"Read timeout: {e}") - retry_count += 1 - - except HTTPError as e: - retry_count = max_retries - if e.response.status_code == 401: - print("You need to pass a valid `--hf_token=...` to download private checkpoints.") - else: - raise e - - return model_path - - -def qpc_exists(model_name: str, qpc_base_dir_name: str) -> Union[Tuple[bool, str], None]: - """ - Checks if qpc files already exists, removes the directory if files have been manipulated. - --------- - :param model_name: str. HF Model card name. - :param dir_path: str. Path of qpc directory. - :return: Union[Tuple[bool, str]]: qpc_exists and path to qpc directory - """ - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - os.makedirs(model_card_dir, exist_ok=True) - - qpc_dir_path = os.path.join(model_card_dir, qpc_base_dir_name, "qpcs") - - # Compute the boolean indicating if the QPC exists - qpc_exists_bool = os.path.isdir(qpc_dir_path) and os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin")) - - return qpc_exists_bool, qpc_dir_path - - -def onnx_exists(model_name: str) -> Union[Tuple[bool, str, str], None]: - """ - Checks if qpc files already exists, removes the directory if files have been manipulated. - --------- - :param model_name: str. HF Model card name. 
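Tying the dispatch back to the public entry point, a short sketch based on the test added later in this diff.

import QEfficient
from QEfficient import QEFFCommonLoader

qeff_model = QEFFCommonLoader.from_pretrained("gpt2")
qeff_model = QEfficient.transform(qeff_model)  # CAUSALLM models are routed through transform_lm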
- :return: Union[Tuple[bool, str, str]]: onnx_exists and path to onnx file and directory - """ - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - os.makedirs(model_card_dir, exist_ok=True) - - onnx_dir_path = os.path.join(model_card_dir, "onnx") - onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") - - # Compute the boolean indicating if the ONNX model exists - onnx_exists_bool = os.path.isfile(onnx_model_path) and os.path.isfile( - os.path.join(os.path.dirname(onnx_model_path), "custom_io_fp16.yaml") - ) - - # Return the boolean, onnx_dir_path, and onnx_model_path - return onnx_exists_bool, onnx_dir_path, onnx_model_path +from QEfficient.utils._utils import ( # noqa: F401 + get_qpc_dir_name_infer, + hf_download, + load_hf_tokenizer, + login_and_download_hf_lm, + onnx_exists, + qpc_exists, +) diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py new file mode 100644 index 000000000..7a0d85828 --- /dev/null +++ b/QEfficient/utils/_utils.py @@ -0,0 +1,142 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import os +from typing import List, Optional, Tuple, Union + +import requests +from huggingface_hub import login, snapshot_download +from requests.exceptions import HTTPError +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast + +from QEfficient.utils.constants import QEFF_MODELS_DIR +from QEfficient.utils.logging_utils import logger + + +def login_and_download_hf_lm(model_name, *args, **kwargs): + logger.info(f"loading HuggingFace model for {model_name}") + hf_token = kwargs.pop("hf_token", None) + cache_dir = kwargs.pop("cache_dir", None) + if hf_token is not None: + login(hf_token) + model_name = hf_download( + repo_id=model_name, + cache_dir=cache_dir, + ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], + ) + return model_name + + +def hf_download( + repo_id: Optional[str] = None, + cache_dir: Optional[str] = None, + hf_token: Optional[str] = None, + allow_patterns: Optional[List[str]] = None, + ignore_patterns: Optional[List[str]] = None, +): + # Setup cache and local dir + local_dir = None + if cache_dir is not None: + cache_dir = f"{cache_dir}" + local_dir = f"{cache_dir}/{repo_id}" + + os.makedirs(f"{cache_dir}/{repo_id}", exist_ok=True) + max_retries = 5 + retry_count = 0 + while retry_count < max_retries: + try: + model_path = snapshot_download( + repo_id, + cache_dir=cache_dir, + local_dir=local_dir, + local_dir_use_symlinks=True, + revision="main", + resume_download=True, + token=hf_token, + allow_patterns=allow_patterns, + ignore_patterns=ignore_patterns, + ) + break + except requests.ReadTimeout as e: + print(f"Read timeout: {e}") + retry_count += 1 + + except HTTPError as e: + retry_count = max_retries + if e.response.status_code == 401: + print("You need to pass a valid `--hf_token=...` to download private checkpoints.") + else: + raise e + + return model_path + + +def qpc_exists(model_name: str, qpc_base_dir_name: str) -> Tuple[bool, str]: + """ + Checks if qpc dir exists. + Returns + 1. Boolean variable indicating if qpc files exist + 2. Path of the qpc dir if found. + --------- + :param model_name: str. HF Model card name. + :param dir_path: str. Path of qpc directory. 
+ :return: Union[Tuple[bool, str]]: qpc_exists and path to qpc directory + """ + model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) + os.makedirs(model_card_dir, exist_ok=True) + + qpc_dir_path = os.path.join(model_card_dir, qpc_base_dir_name, "qpcs") + + # Compute the boolean indicating if the QPC exists + qpc_exists_bool = os.path.isdir(qpc_dir_path) and os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin")) + + return qpc_exists_bool, qpc_dir_path + + +def onnx_exists(model_name: str) -> Tuple[bool, str, str]: + """ + Checks if qpc files already exists, removes the directory if files have been manipulated. + --------- + :param model_name: str. HF Model card name. + :return: Union[Tuple[bool, str, str]]: onnx_exists and path to onnx file and directory + """ + model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) + os.makedirs(model_card_dir, exist_ok=True) + + onnx_dir_path = os.path.join(model_card_dir, "onnx") + onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") + + # Compute the boolean indicating if the ONNX model exists + onnx_exists_bool = os.path.isfile(onnx_model_path) and os.path.isfile( + os.path.join(os.path.dirname(onnx_model_path), "custom_io_fp16.yaml") + ) + + # Return the boolean, onnx_dir_path, and onnx_model_path + return onnx_exists_bool, onnx_dir_path, onnx_model_path + + +def load_hf_tokenizer(model_name: str, cache_dir: Optional[str] = None, hf_token: Optional[str] = None, padding_side:str = "left", **kwargs) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: + logger.info(f"Loading Tokenizer for {model_name}") + if hf_token is not None: + login(hf_token) + + # Download tokenizer along with model if it doesn't exist + model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"]) + #FIXME(ochougul): should this always return left padded tokenizer? + tokenizer = AutoTokenizer.from_pretrained(model_hf_path, padding_side=padding_side, trust_remote_code=True, **kwargs) + return tokenizer + + +def get_qpc_dir_name_infer(num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group): + qpc_base_dir_name = ( + f"qpc_{num_cores}cores_{batch_size}BS_{prompt_len}PL_{ctx_len}CL_{mos}MOS_" + + f"{len(device_group)}" + + "devices" + + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else "_fp16_mxint8" if mxint8 else "_fp16") + ) + + return qpc_base_dir_name diff --git a/QEfficient/utils/logging_utils.py b/QEfficient/utils/logging_utils.py index fe42d5ed9..044e6e83f 100644 --- a/QEfficient/utils/logging_utils.py +++ b/QEfficient/utils/logging_utils.py @@ -13,19 +13,20 @@ class QEffFormatter(logging.Formatter): Formatter class used to set colors for printing different logging levels of messages on console. 
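For reference, a quick sketch of the two helpers added above; the compile settings are illustrative values, not defaults of this function.

from QEfficient.utils import get_qpc_dir_name_infer, load_hf_tokenizer

tokenizer = load_hf_tokenizer("gpt2")  # left-padded by default, see the FIXME above
qpc_dir_name = get_qpc_dir_name_infer(
    num_cores=14, mos=-1, batch_size=1, prompt_len=32,
    ctx_len=128, mxfp6=True, mxint8=False, device_group=[0],
)
print(qpc_dir_name)  # qpc_14cores_1BS_32PL_128CL_-1MOS_1devices_mxfp6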
""" - grey = "\x1b[38;20m" - yellow = "\x1b[33;20m" - red = "\x1b[31;20m" - bold_red = "\x1b[31;1m" - reset = "\x1b[0m" - format = "%(levelname)s - %(name)s - %(message)s (%(filename)s:%(lineno)d)" + cyan: str = "\x1b[38;5;14m" + yellow: str = "\x1b[33;20m" + red: str = "\x1b[31;20m" + bold_red: str = "\x1b[31;1m" + reset: str = "\x1b[0m" + common_format: str = "%(levelname)s - %(name)s - %(message)s" # type: ignore + format_with_line_info = "%(levelname)s - %(name)s - %(message)s (%(filename)s:%(lineno)d)" # type: ignore FORMATS = { - logging.DEBUG: grey + format + reset, - logging.INFO: grey + format + reset, - logging.WARNING: yellow + format + reset, - logging.ERROR: red + format + reset, - logging.CRITICAL: bold_red + format + reset, + logging.DEBUG: cyan + format_with_line_info + reset, + logging.INFO: cyan + common_format + reset, + logging.WARNING: yellow + common_format + reset, + logging.ERROR: red + format_with_line_info + reset, + logging.CRITICAL: bold_red + format_with_line_info + reset, } def format(self, record): @@ -45,7 +46,7 @@ def create_logger() -> logging.Logger: # create console handler and set level to debug ch = logging.StreamHandler() - ch.setLevel(logging.WARNING) + ch.setLevel(logging.INFO) # define formatter ch.setFormatter(QEffFormatter()) diff --git a/QEfficient/utils/run_utils.py b/QEfficient/utils/run_utils.py index 598f24553..f30016a5a 100644 --- a/QEfficient/utils/run_utils.py +++ b/QEfficient/utils/run_utils.py @@ -58,7 +58,6 @@ def run_hf_model_on_pytorch(self, model_hf): Return: generated_ids: numpy.ndarray - output tokens """ - input_ids = self.tokenizer.encode(self.prompt[0], return_tensors="pt") input_ids_len = len(input_ids[0]) @@ -77,6 +76,7 @@ def run_hf_model_on_pytorch(self, model_hf): print("Completion:", repr(generated_text)) return generated_ids + def run_kv_model_on_pytorch(self, model, n_layer, padding_shape): """ Function responsible for running KV PyTorch model and return the output tokens diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 668a3b473..3095c7044 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -13,7 +13,16 @@ "id": "88eef7ea-3488-414c-9e36-e960abba30c9", "metadata": {}, "source": [ - "##### Download the OpenSource GPT2 based HuggingFace Model and Save in local *Cache* directory" + "##### Download the OpenSource GPT2 based HuggingFace Model and Save in local *Cache* directory\n", + "###### We Modify the GPT2 Classes using the Optimized Software Library to generate model for Cloud AI 100.\n", + "###### User can disable this optmization by passing `transfrom=False` in the `from_pretrained` call\n", + "###### Here we generate models with below Optimizations:\n", + "\n", + "* RMS Norm Fixes for FP16 Overflows and Underflow\n", + "* Causal Mask Fix\n", + "* Handling FP16 Overflows.\n", + "* KV Cache (Retention Changes).\n", + "* Triu/Tril Ops support." 
] }, { @@ -26,59 +35,19 @@ "# Initiate the Orignal Transformer model\n", "import os\n", "\n", - "from transformers import AutoTokenizer\n", - "from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel\n", - "\n", - "from QEfficient.utils import hf_download\n", - "from QEfficient.utils.constants import Constants\n", + "from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n", "\n", "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n", "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n", "\n", - "ROOT_DIR = os.path.dirname(os.path.abspath(\"\"))\n", + "# ROOT_DIR = os.path.dirname(os.path.abspath(\"\"))\n", + "# CACHE_DIR = os.path.join(ROOT_DIR, \"tmp\") #, you can use a different location for just one model by passing this param as cache_dir in below API.\n", "\n", "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n", "model_name = \"gpt2\" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n", "\n", - "model_hf_path = hf_download(\n", - " repo_id=model_name,\n", - " cache_dir=Constants.CACHE_DIR,\n", - " ignore_patterns=[\"*.txt\", \"*.onnx\", \"*.ot\", \"*.md\", \"*.tflite\", \"*.pdf\"],\n", - ")\n", - "model_hf = GPT2LMHeadModel.from_pretrained(model_hf_path, use_cache=True)\n", - "model_hf.eval()\n", - "print(f\"{model_name} from hugging-face \\n\", model_hf)" - ] - }, - { - "cell_type": "markdown", - "id": "a89dfa0a-d8fe-4472-bf00-55e563ae9058", - "metadata": {}, - "source": [ - "##### Now we Modify the GPT2 Classes using the Optimized Software Library to generate model for Cloud AI 100.\n", - "##### Here we generate models with below Optimizations:\n", - "\n", - "* RMS Norm Fixes for FP16 Overflows and Underflow\n", - "* Causal Mask Fix\n", - "* Handling FP16 Overflows.\n", - "* KV Cache (Retention Changes).\n", - "* Triu/Tril Ops support." 
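Condensed into one script, the notebook flow in this diff reads roughly as follows; this is a sketch of the cells shown here, not additional API surface.

import os

import QEfficient
from QEfficient import QEFFAutoModelForCausalLM
from QEfficient.generation.text_generation_inference import get_compilation_batch_size
from QEfficient.utils import load_hf_tokenizer

model_name = "gpt2"
qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)  # transformed for AI 100 by default
tokenizer = load_hf_tokenizer(model_name, use_cache=True)

base_path, onnx_path = QEfficient.export(
    model_name=model_name, model_kv=qeff_model, tokenizer=tokenizer,
    kv=True, form_factor="cloud", return_path=True,
)
generated_qpc_path = QEfficient.compile(
    onnx_path=onnx_path, qpc_path=os.path.dirname(base_path),
    num_cores=14, mxfp6=False, device_group=[0],
)
batch_size = get_compilation_batch_size(generated_qpc_path)
QEfficient.cloud_ai_100_exec_kv(
    batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path,
    device_id=[0], prompt=["My name is"],
)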
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a4543b94-9b50-4bcc-90c6-484ab694c9a6", - "metadata": {}, - "outputs": [], - "source": [ - "import QEfficient\n", - "\n", - "# Easy and minimal api to update the model\n", - "model_transformed = QEfficient.transform(model_hf, type=\"Transformers\", form_factor=\"cloud\")\n", - "\n", - "model_transformed.eval()\n", - "print(\"Model after Optimized transformations \\n\", model_transformed)" + "qeff_model = AutoModelForCausalLM.from_pretrained(model_name)\n", + "print(f\"{model_name} optmized for AI 100 \\n\", qeff_model)" ] }, { @@ -96,8 +65,8 @@ "metadata": {}, "outputs": [], "source": [ - "from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter\n", - "\n", + "import QEfficient\n", + "from QEfficient.utils import load_hf_tokenizer\n", "# We can now export the modified models to Onnx framework\n", "# This will generate single Onnx Model for both Prefill and Decode Variations which are optimized for\n", "# Cloud AI 100 Platform.\n", @@ -109,14 +78,14 @@ "# We can generate the KV Style models with the flag \"kv\"\n", "# Bertstyle models do not have any optimization w.r.t KV cache changes and are unoptimized version.\n", "# It is recommended to use kv=True for better performance.\n", - "tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side=\"left\")\n", - "base_path, onnx_path = qualcomm_efficient_converter(\n", - " model_kv=model_transformed,\n", + "tokenizer = load_hf_tokenizer(model_name, use_cache=True)\n", + "base_path, onnx_path = QEfficient.export(\n", " model_name=model_name,\n", + " model_kv=qeff_model,\n", + " tokenizer=tokenizer,\n", " kv=True,\n", " form_factor=\"cloud\",\n", " return_path=True,\n", - " tokenizer=tokenizer,\n", ")" ] }, @@ -136,13 +105,12 @@ "outputs": [], "source": [ "# Please use platform SDk to Check num_cores for your card.\n", - "from QEfficient.cloud.compile import main as compile\n", "\n", - "generated_qpc_path = compile(\n", + "generated_qpc_path = QEfficient.compile(\n", " onnx_path=onnx_path,\n", " num_cores=14,\n", - " qpc_path=base_path,\n", - " mxfp6=True,\n", + " qpc_path=os.path.dirname(base_path),\n", + " mxfp6=False,\n", " device_group=[0],\n", ")" ] @@ -162,12 +130,12 @@ "metadata": {}, "outputs": [], "source": [ - "from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size\n", + "from QEfficient.generation.text_generation_inference import get_compilation_batch_size\n", "\n", "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", - "batch_size = get_compilation_batch_size(generated_qpc_path)\n" - "cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")" + "batch_size = get_compilation_batch_size(generated_qpc_path)\n", + "QEfficient.cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])" ] } ], diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index 8533eedcc..15e84399a 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -13,7 +13,15 @@ "id": "88eef7ea-3488-414c-9e36-e960abba30c9", "metadata": {}, "source": [ - "##### Download the OpenSource MPT based HuggingFace 
Model and Save in local *Cache* directory" + "##### Download the OpenSource MPT based HuggingFace Model and Save in local *Cache* directory\n", + "###### Now we Modify the MPT Classes using the Optimized Software Library to generate model for Cloud AI 100.\n", + "###### Here we generate models with below Optimizations:\n", + "\n", + "* RMS Norm Fixes for FP16 Overflows and Underflow\n", + "* Causal Mask Fix\n", + "* Handling FP16 Overflows.\n", + "* KV Cache (Retention Changes).\n", + "* Triu/Tril Ops support." ] }, { @@ -26,58 +34,18 @@ "# Initiate the Orignal Transformer model\n", "import os\n", "\n", - "from transformers import AutoTokenizer\n", - "from transformers.models.mpt.modeling_mpt import MptForCausalLM\n", - "\n", - "from QEfficient.utils import hf_download\n", - "from QEfficient.utils.constants import Constants\n", + "from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n", "\n", "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n", "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n", "\n", - "ROOT_DIR = os.path.dirname(os.path.abspath(\"\"))\n", + "#ROOT_DIR = os.path.dirname(os.path.abspath(\"\"))\n", + "#CACHE_DIR = os.path.join(ROOT_DIR, \"tmp\"), you can use a different location for just one model by passing this param as cache_dir in below API.\n", "\n", "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n", "model_name = \"mosaicml/mpt-7b\" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n", - "model_hf_path = hf_download(\n", - " repo_id=model_name,\n", - " cache_dir=Constants.CACHE_DIR,\n", - " ignore_patterns=[\"*.txt\", \"*.onnx\", \"*.ot\", \"*.md\", \"*.tflite\", \"*.pdf\"],\n", - ")\n", - "model_hf = MptForCausalLM.from_pretrained(model_hf_path, use_cache=True)\n", - "model_hf.eval()\n", - "print(f\"{model_name} from hugging-face \\n\", model_hf)" - ] - }, - { - "cell_type": "markdown", - "id": "a89dfa0a-d8fe-4472-bf00-55e563ae9058", - "metadata": {}, - "source": [ - "##### Now we Modify the MPT Classes using the Optimized Software Library to generate model for Cloud AI 100.\n", - "##### Here we generate models with below Optimizations:\n", - "\n", - "* RMS Norm Fixes for FP16 Overflows and Underflow\n", - "* Causal Mask Fix\n", - "* Handling FP16 Overflows.\n", - "* KV Cache (Retention Changes).\n", - "* Triu/Tril Ops support." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a4543b94-9b50-4bcc-90c6-484ab694c9a6", - "metadata": {}, - "outputs": [], - "source": [ - "import QEfficient\n", - "\n", - "# Easy and minimal api to update the model\n", - "model_transformed = QEfficient.transform(model_hf, type=\"Transformers\", form_factor=\"cloud\")\n", - "\n", - "model_transformed.eval()\n", - "print(\"Model after Optimized transformations \\n\", model_transformed)" + "qeff_model = AutoModelForCausalLM.from_pretrained(model_name)\n", + "print(f\"{model_name} optmized for AI 100 \\n\", qeff_model)" ] }, { @@ -95,7 +63,8 @@ "metadata": {}, "outputs": [], "source": [ - "from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter\n", + "import QEfficient\n", + "from QEfficient.utils import load_hf_tokenizer\n", "\n", "# We have the utils to export the modified models to Onnx framework\n", "# This will generate single Onnx Model for both Prefill and Decode Variations which are optimized for\n", @@ -107,14 +76,14 @@ "\n", "# We can generate both bertstyle and KV Style models with the flag \"kv\"\n", "# Bertstyle models do not have any optimization w.r.t KV cache changes and are unoptimized version.\n", - "tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side=\"left\")\n", - "base_path, onnx_path = qualcomm_efficient_converter(\n", - " model_kv=model_transformed,\n", + "tokenizer = load_hf_tokenizer(model_name, use_cache=True, padding_side=\"left\")\n", + "base_path, onnx_path = QEfficient.export(\n", " model_name=model_name,\n", + " model_kv=qeff_model,\n", + " tokenizer=tokenizer,\n", " kv=True,\n", " form_factor=\"cloud\",\n", " return_path=True,\n", - " tokenizer=tokenizer,\n", ")" ] }, @@ -134,12 +103,11 @@ "outputs": [], "source": [ "# Please use platform SDk to Check num_cores for your card.\n", - "from QEfficient.cloud.compile import main as compile\n", "\n", - "generated_qpc_path = compile(\n", + "generated_qpc_path = QEfficient.compile(\n", " onnx_path=onnx_path,\n", " num_cores=14,\n", - " qpc_path=base_path,\n", + " qpc_path=os.path.dirname(base_path),\n", " mxfp6=True,\n", " device_group=[0],\n", ")" @@ -160,13 +128,13 @@ "metadata": {}, "outputs": [], "source": [ - "from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size\n", + "from QEfficient.generation.text_generation_inference import get_compilation_batch_size\n", "\n", "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "\n", - "batch_size = get_compilation_batch_size(generated_qpc_path)" - "cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")" + "batch_size = get_compilation_batch_size(generated_qpc_path)\n", + "QEfficient.cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])" ] } ], diff --git a/tests/test_loader.py b/tests/test_loader.py new file mode 100644 index 000000000..5c626361b --- /dev/null +++ b/tests/test_loader.py @@ -0,0 +1,35 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
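The parametrised loader test below is driven by a dict of model cards; extending coverage to another supported architecture would be one more entry, for example (hypothetical addition inside tests/test_loader.py, not part of this diff):

# Hypothetical extra entry for model_name_to_params_dict in tests/test_loader.py
from transformers.models.mpt.modeling_mpt import MptForCausalLM

from QEfficient import QEFFAutoModelForCausalLM

model_name_to_params_dict["mosaicml/mpt-7b"] = {
    "qeff_class": QEFFAutoModelForCausalLM,
    "hf_class": MptForCausalLM,
    "prompt": "Equator is",
}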
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +from typing import Any, Dict + +import pytest +from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel + +import QEfficient +from QEfficient import QEFFAutoModelForCausalLM, QEFFCommonLoader + +model_name_to_params_dict : Dict[str, Dict[str, Any]] = { + "gpt2": { + "qeff_class": QEFFAutoModelForCausalLM, + "hf_class": GPT2LMHeadModel, + "prompt": "Equator is" + }, + +} +model_names = model_name_to_params_dict.keys() + +#FIXME: Add test cases for passing cache_dir, pretrained_model_path instead of card name, etc., Passing other kwargs +@pytest.mark.parametrize("model_name", model_names) +def test_qeff_auto_model_for_causal_lm(model_name: str): + model = QEFFCommonLoader.from_pretrained(model_name) + assert isinstance(model, model_name_to_params_dict[model_name]['qeff_class']) + assert isinstance(model.model, model_name_to_params_dict[model_name]['hf_class']) # type: ignore + + # Run transform + QEfficient.transform(model) + print(model) diff --git a/tests/utils.py b/tests/utils.py index f8fd7566e..ace803f8f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -10,12 +10,11 @@ import shutil import unittest -import transformers - -import QEfficient +from QEfficient import QEFFAutoModelForCausalLM +from QEfficient.compile.compile_helper import compile_kv_model_on_cloud_ai_100 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100 -from QEfficient.utils import hf_download +from QEfficient.transformers.transform import transform_lm +from QEfficient.utils import hf_download, load_hf_tokenizer from QEfficient.utils.constants import QEFF_MODELS_DIR, ROOT_DIR, Constants from QEfficient.utils.device_utils import get_available_device_id, is_multi_qranium_setup_available, is_qpc_size_gt_32gb from QEfficient.utils.run_utils import ApiRunner @@ -67,10 +66,7 @@ def get_tokenizer(model_name): :param model_name: str :return tokenizer """ - model_hf_path = hf_download(repo_id=model_name, allow_patterns=["*.json"]) - tokenizer = transformers.AutoTokenizer.from_pretrained(model_hf_path, padding_side="left") - if tokenizer.pad_token_id is None: - tokenizer.pad_token_id = tokenizer.eos_token_id + tokenizer = load_hf_tokenizer(model_name=model_name) return tokenizer @@ -98,7 +94,7 @@ def transform_pt_model_with_qeff(model_hf): :param model_hf: pytorch model :return model_kv """ - model_kv = QEfficient.transform(model_hf, type="Transformers", form_factor="cloud") + model_kv = transform_lm(model_hf) model_kv.eval() return model_kv @@ -113,8 +109,7 @@ def export_onnx(model_kv, tokenizer, model_name, model_class): onnx_dir_path = os.path.join(QEFF_MODELS_DIR, model_name) base_path, onnx_model_path = qualcomm_efficient_converter( model_name=model_name, - model_class=model_class, - model_kv=model_kv, + model_kv=QEFFAutoModelForCausalLM(model=model_kv), # type: ignore tokenizer=tokenizer, onnx_dir_path=onnx_dir_path, kv=True, @@ -159,14 +154,13 @@ def set_up(model_config, device_group=[0]): model_config["model_name"], model_config["model_class"], ) - try: - ort_tokens = api_runner.run_kv_model_on_ort( - onnx_model_path, - model_config["n_layer"], - model_config["padding_shape"], - ) - except Exception as e: - print(f"ONNX Model run on onnxrt failed due to : {e}") + + ort_tokens = api_runner.run_kv_model_on_ort( + onnx_model_path, + model_config["n_layer"], + 
model_config["padding_shape"], + ) + setup_info = {} setup_info["model_config"] = model_config