From 31b14667deb9322f16f58958751c3e0d1717ae21 Mon Sep 17 00:00:00 2001
From: Mamta Singh
Date: Mon, 29 Apr 2024 19:22:26 +0530
Subject: [PATCH] Update README.md to fix broken links and extend hf_download
 to support allow and ignore patterns when downloading model files from
 Hugging Face

Signed-off-by: vbaddi
---
 QEfficient/cloud/execute.py                   |  4 +--
 QEfficient/cloud/infer.py                     | 27 ++++++++++---------
 .../exporter/export_hf_to_cloud_ai_100.py     | 21 ++++++++++-----
 QEfficient/exporter/export_utils.py           | 18 ++++++++-----
 .../generation/text_generation_inference.py   |  4 +--
 QEfficient/utils/__init__.py                  | 11 +++++---
 README.md                                     | 18 ++++++-------
 notebooks/QEfficientGPT2.ipynb                | 11 ++++++--
 notebooks/QEfficientMPT.ipynb                 | 10 +++++--
 tests/utils.py                                | 19 ++++++++++---
 10 files changed, 92 insertions(+), 51 deletions(-)

diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py
index b83ff0324..6c53ce73c 100644
--- a/QEfficient/cloud/execute.py
+++ b/QEfficient/cloud/execute.py
@@ -8,8 +8,8 @@
 import argparse
 from typing import List

-from transformers import AutoTokenizer
 from huggingface_hub import login
+from transformers import AutoTokenizer

 from QEfficient.generation.text_generation_inference import latency_stats_kv
 from QEfficient.utils import hf_download
@@ -35,7 +35,7 @@ def main(
     if hf_token is not None:
         login(hf_token)
     # Download tokenizer along with model if it doesn't exist
-    model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir)
+    model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json"])
     tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True)

     latency_stats_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=devices, prompt=prompt)
diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 376eb1e4d..687a4872c 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -5,20 +5,20 @@
 #
 # -----------------------------------------------------------------------------

-import os
-import shutil
 import argparse
+import os
 from typing import List

-from transformers import AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import login
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
 import QEfficient
-from QEfficient.utils import hf_download
 from QEfficient.cloud.compile import main as compile
-from QEfficient.utils.constants import Constants, QEFF_MODELS_DIR
-from QEfficient.utils.logging_utils import logger
 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
 from QEfficient.generation.text_generation_inference import latency_stats_kv
+from QEfficient.utils import hf_download
+from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
+from QEfficient.utils.logging_utils import logger

 """
 1. Check if compiled qpc for given config already exists, if it does jump to execute, else
@@ -35,9 +35,7 @@ def qpc_exists(qpc_dir_path: str) -> bool:
     :param dir_path: str. Path of qpc directory.
     :return: bool.
     """
-    return (os.path.isdir(qpc_dir_path) and
-            os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin")))
-
+    return os.path.isdir(qpc_dir_path) and os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin"))


 def onnx_exists(onnx_file_path: str) -> bool:
@@ -81,7 +79,11 @@ def main(
     # Get tokenizer
     if hf_token is not None:
         login(hf_token)
-    model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir)
+    model_hf_path = hf_download(
+        repo_id=model_name,
+        cache_dir=cache_dir,
+        ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"],
+    )
     tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True)

     if qpc_exists(qpc_dir_path):
@@ -128,7 +130,7 @@ def main(
         kv=True,
         form_factor="cloud",
         return_path=True,
-        tokenizer=tokenizer
+        tokenizer=tokenizer,
     )
     assert (
         generated_onnx_path == onnx_model_path
@@ -194,7 +196,8 @@ def main(
         help="Input prompt, if executing for batch size>1, pass input promprs in single string but seperate with pipe (|) symbol",
     )
     parser.add_argument(
-        "--aic_enable_depth_first", "--aic-enable-depth-first",
+        "--aic_enable_depth_first",
+        "--aic-enable-depth-first",
         action="store_true",
         help="If passed, this option will be enabled during compilation, disabled by default",
     )
diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py
index df9912bfa..cbd2495eb 100644
--- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py
+++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py
@@ -6,18 +6,17 @@
 # -----------------------------------------------------------------------------

 import os
-from typing import Tuple, Optional
 import shutil
+from typing import Optional, Tuple

 import torch
-from transformers import AutoTokenizer
 from huggingface_hub import login
+from transformers import AutoTokenizer

 from QEfficient.exporter.export_utils import export_onnx, fix_onnx_fp16, generate_input_files, run_model_on_ort
-from QEfficient.utils.constants import Constants
-from QEfficient.utils import hf_download
 from QEfficient.transformers.modeling_utils import transform
-from QEfficient.utils.constants import QEFF_MODELS_DIR
+from QEfficient.utils import hf_download
+from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
 from QEfficient.utils.logging_utils import logger


@@ -80,7 +79,11 @@ def convert_to_cloud_bertstyle(
     try:
         if hf_token:
             login(hf_token)
-        model_hf_path = hf_download(repo_id=model_name, cache_dir=Constants.CACHE_DIR)
+        model_hf_path = hf_download(
+            repo_id=model_name,
+            cache_dir=Constants.CACHE_DIR,
+            ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"],
+        )
         model = model_class.from_pretrained(model_hf_path, cache_dir=Constants.CACHE_DIR, use_cache=True)
     except Exception as e:
         print(f"Failed to download the {model_name} model from Huggingface:%s", e)
@@ -238,7 +241,11 @@ def convert_to_cloud_kvstyle(
     try:
         if hf_token:
             login(hf_token)
-        model_hf_path = hf_download(repo_id=model_name, cache_dir=Constants.CACHE_DIR)
+        model_hf_path = hf_download(
+            repo_id=model_name,
+            cache_dir=Constants.CACHE_DIR,
+            ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"],
+        )
         model = model_class.from_pretrained(model_hf_path, cache_dir=Constants.CACHE_DIR, use_cache=True)
     except Exception as e:
         print(f"Failed to download the {model_name} model from Huggingface:%s", e)
diff --git a/QEfficient/exporter/export_utils.py b/QEfficient/exporter/export_utils.py
index 775677d09..a2f3e9782 100644
--- a/QEfficient/exporter/export_utils.py
+++ 
b/QEfficient/exporter/export_utils.py @@ -329,12 +329,14 @@ def run_model_on_ort( past_value_mean = past_key_sum / num print(f"past_keys (mean) \t\t {past_key_mean}") print(f"past_value (mean) \t\t {past_value_mean}") + print("\n=============================================================\n") return input_names, ort_outputs except Exception as e: model = onnx.load(onnx_path, load_external_data=False) input_names = [x.name for x in model.graph.input] print(f"Failed to run the onnx {onnx_path} model in onnx runtime:%s", e) + print("\n=============================================================\n") return input_names, None @@ -373,17 +375,19 @@ def compile_kv_model_on_cloud_ai_100( custom_io_path: str, aic_enable_depth_first: bool, mos: int = -1, - device_group: List[int]=[0], + device_group: List[int] = [0], **kwargs, ) -> bool: import shutil - aic_binary_dir = os.path.join(base_path, "qpcs") + aic_binary_dir = os.path.join(base_path, "qpcs") if os.path.isdir(aic_binary_dir): shutil.rmtree(aic_binary_dir) - assert os.path.isfile(specializations_json), f"Please use 'from QEfficient.cloud.compile import main as compile', as {specializations_json} file was not found" + assert os.path.isfile( + specializations_json + ), f"Please use 'from QEfficient.cloud.compile import main as compile', as {specializations_json} file was not found" assert os.path.isfile(custom_io_path), f"{custom_io_path} file was not found!" command = [ "/opt/qti-aic/exec/qaic-exec", @@ -400,7 +404,7 @@ def compile_kv_model_on_cloud_ai_100( ] if mxfp6: command.append("-mxfp6-matmul") - if (mos>0): + if mos > 0: command.append(f"-mos={mos}") if aic_enable_depth_first: command.append("-aic-enable-depth-first") @@ -414,14 +418,14 @@ def compile_kv_model_on_cloud_ai_100( } ], } - mdp_ts_config_path = os.path.join(base_path, f"mdp_ts_config.json") + mdp_ts_config_path = os.path.join(base_path, "mdp_ts_config.json") with open(mdp_ts_config_path, "w") as file: json.dump(mdp_ts_config, file, indent=4) command.append(f"-mdp-load-partition-config={mdp_ts_config_path}") print("Running AI 100 compiler:", " ".join(command)) result = subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) - if result.returncode !=0: + if result.returncode != 0: raise RuntimeError("Compilation Failed!!, please check compilation arguments.") - print(f"\n=============== Compilation Done! ===============\n") + print("\n===================== Compilation Done! 
=====================\n") return result.returncode == 0, aic_binary_dir diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 142395235..32237a2b9 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -204,7 +204,7 @@ def latency_stats_kv( print("Total (E2E) inference time is=", round(total_time, 2)) return print() - print("*****************Performance Stats**********************") + print("===================== Performance Stats =====================") if batch_size > 1: print("Prefill time a.k.a TTFT (batch) is :", round(prefill_time, 2), "s") print("Decode (batch):", round(decode_perf * batch_size, 2), "tok/s") @@ -215,4 +215,4 @@ def latency_stats_kv( print("Decode:", round(decode_perf, 2), "tok/s") print("E2E:", round(total_perf, 2), "tok/s") print("Total (E2E) inference time is=", round(total_time, 2), "s") - print("********************************************************") + print("=============================================================") diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 833c4539a..1929b8d4a 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -6,17 +6,19 @@ # ----------------------------------------------------------------------------- import os -import requests -from typing import Optional +from typing import List, Optional -from requests.exceptions import HTTPError +import requests from huggingface_hub import snapshot_download +from requests.exceptions import HTTPError def hf_download( repo_id: Optional[str] = None, cache_dir: Optional[str] = None, hf_token: Optional[str] = None, + allow_patterns: Optional[List[str]] = None, + ignore_patterns: Optional[List[str]] = None, ): # Setup cache and local dir local_dir = None @@ -37,7 +39,8 @@ def hf_download( revision="main", resume_download=True, token=hf_token, - ignore_patterns=["*.txt", "*.msgpack", "*.h5", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"], + allow_patterns=allow_patterns, + ignore_patterns=ignore_patterns, ) break except requests.ReadTimeout as e: diff --git a/README.md b/README.md index 1017041d7..ed83b6182 100644 --- a/README.md +++ b/README.md @@ -99,11 +99,11 @@ In summary: | High Level APIs | Sample use | Arguments | |-----------------|------------|-------------------| -| QEfficient.cloud.infer | [click here](#qeff-python-infer-api-e2e) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • prompt : Optinoal [Default-"My name is"]
  • | -| QEfficient.cloud.execute | [click here](#qeff-python-execute-api) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • prompt : Optional [Default-"My name is"]
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • | +| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • prompt : Optional [Default-"My name is"]
  • | +| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexcute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • prompt : Optional [Default-"My name is"]
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • | -### 1. Use QEfficient.cloud.infer for +### 1. Use QEfficient.cloud.infer This is the single e2e python api in the library, which takes model_card name as input along with other compile args if necessary and does everything in one go. @@ -160,10 +160,10 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 | Low Level APIs | Sample use | Arguments | |-----------------|------------|-------------------| -| QEfficient.transform | [click here](#) |
  • model : $\color{green} {Mandatory}$
  • Type : Optional [Default- "Transformers"]
  • form_factor : Optional [Default-"cloud"]
  • | -| qualcomm_efficient_converter | [click here](#qeff-python-execute-api) |
  • mode_name : $\color{green} {Mandatory}$
  • model_kv : $\color{green} {Mandatory}$ [Optional when model_class passed]
  • model_class : $\color{green} {Mandatory}$ [Optional when model_kv passed]
  • tokenizer : Optional
  • onnx_path : Optional
  • hf_token : Optional
  • seq_length : Optional [Default-128]
  • input_str : Optional [Default-"My name is"]
  • kv : Optional [Default-True]
  • return_path : Optional [Default-False]
  • form_factor : Optional [Default-"cloud"]
  • save_fp32_onnx : Optional [Default-False]
  • save_fp16_onnx : Optional [Default-True]
  • *Both save_fp32_onnx and save_fp16_onnx can't be false*
  • | -| compile_kv_model_on_cloud_ai_100 | [click here](#3-compile-on-cloud-ai-100) |
  • onnx_path : $\color{green} {Mandatory}$
  • specializations_json : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • base_path : $\color{green} {Mandatory}$
  • mxfp6 : $\color{green} {Mandatory}$
  • custom_io_path : $\color{green} {Mandatory}$
  • device_group : Optional [Default -[0]]
  • | -|latency_stats_kv | [click here](#4print-benchmark) |
  • tokenizer : $\color{green} {Mandatory}$
  • qpc : $\color{green} {Mandatory}$
  • prompt : $\color{green} {Mandatory}$
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • | +| QEfficient.transform | [click here](#1-model-download-and-transform) |
  • model : $\color{green} {Mandatory}$
  • Type : Optional [Default-"Transformers"]
  • form_factor : Optional [Default-"cloud"]
  • | +| qualcomm_efficient_converter | [click here](#2-onnx-export-of-transformed-model) |
  • model_name : $\color{green} {Mandatory}$
  • model_kv : $\color{green} {Mandatory}$ [Optional when model_class passed]
  • model_class : $\color{green} {Mandatory}$ [Optional when model_kv passed]
  • tokenizer : Optional
  • onnx_path : Optional
  • hf_token : Optional
  • seq_length : Optional [Default-128]
  • input_str : Optional [Default-"My name is"]
  • kv : Optional [Default-$\color{green} {True}$]
  • return_path : Optional [Default-False]
  • form_factor : Optional [Default-"cloud"]
  • save_fp32_onnx : Optional [Default-False]
  • save_fp16_onnx : Optional [Default-True]
  • *Both save_fp32_onnx and save_fp16_onnx can't be false*
  • | +| compile | [click here](#3-compile-on-cloud-ai-100) |
  • onnx_path : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional [Default-True]
  • | +|latency_stats_kv | [click here](#4-run-benchmark) |
  • tokenizer : $\color{green} {Mandatory}$
  • qpc : $\color{green} {Mandatory}$
  • prompt : $\color{green} {Mandatory}$
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • |

 ### 1. Model download and transform

@@ -189,7 +189,7 @@ model_name = "gpt2"  # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.


-model_hf_path = hf_download(repo_id=model_name, cache_dir=Constants.CACHE_DIR)
+model_hf_path = hf_download(repo_id=model_name, cache_dir=Constants.CACHE_DIR, ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"])
 model_hf = GPT2LMHeadModel.from_pretrained(model_hf_path, use_cache=True)
 model_hf.eval()
 print(f"{model_name} from hugging-face \n", model_hf)
@@ -243,8 +243,8 @@ generated_qpc_path = compile(
     onnx_path=onnx_path,
     num_cores=14,
     qpc_path=base_path,
-    mxfp6=True,
     device_group=[0],
+    mxfp6=True,
 )
 ```
 ### 4. Run Benchmark
diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb
index 9883e52ad..537475dbf 100644
--- a/notebooks/QEfficientGPT2.ipynb
+++ b/notebooks/QEfficientGPT2.ipynb
@@ -25,10 +25,13 @@
   "source": [
    "# Initiate the Orignal Transformer model\n",
    "import os\n",
-   "from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel\n",
+   "\n",
    "from transformers import AutoTokenizer\n",
+   "from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel\n",
+   "\n",
    "from QEfficient.utils import hf_download\n",
    "from QEfficient.utils.constants import Constants\n",
+   "\n",
    "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n",
    "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n",
    "\n",
@@ -37,7 +40,11 @@
    "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n",
    "model_name = \"gpt2\"  # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n",
    "\n",
-   "model_hf_path = hf_download(repo_id=model_name, cache_dir=Constants.CACHE_DIR)\n",
+   "model_hf_path = hf_download(\n",
+   "    repo_id=model_name,\n",
+   "    cache_dir=Constants.CACHE_DIR,\n",
+   "    ignore_patterns=[\"*.txt\", \"*.onnx\", \"*.ot\", \"*.md\", \"*.tflite\", \"*.pdf\"],\n",
+   ")\n",
    "model_hf = GPT2LMHeadModel.from_pretrained(model_hf_path, use_cache=True)\n",
    "model_hf.eval()\n",
    "print(f\"{model_name} from hugging-face \\n\", model_hf)"
diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb
index f0968ff12..ecce36b23 100644
--- a/notebooks/QEfficientMPT.ipynb
+++ b/notebooks/QEfficientMPT.ipynb
@@ -25,8 +25,10 @@
   "source": [
    "# Initiate the Orignal Transformer model\n",
    "import os\n",
-   "from transformers.models.mpt.modeling_mpt import MptForCausalLM\n",
+   "\n",
    "from transformers import AutoTokenizer\n",
+   "from transformers.models.mpt.modeling_mpt import MptForCausalLM\n",
+   "\n",
    "from QEfficient.utils import hf_download\n",
    "from QEfficient.utils.constants import Constants\n",
    "\n",
@@ -37,7 +39,11 @@
    "\n",
    "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n",
    "model_name = \"mosaicml/mpt-7b\"  # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n",
-   "model_hf_path = hf_download(repo_id=model_name, cache_dir=Constants.CACHE_DIR)\n",
+   "model_hf_path = hf_download(\n",
+   "    repo_id=model_name,\n",
+   "    cache_dir=Constants.CACHE_DIR,\n",
+   "    ignore_patterns=[\"*.txt\", \"*.onnx\", \"*.ot\", \"*.md\", \"*.tflite\", \"*.pdf\"],\n",
+   ")\n",
    "model_hf = MptForCausalLM.from_pretrained(model_hf_path, use_cache=True)\n",
    "model_hf.eval()\n",
    "print(f\"{model_name} from 
hugging-face \\n\", model_hf)" diff --git a/tests/utils.py b/tests/utils.py index dd6edc407..8ff1f627d 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -14,7 +14,7 @@ from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100 from QEfficient.utils import hf_download -from QEfficient.utils.constants import Constants, QEFF_MODELS_DIR, ROOT_DIR +from QEfficient.utils.constants import QEFF_MODELS_DIR, ROOT_DIR, Constants from QEfficient.utils.device_utils import get_available_device_id from QEfficient.utils.run_utils import ApiRunner @@ -32,6 +32,7 @@ def prepare_work_dir(work_dir): # create empty temp dir os.makedirs(temp_dir) + def remove_temp_dir(work_dir): """ Function to remove the temp work directory location @@ -42,18 +43,20 @@ def remove_temp_dir(work_dir): if os.path.exists(temp_dir): shutil.rmtree(temp_dir) + def get_tokenizer(model_name): """ Function to get tokenizer info from transformers.AutoTokenizer :param model_name: str :return tokenizer """ - model_hf_path = hf_download(repo_id=model_name) + model_hf_path = hf_download(repo_id=model_name, allow_patterns=["*.json"]) tokenizer = transformers.AutoTokenizer.from_pretrained(model_hf_path, padding_side="left") if tokenizer.pad_token_id is None: tokenizer.pad_token_id = tokenizer.eos_token_id return tokenizer + def load_pytorch_model(model_name, model_class): """ Function to load model from huggingface and transform to KV model @@ -61,11 +64,14 @@ def load_pytorch_model(model_name, model_class): :param model_class: type :return model_hf """ - model_path = hf_download(repo_id=model_name) + model_path = hf_download( + repo_id=model_name, ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"] + ) model_hf = model_class.from_pretrained(model_path, use_cache=True) model_hf.eval() return model_hf + def transform_pt_model_with_qeff(model_hf): """ Function to take huggingface model and transform to KV model @@ -76,6 +82,7 @@ def transform_pt_model_with_qeff(model_hf): model_kv.eval() return model_kv + def export_onnx(model_kv, tokenizer, model_name, model_class): """ Function to export onnx model @@ -91,9 +98,11 @@ def export_onnx(model_kv, tokenizer, model_name, model_class): tokenizer=tokenizer, onnx_dir_path=onnx_dir_path, kv=True, - return_path=True) + return_path=True, + ) return base_path, onnx_model_path + def set_up(model_config): """ Set up function to set up the test environment for TestQEfficientModel class @@ -148,6 +157,7 @@ def set_up(model_config): setup_info["ort_tokens"] = ort_tokens return setup_info + def get_cloud_ai_100_tokens(setup_info): """ Test function to validate the llama model before and after KV changes on Cloud AI 100 @@ -168,6 +178,7 @@ def get_cloud_ai_100_tokens(setup_info): device_group=[0], ) from QEfficient.generation.cloud_infer import QAICInferenceSession + session = QAICInferenceSession(test_qpcs_path, device_id, enable_debug_logs=False) try: cloud_ai_100_tokens = setup_info["api_runner"].run_kv_model_on_cloud_ai_100(