
Commit

Update README.md to fix broken links and the hf_download function to support allow and ignore patterns while downloading model files from Hugging Face

Signed-off-by: vbaddi <[email protected]>
quic-mamta authored and anujgupt-github committed May 3, 2024
1 parent d342956 commit 31b1466
Showing 10 changed files with 92 additions and 51 deletions.
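In effect, `hf_download` now forwards optional `allow_patterns` and `ignore_patterns` lists to `huggingface_hub.snapshot_download`, so callers can control which files are fetched from the Hub. A minimal usage sketch mirroring the call sites in this diff (the repo id and cache directory below are illustrative, not taken from the repository defaults):

```python
from QEfficient.utils import hf_download

# Tokenizer/config only: fetch just the JSON files, as QEfficient/cloud/execute.py now does.
tokenizer_path = hf_download(
    repo_id="gpt2",
    cache_dir="cache_dir",
    allow_patterns=["*.json"],
)

# Full model download, skipping file types the pipeline never uses, as QEfficient/cloud/infer.py now does.
model_hf_path = hf_download(
    repo_id="gpt2",
    cache_dir="cache_dir",
    ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"],
)
```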
4 changes: 2 additions & 2 deletions QEfficient/cloud/execute.py
@@ -8,8 +8,8 @@
import argparse
from typing import List

from transformers import AutoTokenizer
from huggingface_hub import login
from transformers import AutoTokenizer

from QEfficient.generation.text_generation_inference import latency_stats_kv
from QEfficient.utils import hf_download
@@ -35,7 +35,7 @@ def main(
if hf_token is not None:
login(hf_token)
# Download tokenizer along with model if it doesn't exist
model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir)
model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json"])
tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True)

latency_stats_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=devices, prompt=prompt)
27 changes: 15 additions & 12 deletions QEfficient/cloud/infer.py
@@ -5,20 +5,20 @@
#
# -----------------------------------------------------------------------------

import os
import shutil
import argparse
import os
from typing import List

from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer

import QEfficient
from QEfficient.utils import hf_download
from QEfficient.cloud.compile import main as compile
from QEfficient.utils.constants import Constants, QEFF_MODELS_DIR
from QEfficient.utils.logging_utils import logger
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.generation.text_generation_inference import latency_stats_kv
from QEfficient.utils import hf_download
from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
from QEfficient.utils.logging_utils import logger

"""
1. Check if compiled qpc for given config already exists, if it does jump to execute, else
@@ -35,9 +35,7 @@ def qpc_exists(qpc_dir_path: str) -> bool:
:param qpc_dir_path: str. Path of qpc directory.
:return: bool.
"""
return (os.path.isdir(qpc_dir_path) and
os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin")))

return os.path.isdir(qpc_dir_path) and os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin"))


def onnx_exists(onnx_file_path: str) -> bool:
@@ -81,7 +79,11 @@ def main(
# Get tokenizer
if hf_token is not None:
login(hf_token)
model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir)
model_hf_path = hf_download(
repo_id=model_name,
cache_dir=cache_dir,
ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"],
)
tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True)

if qpc_exists(qpc_dir_path):
@@ -128,7 +130,7 @@ def main(
kv=True,
form_factor="cloud",
return_path=True,
tokenizer=tokenizer
tokenizer=tokenizer,
)
assert (
generated_onnx_path == onnx_model_path
@@ -194,7 +196,8 @@ def main(
help="Input prompt, if executing for batch size>1, pass input promprs in single string but seperate with pipe (|) symbol",
)
parser.add_argument(
"--aic_enable_depth_first", "--aic-enable-depth-first",
"--aic_enable_depth_first",
"--aic-enable-depth-first",
action="store_true",
help="If passed, this option will be enabled during compilation, disabled by default",
)
21 changes: 14 additions & 7 deletions QEfficient/exporter/export_hf_to_cloud_ai_100.py
@@ -6,18 +6,17 @@
# -----------------------------------------------------------------------------

import os
from typing import Tuple, Optional
import shutil
from typing import Optional, Tuple

import torch
from transformers import AutoTokenizer
from huggingface_hub import login
from transformers import AutoTokenizer

from QEfficient.exporter.export_utils import export_onnx, fix_onnx_fp16, generate_input_files, run_model_on_ort
from QEfficient.utils.constants import Constants
from QEfficient.utils import hf_download
from QEfficient.transformers.modeling_utils import transform
from QEfficient.utils.constants import QEFF_MODELS_DIR
from QEfficient.utils import hf_download
from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
from QEfficient.utils.logging_utils import logger


@@ -80,7 +79,11 @@ def convert_to_cloud_bertstyle(
try:
if hf_token:
login(hf_token)
model_hf_path = hf_download(repo_id=model_name, cache_dir=Constants.CACHE_DIR)
model_hf_path = hf_download(
repo_id=model_name,
cache_dir=Constants.CACHE_DIR,
ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"],
)
model = model_class.from_pretrained(model_hf_path, cache_dir=Constants.CACHE_DIR, use_cache=True)
except Exception as e:
print(f"Failed to download the {model_name} model from Huggingface:%s", e)
@@ -238,7 +241,11 @@ def convert_to_cloud_kvstyle(
try:
if hf_token:
login(hf_token)
model_hf_path = hf_download(repo_id=model_name, cache_dir=Constants.CACHE_DIR)
model_hf_path = hf_download(
repo_id=model_name,
cache_dir=Constants.CACHE_DIR,
ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"],
)
model = model_class.from_pretrained(model_hf_path, cache_dir=Constants.CACHE_DIR, use_cache=True)
except Exception as e:
print(f"Failed to download the {model_name} model from Huggingface:%s", e)
18 changes: 11 additions & 7 deletions QEfficient/exporter/export_utils.py
@@ -329,12 +329,14 @@ def run_model_on_ort(
past_value_mean = past_key_sum / num
print(f"past_keys (mean) \t\t {past_key_mean}")
print(f"past_value (mean) \t\t {past_value_mean}")
print("\n=============================================================\n")

return input_names, ort_outputs
except Exception as e:
model = onnx.load(onnx_path, load_external_data=False)
input_names = [x.name for x in model.graph.input]
print(f"Failed to run the onnx {onnx_path} model in onnx runtime:%s", e)
print("\n=============================================================\n")
return input_names, None


@@ -373,17 +375,19 @@ def compile_kv_model_on_cloud_ai_100(
custom_io_path: str,
aic_enable_depth_first: bool,
mos: int = -1,
device_group: List[int]=[0],
device_group: List[int] = [0],
**kwargs,
) -> bool:
import shutil

aic_binary_dir = os.path.join(base_path, "qpcs")
aic_binary_dir = os.path.join(base_path, "qpcs")

if os.path.isdir(aic_binary_dir):
shutil.rmtree(aic_binary_dir)

assert os.path.isfile(specializations_json), f"Please use 'from QEfficient.cloud.compile import main as compile', as {specializations_json} file was not found"
assert os.path.isfile(
specializations_json
), f"Please use 'from QEfficient.cloud.compile import main as compile', as {specializations_json} file was not found"
assert os.path.isfile(custom_io_path), f"{custom_io_path} file was not found!"
command = [
"/opt/qti-aic/exec/qaic-exec",
Expand All @@ -400,7 +404,7 @@ def compile_kv_model_on_cloud_ai_100(
]
if mxfp6:
command.append("-mxfp6-matmul")
if (mos>0):
if mos > 0:
command.append(f"-mos={mos}")
if aic_enable_depth_first:
command.append("-aic-enable-depth-first")
@@ -414,14 +418,14 @@
}
],
}
mdp_ts_config_path = os.path.join(base_path, f"mdp_ts_config.json")
mdp_ts_config_path = os.path.join(base_path, "mdp_ts_config.json")
with open(mdp_ts_config_path, "w") as file:
json.dump(mdp_ts_config, file, indent=4)
command.append(f"-mdp-load-partition-config={mdp_ts_config_path}")
print("Running AI 100 compiler:", " ".join(command))
result = subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
if result.returncode !=0:
if result.returncode != 0:
raise RuntimeError("Compilation Failed!!, please check compilation arguments.")

print(f"\n=============== Compilation Done! ===============\n")
print("\n===================== Compilation Done! =====================\n")
return result.returncode == 0, aic_binary_dir
4 changes: 2 additions & 2 deletions QEfficient/generation/text_generation_inference.py
@@ -204,7 +204,7 @@ def latency_stats_kv(
print("Total (E2E) inference time is=", round(total_time, 2))
return
print()
print("*****************Performance Stats**********************")
print("===================== Performance Stats =====================")
if batch_size > 1:
print("Prefill time a.k.a TTFT (batch) is :", round(prefill_time, 2), "s")
print("Decode (batch):", round(decode_perf * batch_size, 2), "tok/s")
@@ -215,4 +215,4 @@
print("Decode:", round(decode_perf, 2), "tok/s")
print("E2E:", round(total_perf, 2), "tok/s")
print("Total (E2E) inference time is=", round(total_time, 2), "s")
print("********************************************************")
print("=============================================================")
11 changes: 7 additions & 4 deletions QEfficient/utils/__init__.py
@@ -6,17 +6,19 @@
# -----------------------------------------------------------------------------

import os
import requests
from typing import Optional
from typing import List, Optional

from requests.exceptions import HTTPError
import requests
from huggingface_hub import snapshot_download
from requests.exceptions import HTTPError


def hf_download(
repo_id: Optional[str] = None,
cache_dir: Optional[str] = None,
hf_token: Optional[str] = None,
allow_patterns: Optional[List[str]] = None,
ignore_patterns: Optional[List[str]] = None,
):
# Setup cache and local dir
local_dir = None
@@ -37,7 +39,8 @@ def hf_download(
revision="main",
resume_download=True,
token=hf_token,
ignore_patterns=["*.txt", "*.msgpack", "*.h5", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"],
allow_patterns=allow_patterns,
ignore_patterns=ignore_patterns,
)
break
except requests.ReadTimeout as e:
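Because the hunk above is truncated, here is a condensed sketch of what the updated helper amounts to: a retry wrapper around `huggingface_hub.snapshot_download` that simply forwards the two new filter arguments. The function name, retry limit, and error handling below are illustrative simplifications, not a verbatim copy of the library code:

```python
from typing import List, Optional

import requests
from huggingface_hub import snapshot_download


def hf_download_sketch(
    repo_id: str,
    cache_dir: Optional[str] = None,
    hf_token: Optional[str] = None,
    allow_patterns: Optional[List[str]] = None,
    ignore_patterns: Optional[List[str]] = None,
    max_attempts: int = 5,  # illustrative; the real retry limit lives in the library
) -> str:
    """Retry snapshot_download on read timeouts, forwarding the file filters."""
    for attempt in range(1, max_attempts + 1):
        try:
            return snapshot_download(
                repo_id,
                cache_dir=cache_dir,
                revision="main",
                resume_download=True,
                token=hf_token,
                allow_patterns=allow_patterns,    # keep only files matching these globs
                ignore_patterns=ignore_patterns,  # skip files matching these globs
            )
        except requests.ReadTimeout as e:
            print(f"Read timeout on attempt {attempt}/{max_attempts}: {e}")
    raise RuntimeError(f"Failed to download {repo_id} after {max_attempts} attempts")
```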
18 changes: 9 additions & 9 deletions README.md
@@ -99,11 +99,11 @@ In summary:

| High Level APIs | Sample use | Arguments |
|-----------------|------------|-------------------|
| QEfficient.cloud.infer | [click here](#qeff-python-infer-api-e2e) | <li>model_name : $\color{green} {Mandatory}$</li> <li>num_cores : $\color{green} {Mandatory}$</li> <li>device_group : $\color{green} {Mandatory}$</li><li>batch_size : Optional [Default-1]</li> <li>prompt_len : Optional [Default-32]</li> <li>ctx_len : Optional [Default-128]</li><li>mxfp6 : Optional </li> <li>hf_token : Optional </li><li>cache_dir : Optional ["cache_dir" in current working directory]</li><li>prompt : Optinoal [Default-"My name is"]</li> |
| QEfficient.cloud.execute | [click here](#qeff-python-execute-api) | <li>model_name : $\color{green} {Mandatory}$</li> <li>device_group : $\color{green} {Mandatory}$</li><li>qpc_path : $\color{green} {Mandatory}$</li><li>prompt : Optional [Default-"My name is"]</li> <li>cache_dir : Optional ["cache_dir" in current working directory]</li><li>hf_token : Optional </li> |
| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) | <li>model_name : $\color{green} {Mandatory}$</li> <li>num_cores : $\color{green} {Mandatory}$</li> <li>device_group : $\color{green} {Mandatory}$</li><li>batch_size : Optional [Default-1]</li> <li>prompt_len : Optional [Default-32]</li> <li>ctx_len : Optional [Default-128]</li><li>mxfp6 : Optional </li> <li>hf_token : Optional </li><li>cache_dir : Optional ["cache_dir" in current working directory]</li><li>prompt : Optional [Default-"My name is"]</li> |
| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexcute) | <li>model_name : $\color{green} {Mandatory}$</li> <li>device_group : $\color{green} {Mandatory}$</li><li>qpc_path : $\color{green} {Mandatory}$</li><li>prompt : Optional [Default-"My name is"]</li> <li>cache_dir : Optional ["cache_dir" in current working directory]</li><li>hf_token : Optional </li> |


### 1. Use QEfficient.cloud.infer for
### 1. Use QEfficient.cloud.infer

This is the single end-to-end Python API in the library; it takes the model-card name as input, along with other compile arguments if needed, and does everything in one go.

Expand Down Expand Up @@ -160,10 +160,10 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3

| Low Level APIs | Sample use | Arguments |
|-----------------|------------|-------------------|
| QEfficient.transform | [click here](#) | <li>model : $\color{green} {Mandatory}$</li><li>Type : Optional [Default- "Transformers"]</li> <li>form_factor : Optional [Default-"cloud"]</li> |
| qualcomm_efficient_converter | [click here](#qeff-python-execute-api) | <li>mode_name : $\color{green} {Mandatory}$</li><li>model_kv : $\color{green} {Mandatory}$ [Optional when model_class passed] </li><li>model_class : $\color{green} {Mandatory}$ [Optional when model_kv passed]</li> <li>tokenizer : Optional</li><li>onnx_path : Optional </li><li>hf_token : Optional</li><li>seq_length : Optional [Default-128]</li><li>input_str : Optional [Default-"My name is"]</li><li>kv : Optional [Default-True]</li><li>return_path : Optional [Default-False]</li><li>form_factor : Optional [Default-"cloud"]</li><li>save_fp32_onnx : Optional [Default-False]</li><li>save_fp16_onnx : Optional [Default-True]</li> <li>*Both save_fp32_onnx and save_fp16_onnx can't be false*</li> |
| compile_kv_model_on_cloud_ai_100 | [click here](#3-compile-on-cloud-ai-100) | <li>onnx_path : $\color{green} {Mandatory}$</li><li>specializations_json : $\color{green} {Mandatory}$</li><li>num_cores : $\color{green} {Mandatory}$</li><li>base_path : $\color{green} {Mandatory}$</li> <li>mxfp6 : $\color{green} {Mandatory}$</li> <li>custom_io_path : $\color{green} {Mandatory}$</li> <li>device_group : Optional [Default -[0]]</li> |
|latency_stats_kv | [click here](#4print-benchmark) | <li>tokenizer : $\color{green} {Mandatory}$</li> <li>qpc : $\color{green} {Mandatory}$</li><li>prompt : $\color{green} {Mandatory}$</li><li>input_len : Optional [Default-None]</li> <li>generation_len : Optional [Default-None]</li> <li>device_id : Optional [Default-[0]]</li> <li>enable_debug_logs : Optional [Default-False]</li> <li>stream : Optional [Default-True]</li> <li>write_io_dir : Optional</li><li>automation : Optional [Default-False]</li> |
| QEfficient.transform | [click here](#1-model-download-and-transform) | <li>model : $\color{green} {Mandatory}$</li><li>Type : Optional [Default- "Transformers"]</li> <li>form_factor : Optional [Default-"cloud"]</li> |
| qualcomm_efficient_converter | [click here](#2-onnx-export-of-transformed-model) | <li>model_name : $\color{green} {Mandatory}$</li><li>model_kv : $\color{green} {Mandatory}$ [Optional when model_class passed] </li><li>model_class : $\color{green} {Mandatory}$ [Optional when model_kv passed]</li> <li>tokenizer : Optional</li><li>onnx_path : Optional </li><li>hf_token : Optional</li><li>seq_length : Optional [Default-128]</li><li>input_str : Optional [Default-"My name is"]</li><li>kv : Optional [Default-$\color{green} {True}$]</li><li>return_path : Optional [Default-False]</li><li>form_factor : Optional [Default-"cloud"]</li><li>save_fp32_onnx : Optional [Default-False]</li><li>save_fp16_onnx : Optional [Default-True]</li> <li>*Both save_fp32_onnx and save_fp16_onnx can't be false*</li> |
| compile | [click here](#3-compile-on-cloud-ai-100) | <li>onnx_path : $\color{green} {Mandatory}$</li><li>qpc_path : $\color{green} {Mandatory}$</li><li>num_cores : $\color{green} {Mandatory}$</li><li>device_group : $\color{green} {Mandatory}$</li> <li>batch_size : Optional [Default-1]</li> <li>prompt_len : Optional [Default-32]</li><li>ctx_len : Optional [Default-128]</li><li>mxfp6 : Optional [Default-True]</li>|
|latency_stats_kv | [click here](#4-run-benchmark) | <li>tokenizer : $\color{green} {Mandatory}$</li> <li>qpc : $\color{green} {Mandatory}$</li><li>prompt : $\color{green} {Mandatory}$</li><li>input_len : Optional [Default-None]</li> <li>generation_len : Optional [Default-None]</li> <li>device_id : Optional [Default-[0]]</li> <li>enable_debug_logs : Optional [Default-False]</li> <li>stream : Optional [Default-True]</li> <li>write_io_dir : Optional</li><li>automation : Optional [Default-False]</li> |


### 1. Model download and transform
@@ -189,7 +189,7 @@ model_name = "gpt2"

# Similarly, we can change the model name and generate the corresponding model, if we have added support in the lib.

model_hf_path = hf_download(repo_id=model_name, cache_dir=Constants.CACHE_DIR)
model_hf_path = hf_download(repo_id=model_name, cache_dir=Constants.CACHE_DIR, ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"])
model_hf = GPT2LMHeadModel.from_pretrained(model_hf_path, use_cache=True)
model_hf.eval()
print(f"{model_name} from hugging-face \n", model_hf)
@@ -243,8 +243,8 @@ generated_qpc_path = compile(
onnx_path=onnx_path,
num_cores=14,
qpc_path=base_path,
mxfp6=True,
device_group=[0],
mxfp6=True,
)
```
### 4. Run Benchmark
11 changes: 9 additions & 2 deletions notebooks/QEfficientGPT2.ipynb
@@ -25,10 +25,13 @@
"source": [
"# Initiate the Orignal Transformer model\n",
"import os\n",
"from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel\n",
"\n",
"from transformers import AutoTokenizer\n",
"from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel\n",
"\n",
"from QEfficient.utils import hf_download\n",
"from QEfficient.utils.constants import Constants\n",
"\n",
"# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n",
"# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n",
"\n",
@@ -37,7 +40,11 @@
"# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n",
"model_name = \"gpt2\" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n",
"\n",
"model_hf_path = hf_download(repo_id=model_name, cache_dir=Constants.CACHE_DIR)\n",
"model_hf_path = hf_download(\n",
" repo_id=model_name,\n",
" cache_dir=Constants.CACHE_DIR,\n",
" ignore_pattrens=[\"*.txt\", \"*.onnx\", \"*.ot\", \"*.md\", \"*.tflite\", \"*.pdf\"],\n",
")\n",
"model_hf = GPT2LMHeadModel.from_pretrained(model_hf_path, use_cache=True)\n",
"model_hf.eval()\n",
"print(f\"{model_name} from hugging-face \\n\", model_hf)"
10 changes: 8 additions & 2 deletions notebooks/QEfficientMPT.ipynb
@@ -25,8 +25,10 @@
"source": [
"# Initiate the Orignal Transformer model\n",
"import os\n",
"from transformers.models.mpt.modeling_mpt import MptForCausalLM\n",
"\n",
"from transformers import AutoTokenizer\n",
"from transformers.models.mpt.modeling_mpt import MptForCausalLM\n",
"\n",
"from QEfficient.utils import hf_download\n",
"from QEfficient.utils.constants import Constants\n",
"\n",
@@ -37,7 +39,11 @@
"\n",
"# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n",
"model_name = \"mosaicml/mpt-7b\" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n",
"model_hf_path = hf_download(repo_id=model_name, cache_dir=Constants.CACHE_DIR)\n",
"model_hf_path = hf_download(\n",
" repo_id=model_name,\n",
" cache_dir=Constants.CACHE_DIR,\n",
" ignore_pattrens=[\"*.txt\", \"*.onnx\", \"*.ot\", \"*.md\", \"*.tflite\", \"*.pdf\"],\n",
")\n",
"model_hf = MptForCausalLM.from_pretrained(model_hf_path, use_cache=True)\n",
"model_hf.eval()\n",
"print(f\"{model_name} from hugging-face \\n\", model_hf)"