From dee2d442db50c02850a839957751b07a32c9ffc4 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 28 May 2024 01:21:18 +0530 Subject: [PATCH 01/20] all changes Signed-off-by: Onkar Chougule --- QEfficient/__init__.py | 2 + QEfficient/loader/__init__.py | 8 +++ QEfficient/loader/loader.py | 33 +++++++++ QEfficient/loader/loader_factory.py | 107 ++++++++++++++++++++++++++++ QEfficient/utils/__init__.py | 99 ++----------------------- QEfficient/utils/generate_inputs.py | 32 +++++++++ QEfficient/utils/run_utils.py | 43 ++++++----- 7 files changed, 212 insertions(+), 112 deletions(-) create mode 100644 QEfficient/loader/__init__.py create mode 100644 QEfficient/loader/loader.py create mode 100644 QEfficient/loader/loader_factory.py diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index c4ccb4ef7..0d623eeee 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -6,6 +6,8 @@ # ----------------------------------------------------------------------------- import torch.nn as nn + +from QEfficient.loader import QEFFAutoModel # noqa: F401 from QEfficient.transformers.modeling_utils import transform as transform_hf diff --git a/QEfficient/loader/__init__.py b/QEfficient/loader/__init__.py new file mode 100644 index 000000000..a17f497b5 --- /dev/null +++ b/QEfficient/loader/__init__.py @@ -0,0 +1,8 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +from QEfficient.loader.loader import QEFFAutoModel # noqa: F401 diff --git a/QEfficient/loader/loader.py b/QEfficient/loader/loader.py new file mode 100644 index 000000000..950fcb946 --- /dev/null +++ b/QEfficient/loader/loader.py @@ -0,0 +1,33 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import os +from typing import Any + +from QEfficient.loader.loader_factory import ( + MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP, + QEFFBaseAutoModelFactory, + get_hf_model_type, +) +from QEfficient.utils.utils import login_and_download_hf_lm + + +class QEFFAutoModel: + def __init__(self, *args: Any, **kwds: Any) -> None: + raise EnvironmentError( + f"{self.__class__.__name__} is designed to be instantiated " + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)`") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> QEFFBaseAutoModelFactory: + pretrained_model_name_or_path = pretrained_model_name_or_path if os.path.isdir(pretrained_model_name_or_path) \ + else login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs) + model_type = get_hf_model_type(hf_model_path=pretrained_model_name_or_path) + qeff_auto_model_class = MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP[model_type] + assert issubclass(qeff_auto_model_class, QEFFBaseAutoModelFactory), f"Expected class that inherits {QEFFBaseAutoModelFactory}, got {type(qeff_auto_model_class)}" + + return qeff_auto_model_class.from_pretrained(pretrained_model_name_or_path) diff --git a/QEfficient/loader/loader_factory.py b/QEfficient/loader/loader_factory.py new file mode 100644 index 000000000..c5421fd25 --- /dev/null +++ b/QEfficient/loader/loader_factory.py @@ -0,0 +1,107 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- + +import os +from typing import Any +from abc import ABC, abstractmethod +from enum import Enum +from typing import Union + +from qtpy import API +import torch.nn as nn +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING + +from QEfficient.utils.run_utils import ApiRunner, run_hf_lm_model_with_pt +import QEfficient + +class QEFFBaseAutoModelFactory(ABC): + + def __init__(self) -> None: + super().__init__() + # Users can call generate or execute + self.generate = self.execute + + @abstractmethod + def from_pretrained(self, pretrained_model_name_or_path: str, *args, **kwargs): + pass + + @abstractmethod + def execute(self, *args, **kwargs) -> Any: + pass + + @abstractmethod + def transform(self, *args, **kwargs) -> Any: + pass + + @abstractmethod + def export(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") + + +class QEFFAutoModelForCausalLM(QEFFBaseAutoModelFactory): + def __init__(self, model: nn.Module, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], pretrained_model_name_or_path: str) -> None: + assert model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values(), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. 
{MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore + self.model = model + self.tokenizer = tokenizer + self.model_files_path = pretrained_model_name_or_path + self._model_executor = None + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): + model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + return cls(model=model, tokenizer=tokenizer, pretrained_model_name_or_path=pretrained_model_name_or_path) + + def _run_kv_lm_model_with_pt(self, prompt, prompt_len, ctx_len): + api_runner = ApiRunner(self.tokenizer, prompt=prompt, prompt_len=prompt_len, ctx_len=ctx_len) + return api_runner.run_kv_model_on_pytorch(self.model, ) + + def execute(self, prompt: str, prompt_len: int = None, ctx_len: int = None, max_gen_length: int = 128): # type: ignore + if getattr(self.model, "qeff_transformed", False): + output_ids = run_hf_lm_model_with_pt(self.model, self.tokenizer, prompt, max_gen_length) + else: + output_ids = self._run_kv_lm_model_with_pt(prompt, prompt_len, ctx_len) + return output_ids + + def transform(self): + QEfficient.transform(self.model) + return self + + def export(self): + pass + + +class QEFF_MODEL_TYPE(Enum): + LLM = "LLM" + STABLE_DIFFUSION = "STABLE_DIFFUSION" + AWQ = "AWQ" + + +MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP= { + QEFF_MODEL_TYPE.LLM: QEFFAutoModelForCausalLM +} + + +def get_hf_model_type(hf_model_path: str): + assert os.path.isdir(hf_model_path), "Pleae pass local dir path where the model is downloaded use `QEfficient.utils.login_and_download_hf_lm` for downloading hf model" + config, kwargs = AutoConfig.from_pretrained( + hf_model_path, + return_unused_kwargs=True, + ) + if config.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING: + # FIXME: Add logic to handle if quantization config is stored in separate quant_config.json outside of config, also create a separate function for this and below lines + quant_config = getattr(config, "quantization_config", getattr(config, "quant_config", None)) + if quant_config is not None: + if quant_config.get("quant_method", None) == "awq": + return QEFF_MODEL_TYPE.AWQ + else: + raise NotImplementedError(f"current model type is not yet supported {type(config)}") + else: + return QEFF_MODEL_TYPE.LLM + else: + raise NotImplementedError(f"model type {type(config)} is not yet supported") diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 486bae664..bed89942b 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -5,96 +5,9 @@ # # ----------------------------------------------------------------------------- -import os -from typing import List, Optional, Tuple, Union - -import requests -from huggingface_hub import snapshot_download -from requests.exceptions import HTTPError - -from QEfficient.utils.constants import QEFF_MODELS_DIR - - -def hf_download( - repo_id: Optional[str] = None, - cache_dir: Optional[str] = None, - hf_token: Optional[str] = None, - allow_patterns: Optional[List[str]] = None, - ignore_patterns: Optional[List[str]] = None, -): - # Setup cache and local dir - local_dir = None - if cache_dir is not None: - cache_dir = f"{cache_dir}" - local_dir = f"{cache_dir}/{repo_id}" - - os.makedirs(f"{cache_dir}/{repo_id}", exist_ok=True) - max_retries = 5 - retry_count = 0 - while retry_count < max_retries: - try: - model_path = snapshot_download( - repo_id, - cache_dir=cache_dir, - local_dir=local_dir, - 
local_dir_use_symlinks=True, - revision="main", - resume_download=True, - token=hf_token, - allow_patterns=allow_patterns, - ignore_patterns=ignore_patterns, - ) - break - except requests.ReadTimeout as e: - print(f"Read timeout: {e}") - retry_count += 1 - - except HTTPError as e: - retry_count = max_retries - if e.response.status_code == 401: - print("You need to pass a valid `--hf_token=...` to download private checkpoints.") - else: - raise e - - return model_path - - -def qpc_exists(model_name: str, qpc_base_dir_name: str) -> Union[Tuple[bool, str], None]: - """ - Checks if qpc files already exists, removes the directory if files have been manipulated. - --------- - :param model_name: str. HF Model card name. - :param dir_path: str. Path of qpc directory. - :return: Union[Tuple[bool, str]]: qpc_exists and path to qpc directory - """ - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - os.makedirs(model_card_dir, exist_ok=True) - - qpc_dir_path = os.path.join(model_card_dir, qpc_base_dir_name, "qpcs") - - # Compute the boolean indicating if the QPC exists - qpc_exists_bool = os.path.isdir(qpc_dir_path) and os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin")) - - return qpc_exists_bool, qpc_dir_path - - -def onnx_exists(model_name: str) -> Union[Tuple[bool, str, str], None]: - """ - Checks if qpc files already exists, removes the directory if files have been manipulated. - --------- - :param model_name: str. HF Model card name. - :return: Union[Tuple[bool, str, str]]: onnx_exists and path to onnx file and directory - """ - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - os.makedirs(model_card_dir, exist_ok=True) - - onnx_dir_path = os.path.join(model_card_dir, "onnx") - onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") - - # Compute the boolean indicating if the ONNX model exists - onnx_exists_bool = os.path.isfile(onnx_model_path) and os.path.isfile( - os.path.join(os.path.dirname(onnx_model_path), "custom_io_fp16.yaml") - ) - - # Return the boolean, onnx_dir_path, and onnx_model_path - return onnx_exists_bool, onnx_dir_path, onnx_model_path +from QEfficient.utils.utils import ( # noqa: F401 + hf_download, + login_and_download_hf_lm, + onnx_exists, + qpc_exists, +) diff --git a/QEfficient/utils/generate_inputs.py b/QEfficient/utils/generate_inputs.py index 4cf15024a..deb7bcf32 100644 --- a/QEfficient/utils/generate_inputs.py +++ b/QEfficient/utils/generate_inputs.py @@ -5,12 +5,44 @@ # # ----------------------------------------------------------------------------- +from abc import ABC, abstractmethod import numpy as np import torch from QEfficient.utils.logging_utils import logger +class AwesomeInputHandler(ABC): + + def __init__(self) -> None: + super().__init__() + self.counter = 0 + + def reset(self): + self.counter = 0 + + def prepare_inputs(self, prompt, n_layer, padding_shape): + if self.counter!=0: + logger.warning("Resetting Input Handler as prepare_inputs is called even though it's in the middle of generating outputs") + self.reset() + + self._prepare_inputs(prompt, n_layer, padding_shape) + self.counter+=1 + + def update_inputs(self, outputs): + self._update_inputs(outputs) + self.counter+=1 + + @abstractmethod + def _prepare_inputs(self, prompt, n_layer, padding_shape): + pass + + @abstractmethod + def _update_inputs(self, outputs): + pass + + + class InputHandler: def __init__(self, tokenizer, input_str, prompt_len, ctx_len): """ diff --git a/QEfficient/utils/run_utils.py 
b/QEfficient/utils/run_utils.py index c521bf3d1..fbfc2b968 100644 --- a/QEfficient/utils/run_utils.py +++ b/QEfficient/utils/run_utils.py @@ -9,9 +9,30 @@ import onnxruntime import torch -from .generate_inputs import InputHandler from QEfficient.utils.logging_utils import logger +from .generate_inputs import InputHandler + + +def run_hf_lm_model_with_pt(model_hf, tokenizer, prompt, gen_len): + input_ids = tokenizer.encode(prompt, return_tensors="pt") + + input_ids_len = len(input_ids[0]) + + with torch.no_grad(): + for _ in range(gen_len): + outputs = model_hf(input_ids) + logits = outputs.logits[:, -1, :] + predicted_token_id = torch.argmax(logits, dim=-1) + input_ids = torch.cat([input_ids, predicted_token_id.unsqueeze(1)], dim=-1) + + generated_ids = input_ids[0][input_ids_len:].detach().numpy() + generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True) + print("Original HF Model Outputs (Torch CPU): \n") + print("Prompt:", repr(prompt)) + print("Completion:", repr(generated_text)) + return generated_ids + class ApiRunner: """ @@ -32,7 +53,7 @@ def __init__(self, tokenizer, prompt, prompt_len, ctx_len): :param ctx_len: int """ if tokenizer.padding_side != "left": - logger.warning(f"Please use padding_side='left' while initializing the tokenizer") + logger.warning("Please use padding_side='left' while initializing the tokenizer") tokenizer.padding_side = "left" if tokenizer.pad_token_id is None: tokenizer.pad_token_id = tokenizer.eos_token_id @@ -50,24 +71,8 @@ def run_hf_model_on_pytorch(self, model_hf): :param model_hf: pytorch model :return generated_ids: numpy.ndarray - output tokens """ + return run_hf_lm_model_with_pt(model_hf, self.tokenizer, self.prompt[0], self.gen_len) - input_ids = self.tokenizer.encode(self.prompt[0], return_tensors="pt") - - input_ids_len = len(input_ids[0]) - - with torch.no_grad(): - for _ in range(self.gen_len): - outputs = model_hf(input_ids) - logits = outputs.logits[:, -1, :] - predicted_token_id = torch.argmax(logits, dim=-1) - input_ids = torch.cat([input_ids, predicted_token_id.unsqueeze(1)], dim=-1) - - generated_ids = input_ids[0][input_ids_len:].detach().numpy() - generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True) - print("Original HF Model Outputs (Torch CPU): \n") - print("Prompt:", repr(self.prompt)) - print("Completion:", repr(generated_text)) - return generated_ids def run_kv_model_on_pytorch(self, model, n_layer, padding_shape): """ From ca62618728ebebacfdb309e80bed8309a56ff5a5 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 28 May 2024 23:30:47 +0530 Subject: [PATCH 02/20] only loader changes Signed-off-by: Onkar Chougule --- QEfficient/__init__.py | 13 +- QEfficient/cloud/execute.py | 24 +- QEfficient/cloud/export.py | 30 +- QEfficient/cloud/infer.py | 132 +++---- .../exporter/export_hf_to_cloud_ai_100.py | 333 +++++++++--------- QEfficient/exporter/export_utils.py | 4 +- .../generation/text_generation_inference.py | 2 +- QEfficient/loader/loader_factory.py | 48 +-- QEfficient/utils/__init__.py | 1 + tests/utils.py | 8 +- 10 files changed, 288 insertions(+), 307 deletions(-) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 0d623eeee..d9d032f27 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -5,18 +5,21 @@ # # ----------------------------------------------------------------------------- -import torch.nn as nn +from typing import Any, Union from QEfficient.loader import QEFFAutoModel # noqa: F401 +from QEfficient.loader.loader_factory import 
AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFAutoModelForCausalLM
 from QEfficient.transformers.modeling_utils import transform as transform_hf
 
 
-def transform(model: nn.Module, type="Transformers", form_factor="cloud"):
+def transform(model: Union[QEFFAutoModelForCausalLM, Any], form_factor="cloud"):
     """Low level apis in library
     model : instance of nn.Module
     type : Transformers | Diffusers, default : Transformers
     """
-    if type == "Transformers":
-        return transform_hf(model, form_factor)
+    assert form_factor == "cloud", "Only form_factor='cloud' is supported as of now!"
+    if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.LLM:
+        transform_hf(model.model, form_factor)
+        return model
     else:
-        raise NotImplementedError
+        raise NotImplementedError(f"Received unsupported class of type {type(model)}")
diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py
index c1ec39abe..eee912494 100644
--- a/QEfficient/cloud/execute.py
+++ b/QEfficient/cloud/execute.py
@@ -6,17 +6,14 @@
 # -----------------------------------------------------------------------------
 
 import argparse
-from typing import List
-
-from huggingface_hub import login
-from transformers import AutoTokenizer
+from typing import List, Optional
 
 from QEfficient.generation.text_generation_inference import (
     check_batch_size_and_num_prompts,
     cloud_ai_100_exec_kv,
     get_compilation_batch_size,
 )
-from QEfficient.utils import hf_download
+from QEfficient.utils import load_hf_tokenizer
 from QEfficient.utils.constants import Constants
 
 
@@ -24,10 +21,10 @@ def main(
     model_name: str,
     qpc_path: str,
     device_group: List[int],
-    prompt: str = None,
-    prompts_txt_file_path: str = None,
-    cache_dir: str = Constants.CACHE_DIR,
-    hf_token: str = None,
+    prompt: Optional[str] = None,  # type: ignore
+    prompts_txt_file_path: Optional[str] = None,
+    cache_dir: Optional[str] = Constants.CACHE_DIR,
+    hf_token: Optional[str] = None,
 ):
     """
     APi() to run the Model on Cloud AI 100 Platform.
@@ -39,15 +36,10 @@ def main(
     :prompts_txt_file_path: str.
Path to txt file for multiple input prompts """ - if hf_token is not None: - login(hf_token) - - # Download tokenizer along with model if it doesn't exist - model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"]) - tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left") + tokenizer = load_hf_tokenizer(model_name, cache_dir, hf_token) batch_size = get_compilation_batch_size(qpc_path) - prompt = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) + prompt: List[str] = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) # Execute cloud_ai_100_exec_kv( diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index f86e245c0..2b7201c8e 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -7,13 +7,12 @@ import argparse import os - -from huggingface_hub import login -from transformers import AutoModelForCausalLM, AutoTokenizer +from typing import Optional import QEfficient from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.utils import hf_download, onnx_exists +from QEfficient.loader import QEFFAutoModel +from QEfficient.utils import load_hf_tokenizer, onnx_exists from QEfficient.utils.constants import Constants from QEfficient.utils.logging_utils import logger @@ -24,7 +23,7 @@ def main( model_name: str, cache_dir: str, - hf_token: str = None, + hf_token: Optional[str] = None, ) -> None: """ Api() for exporting to Onnx Model. @@ -38,32 +37,23 @@ def main( logger.warning(f"Generated Onnx files found {onnx_model_path}! Please use Infer/Compile Apis()") return - if hf_token is not None: - login(hf_token) - model_hf_path = hf_download( - repo_id=model_name, - cache_dir=cache_dir, - ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], - ) - tokenizer = AutoTokenizer.from_pretrained( - model_hf_path, use_cache=True, padding_side="left", trust_remote_code=True - ) - model = AutoModelForCausalLM.from_pretrained(model_hf_path, use_cache=True) + tokenizer = load_hf_tokenizer(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token) + qeff_model = QEFFAutoModel.from_pretrained(pretrained_model_name_or_path=model_name, cache_dir=cache_dir, hf_token=hf_token) # Easy and minimal api to update the model to QEff. 
- QEfficient.transform(model, type="Transformers", form_factor="cloud") - print(f"Model after Optimized transformations {model}") + QEfficient.transform(qeff_model, form_factor="cloud") + print(f"Model after Optimized transformations {qeff_model}") # Export to the Onnx print(f"Exporting to Pytorch {model_name} to Onnx") base_path, onnx_path = qualcomm_efficient_converter( - model_kv=model, + model_kv=qeff_model, model_name=model_name, tokenizer=tokenizer, kv=True, form_factor="cloud", return_path=True, - ) + ) # type: ignore print(f"Base Path is {base_path} and Onnx Model Path is : {onnx_path}") diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index f00a56883..326096573 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -9,9 +9,6 @@ import os from typing import List -from huggingface_hub import login -from transformers import AutoModelForCausalLM, AutoTokenizer - import QEfficient from QEfficient.cloud.compile import main as compile from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter @@ -19,7 +16,8 @@ check_batch_size_and_num_prompts, cloud_ai_100_exec_kv, ) -from QEfficient.utils import hf_download, onnx_exists, qpc_exists +from QEfficient.loader import QEFFAutoModel +from QEfficient.utils import load_hf_tokenizer, onnx_exists, qpc_exists from QEfficient.utils.constants import Constants from QEfficient.utils.logging_utils import logger @@ -56,35 +54,18 @@ def main( + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else "_fp16_mxint8" if mxint8 else "_fp16") ) - prompt = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) + prompt: List[str] = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) # Get tokenizer - if hf_token is not None: - login(hf_token) - model_hf_path = hf_download( - repo_id=model_name, - cache_dir=cache_dir, - ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], - ) - tokenizer = AutoTokenizer.from_pretrained( - model_hf_path, use_cache=True, padding_side="left", trust_remote_code=True - ) + tokenizer = load_hf_tokenizer(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token) qpc_path_exists, qpc_dir_path = qpc_exists(model_name, qpc_base_dir_name) + onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name) + if qpc_path_exists: # execute logger.info("Pre-compiled qpc found! Trying to execute with given prompt") - cloud_ai_100_exec_kv( - batch_size, - tokenizer=tokenizer, - qpc_path=qpc_dir_path, - device_id=device_group, - prompt=prompt, - ) - return - - onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name) - if onnx_path_exists: + elif onnx_path_exists: # Compile -> execute # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation generated_qpc_path = compile( @@ -103,61 +84,54 @@ def main( assert ( generated_qpc_path == qpc_dir_path ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" - cloud_ai_100_exec_kv( - batch_size, + else: + ############################################# + # hf model -> export -> compile -> execute + ############################################# + # Load hf model + qeff_model = QEFFAutoModel.from_pretrained(pretrained_model_name_or_path=model_name, cache_dir=cache_dir, hf_token=hf_token) + + # Easy and minimal api to update the model to QEff. 
+ qeff_opt_model = QEfficient.transform(qeff_model, form_factor="cloud") + logger.info(f"Model after Optimized transformations {qeff_opt_model}") + + # Export to the Onnx + logger.info(f"Exporting to Pytorch {model_name} to ONNX...") + # Need to split below function into two functions one which always takes QEFFAutoModel and other with same interface as below + base_path, generated_onnx_path = qualcomm_efficient_converter( + model_kv=qeff_opt_model, # type: ignore tokenizer=tokenizer, - qpc_path=qpc_dir_path, - device_id=device_group, - prompt=prompt, + onnx_dir_path=onnx_dir_path, + kv=True, + form_factor="cloud", + return_path=True, + ) # type: ignore + print(f"Generated Onnx_path {generated_onnx_path} and Onnx_model_path {onnx_model_path} and Onnx_dir_path is {onnx_dir_path}") + assert ( + generated_onnx_path == onnx_model_path + ), f"ONNX files were generated at an unusual location, expected {onnx_model_path}, got {generated_onnx_path}" + logger.info(f"Base Path is {base_path} and Onnx Model Path is : {generated_onnx_path}") + + # Compile + # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation + generated_qpc_path = compile( + onnx_path=onnx_model_path, + qpc_path=os.path.dirname(qpc_dir_path), + num_cores=num_cores, + batch_size=batch_size, + prompt_len=prompt_len, + ctx_len=ctx_len, + mxfp6=mxfp6, + mxint8=mxint8, + aic_enable_depth_first=aic_enable_depth_first, + mos=mos, + device_group=device_group, ) - return - - ############################################# - # hf model -> export -> compile -> execute - ############################################# - model_hf = AutoModelForCausalLM.from_pretrained(model_hf_path, use_cache=True) - # Easy and minimal api to update the model to QEff. - model_transformed = QEfficient.transform(model_hf, type="Transformers", form_factor="cloud") - logger.info(f"Model after Optimized transformations {model_transformed}") - - # Export to the Onnx - logger.info(f"Exporting to Pytorch {model_name} to ONNX...") - base_path, generated_onnx_path = qualcomm_efficient_converter( - model_kv=model_transformed, - onnx_dir_path=onnx_dir_path, - model_name=model_name, - kv=True, - form_factor="cloud", - return_path=True, - tokenizer=tokenizer, - ) - print( - f"Generated Onnx_path {generated_onnx_path} and Onnx_model_path {onnx_model_path} and Onnx_dir_path is {onnx_dir_path}" - ) - assert ( - generated_onnx_path == onnx_model_path - ), f"ONNX files were generated at an unusual location, expected {onnx_model_path}, got {generated_onnx_path}" - logger.info(f"Base Path is {base_path} and Onnx Model Path is : {generated_onnx_path}") - - # Compile - # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation - generated_qpc_path = compile( - onnx_path=onnx_model_path, - qpc_path=os.path.dirname(qpc_dir_path), - num_cores=num_cores, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, - mxfp6=mxfp6, - mxint8=mxint8, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - ) - assert ( - qpc_dir_path == generated_qpc_path - ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" - logger.info(f"Compiled qpc files can be found at : {generated_qpc_path}") + assert ( + qpc_dir_path == generated_qpc_path + ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" + logger.info(f"Compiled qpc files can be found at : 
{generated_qpc_path}") + # Execute cloud_ai_100_exec_kv( diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index d9a1e9f8a..062ff27b0 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -7,30 +7,34 @@ import os import shutil -from typing import Optional, Tuple +from typing import Optional, Tuple, Union import torch -from huggingface_hub import login -from transformers import AutoTokenizer +from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast +import QEfficient from QEfficient.exporter.export_utils import export_onnx, fix_onnx_fp16, generate_input_files, run_model_on_ort -from QEfficient.transformers.modeling_utils import transform -from QEfficient.utils import hf_download +from QEfficient.loader.loader import QEFFAutoModel +from QEfficient.loader.loader_factory import ( + AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, + QEFF_MODEL_TYPE, + QEFFAutoModelForCausalLM, + QEFFBaseAutoModelFactory, +) from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants from QEfficient.utils.logging_utils import logger +from QEfficient.utils.utils import load_hf_tokenizer def convert_to_cloud_bertstyle( model_name: str, - model_class: type = None, - tokenizer=None, - onnx_dir_path=None, - hf_token: str = None, - seq_len: int = Constants.seq_length, - input_str: str = Constants.input_str, - return_path: bool = False, - save_fp32_onnx: bool = False, - save_fp16_onnx: bool = True, + qeff_model: QEFFAutoModelForCausalLM, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + onnx_dir_path: str, + seq_len: int, + return_path: bool, + save_fp32_onnx: bool, + save_fp16_onnx: bool, ): """ Function to convert the model to Bertstyle approach. @@ -40,23 +44,14 @@ def convert_to_cloud_bertstyle( 3. KV is everytime computed for all the tokens until EOS/max_length Args: - model_name (str): The name of the model to be used. - model_class (type): The class of the model. tokenizer (HF AutoTokenizer): Tokenzier to prepare inputs. model_path (str, optional): The path where the model is stored. If None, the model is loaded from the default location. - hf_token (str): If hf_token passed, it will be used for authentication for gated. Default is None. seq_len (int, optional): The length of the sequence. Default is 128. - input_str (str): The input string to be processed. return_path (bool): If True, return the base path for models and exported onnx model path save_fp32_onnx (bool); If True, fp32 unclipped version of ONNX will be saved. Default is False. save_fp16_onnx (bool); If false, generation of fp32 clipped version of ONNX will be skipped. Default is True. 
""" - # todo (amitraj) Optimize the onnx export - if onnx_dir_path is None: - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - onnx_dir_path = os.path.join(model_card_dir, "onnx_bertstyle") - if os.path.exists(onnx_dir_path): logger.warning(f"Overriding {onnx_dir_path}") shutil.rmtree(onnx_dir_path) @@ -64,37 +59,29 @@ def convert_to_cloud_bertstyle( if not (save_fp32_onnx or save_fp16_onnx): raise AttributeError("save_fp32_onnx and save_fp16_onnx can't be false") - seq_len = Constants.seq_length - input_str = Constants.input_str - - # Load tokenizer - if tokenizer is None: - tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", trust_remote_code=True) - else: - if tokenizer.padding_side != "left": - logger.warning("Please use padding_side='left' while initializing the tokenizer") - tokenizer.padding_side = "left" + if tokenizer.padding_side != "left": + logger.warning("Please use padding_side='left' while initializing the tokenizer") + tokenizer.padding_side = "left" if tokenizer.pad_token_id is None: tokenizer.pad_token_id = tokenizer.eos_token_id - try: - if hf_token: - login(hf_token) - model_hf_path = hf_download( - repo_id=model_name, - cache_dir=Constants.CACHE_DIR, - ignore_pattrens=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"], - ) - model = model_class.from_pretrained(model_hf_path, cache_dir=Constants.CACHE_DIR, use_cache=True) - except Exception as e: - print(f"Failed to download the {model_name} model from Huggingface:%s", e) - model.eval() - # Decide path for saving exported ONNX files. + fp32_model_name, fp16_model_name = export_bertstyle_model_to_onnx(model_name, qeff_model.model, tokenizer, onnx_dir_path, seq_len, save_fp32_onnx, save_fp16_onnx) # type: ignore + + # return the model path for automation. + if return_path: + if save_fp16_onnx: + return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp16_model_name}.onnx") + else: + return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp32_model_name}.onnx") + + +def export_bertstyle_model_to_onnx(model_name, model, tokenizer, onnx_dir_path, seq_len, save_fp32_onnx, save_fp16_onnx): model_base_name = model_name.replace("/", "_") + "_bertstyle" os.makedirs(onnx_dir_path, exist_ok=True) + input_str = Constants.input_str # Preprocess inputs if seq_len > 0: if tokenizer.pad_token_id is None: @@ -173,29 +160,19 @@ def convert_to_cloud_bertstyle( inputs=inputs, input_list_file=input_list_file, ) - - # return the model path for automation. - if return_path: - if save_fp16_onnx: - return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp16_model_name}.onnx") - else: - return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp32_model_name}.onnx") - else: - return + + return fp32_model_name,fp16_model_name def convert_to_cloud_kvstyle( model_name: str, - model_class: type = None, - model_kv: torch.nn.Module = None, - tokenizer=None, - onnx_dir_path=None, - hf_token: str = None, - seq_len: int = Constants.seq_length, - input_str: str = Constants.input_str, - return_path: bool = False, - save_fp32_onnx: bool = False, - save_fp16_onnx: bool = True, + qeff_model: QEFFAutoModelForCausalLM, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + onnx_dir_path: str, + seq_len: int, + return_path: bool, + save_fp32_onnx: bool, + save_fp16_onnx: bool, ): """ Function Modeling changes for kv retention and export to Onnx. @@ -219,58 +196,46 @@ def convert_to_cloud_kvstyle( save_fp16_onnx (bool); If false, generation of fp32 clipped version of ONNX will be skipped. Default is True. 
""" - if onnx_dir_path is None: - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - onnx_dir_path = os.path.join(model_card_dir, "onnx") - if os.path.exists(onnx_dir_path): logger.warning(f"Overriding {onnx_dir_path}") shutil.rmtree(onnx_dir_path) if not (save_fp32_onnx or save_fp16_onnx): raise AttributeError("save_fp32_onnx and save_fp16_onnx can't be false") + - if model_class is None and model_kv is None: - raise AttributeError("model_class and model_kv both can't be None") + if tokenizer.padding_side != "left": + logger.warning("Please use padding_side='left' while initializing the tokenizer") + tokenizer.padding_side = "left" - if model_kv is not None: - if not getattr(model_kv, "qeff_transformed", False): - raise AttributeError( - "Model is not transformed, Please first use QEfficient.transform to tranform the model." - ) - model = model_kv - else: - try: - if hf_token: - login(hf_token) - model_hf_path = hf_download( - repo_id=model_name, - cache_dir=Constants.CACHE_DIR, - ignore_pattrens=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"], - ) - model = model_class.from_pretrained(model_hf_path, cache_dir=Constants.CACHE_DIR, use_cache=True) - except Exception as e: - print(f"Failed to download the {model_name} model from Huggingface:%s", e) - transform(model, form_factor="cloud") + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id + + assert qeff_model.is_transformed, f"please pass the {qeff_model.__class__.__name__} after transform API" # Decide path for saving exported ONNX files. - model_base_name = model_name.replace("/", "_") + "_kv" - os.makedirs(onnx_dir_path, exist_ok=True) + fp32_model_name, fp16_model_name = export_kvstyle_transformed_model_to_onnx(model_name, qeff_model.model, tokenizer, onnx_dir_path, seq_len, save_fp32_onnx, save_fp16_onnx) # type: ignore - # Load tokenizer - if tokenizer is None: - # todo(ochougul): use cache dir from snapshot download - tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left") - else: - if tokenizer.padding_side != "left": - logger.warning("Please use padding_side='left' while initializing the tokenizer") - tokenizer.padding_side = "left" + # return the model path for automation. 
+    if return_path:
+        if save_fp16_onnx:
+            return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp16_model_name}.onnx")
+        else:
+            return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp32_model_name}.onnx")
 
+
+def export_kvstyle_transformed_model_to_onnx(model_name: str, transformed_model: torch.nn.Module, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+                                             onnx_dir_path: str, seq_len: int, save_fp32_onnx: Optional[bool] = False, save_fp16_onnx: Optional[bool] = True):
+
+    assert isinstance(transformed_model, QEFFBaseAutoModelFactory), f"Expected model_kv to be of type {QEFFBaseAutoModelFactory} but got {transformed_model.__class__.__name__}"
+    if tokenizer.padding_side != "left":
+        logger.warning("Please use padding_side='left' while initializing the tokenizer")
+        tokenizer.padding_side = "left"
+
+    tokenizer.pad_token_id = tokenizer.eos_token_id if tokenizer.pad_token_id is None else tokenizer.pad_token_id
 
     # Disabling requires_grad on all parameters
-    for j, p in enumerate(model.parameters()):
+    for j, p in enumerate(transformed_model.parameters()):
         p.requires_grad_(False)
 
     # Preprocess inputs
@@ -297,10 +262,10 @@ def convert_to_cloud_kvstyle(
     inputs = tokenizer(input_str, return_tensors="pt")
     try:
-        pt_outputs = model(**inputs)
+        pt_outputs = transformed_model(**inputs)
         output_names = list(pt_outputs.keys())
     except Exception as e:
-        print(f"Model {model_name} Execution failed in pytorch:%s", e)
+        print(f"Model {transformed_model.__class__.__name__} Execution failed in pytorch:%s", e)
 
     # Raise error if expected outputs are not present
     assert "logits" in output_names, "logits not found in output"
@@ -319,10 +284,10 @@ def convert_to_cloud_kvstyle(
     # Run PyTorch inference with past
     try:
-        pt_outputs = model(**inputs)
+        pt_outputs = transformed_model(**inputs)
         output_names = list(pt_outputs.keys())
     except Exception as e:
-        print(f"Model {model_name} Execution failed in pytorch:%s", e)
+        print(f"Model {transformed_model.__class__.__name__} Execution failed in pytorch:%s", e)
 
     # Add pkv into output_names
     pkv = tuple([(key.detach(), value.detach()) for key, value in pt_outputs.past_key_values])
@@ -337,9 +302,12 @@ def convert_to_cloud_kvstyle(
         pt_outputs[f"past_key.{i}_RetainedState"] = key
         pt_outputs[f"past_value.{i}_RetainedState"] = value
 
+
+    model_base_name = model_name.replace("/", "_") + "_kv"
+    os.makedirs(onnx_dir_path, exist_ok=True)
     # Export and simplify ONNX model
     fp32_model_name = export_onnx(
-        pt_model=model,
+        pt_model=transformed_model,
         inputs=inputs,
         output_names=output_names,
         gen_models_path=onnx_dir_path,
@@ -398,39 +366,95 @@ def convert_to_cloud_kvstyle(
         inputs=inputs,
         input_list_file=input_list_file,
     )
+
+    return fp32_model_name, fp16_model_name
+
+
+def export_for_edge() -> None:
+    # [TODO]: Apply the class transformation to make changes for the KV models in edge use cases
+    # model = QEfficient.transform(model_hf, type="Transformers", form_factor="edge")
+    # model.eval()
+    raise NotImplementedError("Oops...reached too far!!")
+
+
+def export_for_cloud(model_name: str, qeff_model: QEFFBaseAutoModelFactory,
+                     tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+                     onnx_dir_path: str, seq_length: int = Constants.seq_length,
+                     return_path: bool = True,
+                     save_fp32_onnx: bool = False,
+                     save_fp16_onnx: bool = True):
+    if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.LLM:  # type: ignore
+        return
export_lm_model_for_cloud(model_name=model_name, + qeff_model=qeff_model, # type: ignore + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + seq_length=seq_length, + return_path=return_path, + save_fp16_onnx=save_fp16_onnx, + save_fp32_onnx=save_fp32_onnx) + else: + raise NotImplementedError(f"Only model type {QEFFAutoModelForCausalLM.__class__.__name__} is supported for export, got {type(qeff_model)}") + + +def export_lm_model_for_cloud(model_name:str, qeff_model: QEFFAutoModelForCausalLM, + tokenizer:Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + onnx_dir_path: str, seq_length: int, return_path:bool, + save_fp32_onnx:bool, save_fp16_onnx: bool): + if os.path.exists(onnx_dir_path): + logger.warning(f"Overriding {onnx_dir_path}") + shutil.rmtree(onnx_dir_path) + if not (save_fp32_onnx or save_fp16_onnx): + raise AttributeError("save_fp32_onnx and save_fp16_onnx can't be false") + + if tokenizer.padding_side != "left": + logger.warning("Please use padding_side='left' while initializing the tokenizer") + tokenizer.padding_side = "left" + + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id + + + if qeff_model.is_transformed: + fp32_model_name, fp16_model_name = export_bertstyle_model_to_onnx( + model_name=model_name, + model=qeff_model.model, + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + seq_len=seq_length, + save_fp32_onnx=save_fp32_onnx, + save_fp16_onnx=save_fp16_onnx) # type: ignore + else: + fp32_model_name, fp16_model_name = export_kvstyle_transformed_model_to_onnx( + model_name=model_name, + transformed_model=qeff_model.model, + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + seq_len=seq_length, + save_fp32_onnx=save_fp32_onnx, + save_fp16_onnx=save_fp16_onnx) # type: ignore + # return the model path for automation. if return_path: if save_fp16_onnx: return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp16_model_name}.onnx") else: return onnx_dir_path, os.path.join(onnx_dir_path, f"{fp32_model_name}.onnx") - else: - return - - -def convert_to_edge(self) -> None: - # [TODO]: Apply the class transformation to make changes for the KV models in edge use cases - # model = QEfficient.transform(model_hf, type="Transformers", form_factor="edge") - # model.eval() - raise NotImplementedError("Oops...reached too far!!") def qualcomm_efficient_converter( model_name: str, - model_class: type = None, - model_kv: torch.nn.Module = None, - tokenizer=None, - onnx_dir_path=None, - hf_token: str = "", + model_kv: Optional[QEFFBaseAutoModelFactory] = None, # type: ignore + tokenizer: Optional[Union[PreTrainedTokenizer, PreTrainedTokenizerFast]]=None, + onnx_dir_path: Optional[str]=None, + hf_token: Optional[str] = None, seq_length: int = Constants.seq_length, - input_str: str = Constants.input_str, kv: bool = True, - return_path: bool = False, - form_factor="cloud", + return_path: bool = True, + form_factor: str="cloud", save_fp32_onnx: bool = False, save_fp16_onnx: bool = True, -) -> Optional[Tuple[str, str]]: +) -> Union[Tuple[str, str], None]: """ Function to convert the input string using the specified model and returns the result. @@ -442,7 +466,6 @@ def qualcomm_efficient_converter( onnx_dir_path (str, optional): The path where the model is stored. If None, the model is loaded from the default location. token (bool): If True, an authentication token will be used. Default is False. seq_len (int, optional): The length of the sequence. Default is 128. - input_str (str): The input string to be processed. 
kv (bool): If True, key-value pairs will be used. Default is True. return_path (bool): If True, return the base path for models and exported onnx model path save_fp32_onnx (bool); If True, fp32 unclipped version of ONNX will be saved. Default is False. @@ -452,36 +475,32 @@ def qualcomm_efficient_converter( None, if automation is False, else path to exported Onnx file """ - if model_kv is not None and not kv: - raise AttributeError("For Transformed model kv must be True") + # Get model_kv first + model_kv = model_kv if model_kv else QEFFAutoModel.from_pretrained(pretrained_model_name_or_path=model_name, hf_token=hf_token) + # Transform if required + if model_kv.is_transformed and not kv: + raise AttributeError("Transformed model is passed while requsting to convert non-transformed model") + + model_kv: QEFFBaseAutoModelFactory = QEfficient.transform(model_kv) if kv else model_kv + + + if onnx_dir_path is None: + model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) + onnx_dir_path = os.path.join(model_card_dir, "onnx") + + # Load tokenizer if not passed + tokenizer = load_hf_tokenizer(model_name=model_name, hf_token=hf_token) if tokenizer is None else tokenizer + if form_factor == "cloud": - if kv: - return convert_to_cloud_kvstyle( - model_name=model_name, - model_class=model_class, - model_kv=model_kv, - onnx_dir_path=onnx_dir_path, - tokenizer=tokenizer, - hf_token=hf_token, - seq_len=seq_length, - input_str=input_str, - return_path=return_path, - save_fp32_onnx=save_fp32_onnx, - save_fp16_onnx=save_fp16_onnx, - ) - else: - return convert_to_cloud_bertstyle( - model_name=model_name, - model_class=model_class, - tokenizer=tokenizer, - onnx_dir_path=onnx_dir_path, - hf_token=hf_token, - seq_len=seq_length, - input_str=input_str, - return_path=return_path, - save_fp32_onnx=save_fp32_onnx, - save_fp16_onnx=save_fp16_onnx, - ) + return export_for_cloud( + model_name=model_name, + qeff_model=model_kv, + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + seq_length=seq_length, + return_path=return_path, + save_fp16_onnx=save_fp16_onnx, + save_fp32_onnx=save_fp32_onnx) else: - return convert_to_edge() + return export_for_edge() diff --git a/QEfficient/exporter/export_utils.py b/QEfficient/exporter/export_utils.py index 5654ac582..8ce7f6b26 100644 --- a/QEfficient/exporter/export_utils.py +++ b/QEfficient/exporter/export_utils.py @@ -83,8 +83,8 @@ def export_onnx( custom_opsets={"com.qti.aisw.onnx": 1}, ) except Exception as e: - error("Exporting to ONNX failed. {}".format(e)) - return + raise RuntimeError("Exporting to ONNX failed. 
{}".format(e)) + onnx.checker.check_model(f"{gen_models_path}_tmp/{model_base_name}.onnx") loaded_model = onnx.load(f"{gen_models_path}_tmp/{model_base_name}.onnx") diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index d5d626faa..141a545e1 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -107,7 +107,7 @@ def get_compilation_batch_size(qpc_path: str): return compilation_batch_size -def check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size): +def check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) -> List[str]: assert ( prompt is not None or prompts_txt_file_path is not None ), "Please pass atleast one argument either using --prompt or --prompts_txt_file_path" diff --git a/QEfficient/loader/loader_factory.py b/QEfficient/loader/loader_factory.py index c5421fd25..49b8382c6 100644 --- a/QEfficient/loader/loader_factory.py +++ b/QEfficient/loader/loader_factory.py @@ -6,19 +6,17 @@ # ---------------------------------------------------------------------------- import os -from typing import Any from abc import ABC, abstractmethod from enum import Enum -from typing import Union +from typing import Any, Dict, Union -from qtpy import API import torch.nn as nn -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast +from transformers import AutoConfig, AutoModelForCausalLM from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING -from QEfficient.utils.run_utils import ApiRunner, run_hf_lm_model_with_pt import QEfficient + class QEFFBaseAutoModelFactory(ABC): def __init__(self) -> None: @@ -29,6 +27,10 @@ def __init__(self) -> None: @abstractmethod def from_pretrained(self, pretrained_model_name_or_path: str, *args, **kwargs): pass + + @property + def is_transformed(self) -> bool: + raise NotImplementedError("Must implement for child classes") @abstractmethod def execute(self, *args, **kwargs) -> Any: @@ -40,40 +42,36 @@ def transform(self, *args, **kwargs) -> Any: @abstractmethod def export(self, *args, **kwargs) -> Any: - raise NotImplementedError("Reached too far!!") + pass class QEFFAutoModelForCausalLM(QEFFBaseAutoModelFactory): - def __init__(self, model: nn.Module, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], pretrained_model_name_or_path: str) -> None: + def __init__(self, model: nn.Module, pretrained_model_name_or_path: str) -> None: assert model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values(), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. 
{MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore - self.model = model - self.tokenizer = tokenizer + self.model: nn.Module = model self.model_files_path = pretrained_model_name_or_path - self._model_executor = None + + @property + def is_transformed(self) -> bool: + return getattr(self.model, "qeff_transformed", False) @classmethod def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) - return cls(model=model, tokenizer=tokenizer, pretrained_model_name_or_path=pretrained_model_name_or_path) + return cls(model=model, pretrained_model_name_or_path=pretrained_model_name_or_path) - def _run_kv_lm_model_with_pt(self, prompt, prompt_len, ctx_len): - api_runner = ApiRunner(self.tokenizer, prompt=prompt, prompt_len=prompt_len, ctx_len=ctx_len) - return api_runner.run_kv_model_on_pytorch(self.model, ) - - def execute(self, prompt: str, prompt_len: int = None, ctx_len: int = None, max_gen_length: int = 128): # type: ignore - if getattr(self.model, "qeff_transformed", False): - output_ids = run_hf_lm_model_with_pt(self.model, self.tokenizer, prompt, max_gen_length) - else: - output_ids = self._run_kv_lm_model_with_pt(prompt, prompt_len, ctx_len) - return output_ids + def execute(self, *args, **kwargs): # type: ignore + raise NotImplementedError("Reached too far!!") def transform(self): QEfficient.transform(self.model) return self def export(self): - pass + raise NotImplementedError("Reached too far!!") + + def __repr__(self) -> None: + print(self.model) class QEFF_MODEL_TYPE(Enum): @@ -82,10 +80,12 @@ class QEFF_MODEL_TYPE(Enum): AWQ = "AWQ" -MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP= { +MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP = { QEFF_MODEL_TYPE.LLM: QEFFAutoModelForCausalLM } +AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP = {v:k for k,v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items()} + def get_hf_model_type(hf_model_path: str): assert os.path.isdir(hf_model_path), "Pleae pass local dir path where the model is downloaded use `QEfficient.utils.login_and_download_hf_lm` for downloading hf model" diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index bed89942b..4e06598c9 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -7,6 +7,7 @@ from QEfficient.utils.utils import ( # noqa: F401 hf_download, + load_hf_tokenizer, login_and_download_hf_lm, onnx_exists, qpc_exists, diff --git a/tests/utils.py b/tests/utils.py index f8fd7566e..a26d84826 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -15,6 +15,9 @@ import QEfficient from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100 +from QEfficient.loader.loader_factory import QEFFAutoModelForCausalLM +import QEfficient.transformers +import QEfficient.transformers.modeling_utils from QEfficient.utils import hf_download from QEfficient.utils.constants import QEFF_MODELS_DIR, ROOT_DIR, Constants from QEfficient.utils.device_utils import get_available_device_id, is_multi_qranium_setup_available, is_qpc_size_gt_32gb @@ -98,7 +101,7 @@ def transform_pt_model_with_qeff(model_hf): :param model_hf: pytorch model :return model_kv """ - model_kv = QEfficient.transform(model_hf, type="Transformers", form_factor="cloud") + model_kv = QEfficient.transformers.modeling_utils.transform(model_hf, form_factor="cloud") model_kv.eval() 
return model_kv @@ -113,8 +116,7 @@ def export_onnx(model_kv, tokenizer, model_name, model_class): onnx_dir_path = os.path.join(QEFF_MODELS_DIR, model_name) base_path, onnx_model_path = qualcomm_efficient_converter( model_name=model_name, - model_class=model_class, - model_kv=model_kv, + model_kv=QEFFAutoModelForCausalLM(model=model_kv, pretrained_model_name_or_path=None), # type: ignore tokenizer=tokenizer, onnx_dir_path=onnx_dir_path, kv=True, From 55f6182f1575666813aeed6c692c7f416f111110 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 28 May 2024 23:37:01 +0530 Subject: [PATCH 03/20] removed unused imports Signed-off-by: Onkar Chougule --- QEfficient/loader/loader_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/loader/loader_factory.py b/QEfficient/loader/loader_factory.py index 49b8382c6..cd2104ba3 100644 --- a/QEfficient/loader/loader_factory.py +++ b/QEfficient/loader/loader_factory.py @@ -8,7 +8,7 @@ import os from abc import ABC, abstractmethod from enum import Enum -from typing import Any, Dict, Union +from typing import Any import torch.nn as nn from transformers import AutoConfig, AutoModelForCausalLM From 8ff782ae46c7675103941f7da21945774fffe8aa Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 28 May 2024 23:54:02 +0530 Subject: [PATCH 04/20] allowed to initialize QEFFAUtoLMModel Signed-off-by: Onkar Chougule --- QEfficient/loader/loader_factory.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/QEfficient/loader/loader_factory.py b/QEfficient/loader/loader_factory.py index cd2104ba3..72e28f912 100644 --- a/QEfficient/loader/loader_factory.py +++ b/QEfficient/loader/loader_factory.py @@ -15,6 +15,7 @@ from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING import QEfficient +from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict class QEFFBaseAutoModelFactory(ABC): @@ -47,7 +48,8 @@ def export(self, *args, **kwargs) -> Any: class QEFFAutoModelForCausalLM(QEFFBaseAutoModelFactory): def __init__(self, model: nn.Module, pretrained_model_name_or_path: str) -> None: - assert model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values(), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. {MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore + assert (model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values() or + model.__class__ in TransformersToQEffModulesDict.values()), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. 
{MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore self.model: nn.Module = model self.model_files_path = pretrained_model_name_or_path From cc7aa257f0fd3761d77705b80cf7b65446d0899e Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Wed, 29 May 2024 00:18:07 +0530 Subject: [PATCH 05/20] fixed tests bugs Signed-off-by: Onkar Chougule --- .../exporter/export_hf_to_cloud_ai_100.py | 15 ++++++++------- QEfficient/loader/loader_factory.py | 4 ++-- tests/utils.py | 19 +++++++++---------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index 062ff27b0..4805f12d7 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -227,7 +227,6 @@ def convert_to_cloud_kvstyle( def export_kvstyle_transformed_model_to_onnx(model_name: str, transformed_model: torch.nn.Module, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], onnx_dir_path: str, seq_len: int, save_fp32_onnx: Optional[bool] = False, save_fp16_onnx: Optional[bool] = True): - assert isinstance(transformed_model, QEFFBaseAutoModelFactory), f"Expected model_kv to be of type {QEFFBaseAutoModelFactory} but got {transformed_model.__class__.__name__}" if tokenizer.padding_side != "left": logger.warning("Please use padding_side='left' while initializing the tokenizer") tokenizer.padding_side = "left" @@ -416,23 +415,25 @@ def export_lm_model_for_cloud(model_name:str, qeff_model: QEFFAutoModelForCausal if qeff_model.is_transformed: - fp32_model_name, fp16_model_name = export_bertstyle_model_to_onnx( + fp32_model_name, fp16_model_name = export_kvstyle_transformed_model_to_onnx( model_name=model_name, - model=qeff_model.model, - tokenizer=tokenizer, + transformed_model=qeff_model.model, + tokenizer=tokenizer, onnx_dir_path=onnx_dir_path, seq_len=seq_length, save_fp32_onnx=save_fp32_onnx, save_fp16_onnx=save_fp16_onnx) # type: ignore + else: - fp32_model_name, fp16_model_name = export_kvstyle_transformed_model_to_onnx( + fp32_model_name, fp16_model_name = export_bertstyle_model_to_onnx( model_name=model_name, - transformed_model=qeff_model.model, - tokenizer=tokenizer, + model=qeff_model.model, + tokenizer=tokenizer, onnx_dir_path=onnx_dir_path, seq_len=seq_length, save_fp32_onnx=save_fp32_onnx, save_fp16_onnx=save_fp16_onnx) # type: ignore + # return the model path for automation. 
if return_path: diff --git a/QEfficient/loader/loader_factory.py b/QEfficient/loader/loader_factory.py index 72e28f912..2b13e59be 100644 --- a/QEfficient/loader/loader_factory.py +++ b/QEfficient/loader/loader_factory.py @@ -72,8 +72,8 @@ def transform(self): def export(self): raise NotImplementedError("Reached too far!!") - def __repr__(self) -> None: - print(self.model) + def __repr__(self) -> str: + return self.model.__repr__() class QEFF_MODEL_TYPE(Enum): diff --git a/tests/utils.py b/tests/utils.py index a26d84826..0760e3613 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -13,11 +13,11 @@ import transformers import QEfficient +import QEfficient.transformers +import QEfficient.transformers.modeling_utils from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100 from QEfficient.loader.loader_factory import QEFFAutoModelForCausalLM -import QEfficient.transformers -import QEfficient.transformers.modeling_utils from QEfficient.utils import hf_download from QEfficient.utils.constants import QEFF_MODELS_DIR, ROOT_DIR, Constants from QEfficient.utils.device_utils import get_available_device_id, is_multi_qranium_setup_available, is_qpc_size_gt_32gb @@ -161,14 +161,13 @@ def set_up(model_config, device_group=[0]): model_config["model_name"], model_config["model_class"], ) - try: - ort_tokens = api_runner.run_kv_model_on_ort( - onnx_model_path, - model_config["n_layer"], - model_config["padding_shape"], - ) - except Exception as e: - print(f"ONNX Model run on onnxrt failed due to : {e}") + + ort_tokens = api_runner.run_kv_model_on_ort( + onnx_model_path, + model_config["n_layer"], + model_config["padding_shape"], + ) + setup_info = {} setup_info["model_config"] = model_config From 59fcc74665d20fd83a25fccbc8f72dffe61a6014 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Wed, 29 May 2024 11:55:31 +0530 Subject: [PATCH 06/20] renamed utils.py to _utils.py Signed-off-by: Onkar Chougule --- .../exporter/export_hf_to_cloud_ai_100.py | 2 +- QEfficient/loader/loader.py | 2 +- QEfficient/utils/__init__.py | 2 +- QEfficient/utils/_utils.py | 124 ++++++++++++++++++ 4 files changed, 127 insertions(+), 3 deletions(-) create mode 100644 QEfficient/utils/_utils.py diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index 4805f12d7..c2568a735 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -23,7 +23,7 @@ ) from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants from QEfficient.utils.logging_utils import logger -from QEfficient.utils.utils import load_hf_tokenizer +from QEfficient.utils._utils import load_hf_tokenizer def convert_to_cloud_bertstyle( diff --git a/QEfficient/loader/loader.py b/QEfficient/loader/loader.py index 950fcb946..185434dae 100644 --- a/QEfficient/loader/loader.py +++ b/QEfficient/loader/loader.py @@ -13,7 +13,7 @@ QEFFBaseAutoModelFactory, get_hf_model_type, ) -from QEfficient.utils.utils import login_and_download_hf_lm +from QEfficient.utils._utils import login_and_download_hf_lm class QEFFAutoModel: diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 4e06598c9..7a3cd4959 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from QEfficient.utils.utils import ( # noqa: F401 +from 
QEfficient.utils._utils import ( # noqa: F401 hf_download, load_hf_tokenizer, login_and_download_hf_lm, diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py new file mode 100644 index 000000000..80f1f0c46 --- /dev/null +++ b/QEfficient/utils/_utils.py @@ -0,0 +1,124 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import os +from typing import List, Optional, Tuple, Union + +import requests +from huggingface_hub import login, snapshot_download +from requests.exceptions import HTTPError +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast + +from QEfficient.utils.constants import QEFF_MODELS_DIR + + +def login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs): + hf_token = kwargs.pop("hf_token", None) + cache_dir = kwargs.pop("cache_dir", None) + if hf_token is not None: + login(hf_token) + pretrained_model_name_or_path = hf_download( + repo_id=pretrained_model_name_or_path, + cache_dir=cache_dir, + ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], + ) + return pretrained_model_name_or_path + + +def hf_download( + repo_id: Optional[str] = None, + cache_dir: Optional[str] = None, + hf_token: Optional[str] = None, + allow_patterns: Optional[List[str]] = None, + ignore_patterns: Optional[List[str]] = None, +): + # Setup cache and local dir + local_dir = None + if cache_dir is not None: + cache_dir = f"{cache_dir}" + local_dir = f"{cache_dir}/{repo_id}" + + os.makedirs(f"{cache_dir}/{repo_id}", exist_ok=True) + max_retries = 5 + retry_count = 0 + while retry_count < max_retries: + try: + model_path = snapshot_download( + repo_id, + cache_dir=cache_dir, + local_dir=local_dir, + local_dir_use_symlinks=True, + revision="main", + resume_download=True, + token=hf_token, + allow_patterns=allow_patterns, + ignore_patterns=ignore_patterns, + ) + break + except requests.ReadTimeout as e: + print(f"Read timeout: {e}") + retry_count += 1 + + except HTTPError as e: + retry_count = max_retries + if e.response.status_code == 401: + print("You need to pass a valid `--hf_token=...` to download private checkpoints.") + else: + raise e + + return model_path + + +def qpc_exists(model_name: str, qpc_base_dir_name: str) -> Tuple[bool, str]: + """ + Checks if qpc files already exists, removes the directory if files have been manipulated. + --------- + :param model_name: str. HF Model card name. + :param dir_path: str. Path of qpc directory. + :return: Union[Tuple[bool, str]]: qpc_exists and path to qpc directory + """ + model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) + os.makedirs(model_card_dir, exist_ok=True) + + qpc_dir_path = os.path.join(model_card_dir, qpc_base_dir_name, "qpcs") + + # Compute the boolean indicating if the QPC exists + qpc_exists_bool = os.path.isdir(qpc_dir_path) and os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin")) + + return qpc_exists_bool, qpc_dir_path + + +def onnx_exists(model_name: str) -> Tuple[bool, str, str]: + """ + Checks if qpc files already exists, removes the directory if files have been manipulated. + --------- + :param model_name: str. HF Model card name. 
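A short usage sketch for the download helper introduced above; the repo id and cache directory are placeholders, and only keyword arguments defined in this file are used:

from QEfficient.utils import hf_download

# Retries up to 5 times on read timeouts; heavy artifacts can be skipped via ignore_patterns.
model_path = hf_download(
    repo_id="gpt2",                                   # example checkpoint
    cache_dir="./hf_cache",                           # placeholder directory
    ignore_patterns=["*.onnx", "*.h5", "*.msgpack"],
)
print(model_path)
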
+ :return: Union[Tuple[bool, str, str]]: onnx_exists and path to onnx file and directory + """ + model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) + os.makedirs(model_card_dir, exist_ok=True) + + onnx_dir_path = os.path.join(model_card_dir, "onnx") + onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") + + # Compute the boolean indicating if the ONNX model exists + onnx_exists_bool = os.path.isfile(onnx_model_path) and os.path.isfile( + os.path.join(os.path.dirname(onnx_model_path), "custom_io_fp16.yaml") + ) + + # Return the boolean, onnx_dir_path, and onnx_model_path + return onnx_exists_bool, onnx_dir_path, onnx_model_path + + +def load_hf_tokenizer(model_name: str, cache_dir: Optional[str] = None, hf_token: Optional[str] = None) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: + if hf_token is not None: + login(hf_token) + + # Download tokenizer along with model if it doesn't exist + model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"]) + tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left") + return tokenizer From 836102f915da46fe389a58df6867d36cc83b5408 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Wed, 29 May 2024 13:49:07 +0530 Subject: [PATCH 07/20] added more type hinting and docstrings Signed-off-by: Onkar Chougule --- QEfficient/__init__.py | 20 +--- QEfficient/cloud/export.py | 2 +- QEfficient/cloud/infer.py | 2 +- .../exporter/export_hf_to_cloud_ai_100.py | 12 +- QEfficient/loader/loader.py | 17 ++- QEfficient/loader/loader_factory.py | 38 +++--- QEfficient/transformers/modeling_utils.py | 110 ++++++++++-------- QEfficient/utils/generate_inputs.py | 34 +----- QEfficient/utils/run_utils.py | 38 +++--- tests/utils.py | 2 +- 10 files changed, 131 insertions(+), 144 deletions(-) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index d9d032f27..fb4c517f7 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -5,21 +5,9 @@ # # ----------------------------------------------------------------------------- -from typing import Any, Union - +from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.loader import QEFFAutoModel # noqa: F401 -from QEfficient.loader.loader_factory import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFAutoModelForCausalLM -from QEfficient.transformers.modeling_utils import transform as transform_hf - +from QEfficient.transformers.modeling_utils import transform # noqa: F401 -def transform(model: Union[QEFFAutoModelForCausalLM, Any], form_factor="cloud"): - """Low level apis in library - model : instance of nn.Module - type : Transformers | Diffusers, default : Transformers - """ - assert form_factor == "cloud", "Only form_factor='cloud' is supported as of now!" 
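For reference, the cache-lookup helpers defined in _utils.py above are expected to be used roughly as follows; the model card name and the qpc directory name are placeholders:

from QEfficient.utils._utils import load_hf_tokenizer, onnx_exists, qpc_exists

model_name = "gpt2"                                               # example checkpoint
onnx_found, onnx_dir_path, onnx_model_path = onnx_exists(model_name)
qpc_found, qpc_dir_path = qpc_exists(model_name, qpc_base_dir_name="qpc_16cores_1bs")  # placeholder name
tokenizer = load_hf_tokenizer(model_name)

# Callers can skip re-export / re-compile when the artifacts are already on disk.
print(onnx_found, qpc_found)
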
- if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.LLM: - transform_hf(model.model, form_factor) - return model - else: - raise NotImplementedError(f"Recieved unsupported class of type {type(model)}") +# Users can use QEfficient.export for exporting models to ONNX +export = qualcomm_efficient_converter diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index 2b7201c8e..4ac2f6a05 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -9,7 +9,7 @@ import os from typing import Optional -import QEfficient +import QEfficient.transformers.modeling_utils from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.loader import QEFFAutoModel from QEfficient.utils import load_hf_tokenizer, onnx_exists diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 326096573..14b61465b 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -9,7 +9,7 @@ import os from typing import List -import QEfficient +import QEfficient.transformers.modeling_utils from QEfficient.cloud.compile import main as compile from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.generation.text_generation_inference import ( diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index c2568a735..e8b9a23ac 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -7,19 +7,19 @@ import os import shutil -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Type, Union import torch from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast -import QEfficient +import QEfficient.transformers.modeling_utils from QEfficient.exporter.export_utils import export_onnx, fix_onnx_fp16, generate_input_files, run_model_on_ort from QEfficient.loader.loader import QEFFAutoModel from QEfficient.loader.loader_factory import ( AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFAutoModelForCausalLM, - QEFFBaseAutoModelFactory, + QEFFBaseModel, ) from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants from QEfficient.utils.logging_utils import logger @@ -376,7 +376,7 @@ def export_for_edge() -> None: raise NotImplementedError("Oops...reached too far!!") -def export_for_cloud(model_name: str, qeff_model: QEFFBaseAutoModelFactory, +def export_for_cloud(model_name: str, qeff_model: QEFFBaseModel, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], onnx_dir_path: str, seq_length: int = Constants.seq_length, return_path: bool = True, @@ -445,7 +445,7 @@ def export_lm_model_for_cloud(model_name:str, qeff_model: QEFFAutoModelForCausal def qualcomm_efficient_converter( model_name: str, - model_kv: Optional[QEFFBaseAutoModelFactory] = None, # type: ignore + model_kv: Optional[Type[QEFFBaseModel]] = None, # type: ignore tokenizer: Optional[Union[PreTrainedTokenizer, PreTrainedTokenizerFast]]=None, onnx_dir_path: Optional[str]=None, hf_token: Optional[str] = None, @@ -483,7 +483,7 @@ def qualcomm_efficient_converter( if model_kv.is_transformed and not kv: raise AttributeError("Transformed model is passed while requsting to convert non-transformed model") - model_kv: QEFFBaseAutoModelFactory = QEfficient.transform(model_kv) if kv else model_kv + model_kv: Type[QEFFBaseModel] = QEfficient.transform(model_kv) if kv else model_kv if onnx_dir_path is None: diff --git 
a/QEfficient/loader/loader.py b/QEfficient/loader/loader.py index 185434dae..94ae672be 100644 --- a/QEfficient/loader/loader.py +++ b/QEfficient/loader/loader.py @@ -6,28 +6,35 @@ # ----------------------------------------------------------------------------- import os -from typing import Any +from typing import Any, Type from QEfficient.loader.loader_factory import ( MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP, - QEFFBaseAutoModelFactory, + QEFFBaseModel, get_hf_model_type, ) from QEfficient.utils._utils import login_and_download_hf_lm class QEFFAutoModel: + """ + Provides HuggingFace model loading interface same as transformers APIs. + Supports loading any model on HuggingFace. + """ def __init__(self, *args: Any, **kwds: Any) -> None: raise EnvironmentError( f"{self.__class__.__name__} is designed to be instantiated " f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)`") @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> QEFFBaseAutoModelFactory: + def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> Type[QEFFBaseModel]: + """ + Downloads HuggingFace model if already doesn't exist locally, returns QEffAutoModel object based on type of model. + """ pretrained_model_name_or_path = pretrained_model_name_or_path if os.path.isdir(pretrained_model_name_or_path) \ else login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs) model_type = get_hf_model_type(hf_model_path=pretrained_model_name_or_path) qeff_auto_model_class = MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP[model_type] - assert issubclass(qeff_auto_model_class, QEFFBaseAutoModelFactory), f"Expected class that inherits {QEFFBaseAutoModelFactory}, got {type(qeff_auto_model_class)}" + assert issubclass(qeff_auto_model_class, QEFFBaseModel), f"Expected class that inherits {QEFFBaseModel}, got {type(qeff_auto_model_class)}" - return qeff_auto_model_class.from_pretrained(pretrained_model_name_or_path) + return qeff_auto_model_class.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path) diff --git a/QEfficient/loader/loader_factory.py b/QEfficient/loader/loader_factory.py index 2b13e59be..a803b22fe 100644 --- a/QEfficient/loader/loader_factory.py +++ b/QEfficient/loader/loader_factory.py @@ -8,26 +8,31 @@ import os from abc import ABC, abstractmethod from enum import Enum -from typing import Any +from typing import Any, Dict, Type import torch.nn as nn from transformers import AutoConfig, AutoModelForCausalLM from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING -import QEfficient +import QEfficient.transformers.modeling_utils from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict -class QEFFBaseAutoModelFactory(ABC): - +class QEFFBaseModel(ABC): + """ + This class acts as parent class for all the varieties of model class (i.e. LLMs, SD, quantized etc.). + Enforces certain methods to be implemented by child classes. + + All the child classes must provide way to load, transform(optimize), exoprt to ONNX etc. capabilities. 
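A minimal sketch of the loader entry point documented above (gpt2 is just an example checkpoint):

from QEfficient import QEFFAutoModel

# Downloads the checkpoint if needed and returns the matching QEff wrapper,
# e.g. QEFFAutoModelForCausalLM for causal-LM checkpoints.
qeff_model = QEFFAutoModel.from_pretrained("gpt2")
print(type(qeff_model).__name__)
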
+ """ def __init__(self) -> None: super().__init__() # Users can call generate or execute self.generate = self.execute - @abstractmethod - def from_pretrained(self, pretrained_model_name_or_path: str, *args, **kwargs): - pass + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): + raise NotImplementedError("Must implement for child classes") @property def is_transformed(self) -> bool: @@ -46,7 +51,10 @@ def export(self, *args, **kwargs) -> Any: pass -class QEFFAutoModelForCausalLM(QEFFBaseAutoModelFactory): +class QEFFAutoModelForCausalLM(QEFFBaseModel): + """ + QEFF class for manipulating any causal language model from HuggingFace hub. + """ def __init__(self, model: nn.Module, pretrained_model_name_or_path: str) -> None: assert (model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values() or model.__class__ in TransformersToQEffModulesDict.values()), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. {MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore @@ -66,7 +74,7 @@ def execute(self, *args, **kwargs): # type: ignore raise NotImplementedError("Reached too far!!") def transform(self): - QEfficient.transform(self.model) + QEfficient.transformers.modeling_utils.transform_lm(self.model) return self def export(self): @@ -82,19 +90,23 @@ class QEFF_MODEL_TYPE(Enum): AWQ = "AWQ" -MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP = { +MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP: Dict[QEFF_MODEL_TYPE, Type[QEFFBaseModel]] = { QEFF_MODEL_TYPE.LLM: QEFFAutoModelForCausalLM } -AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP = {v:k for k,v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items()} +AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP: Dict[Type[QEFFBaseModel], QEFF_MODEL_TYPE] = {v:k for k,v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items()} -def get_hf_model_type(hf_model_path: str): - assert os.path.isdir(hf_model_path), "Pleae pass local dir path where the model is downloaded use `QEfficient.utils.login_and_download_hf_lm` for downloading hf model" +def get_hf_model_type(hf_model_path: str) -> QEFF_MODEL_TYPE: + """ + Loads model config file and returns the type of the model (i.e. LLMs, SD, quantized etc.) as supported by the library. 
+ """ + assert os.path.isdir(hf_model_path), "Pleae pass local dir path where the model is downloaded; use `QEfficient.utils.login_and_download_hf_lm` for downloading hf model" config, kwargs = AutoConfig.from_pretrained( hf_model_path, return_unused_kwargs=True, ) + if config.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING: # FIXME: Add logic to handle if quantization config is stored in separate quant_config.json outside of config, also create a separate function for this and below lines quant_config = getattr(config, "quantization_config", getattr(config, "quant_config", None)) diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index 5ad29ef3d..9253ae54c 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -7,6 +7,7 @@ import hashlib from collections import namedtuple +from typing import Dict, Type import torch.nn as nn import transformers @@ -34,17 +35,22 @@ ) from transformers.models.mixtral.modeling_mixtral import ( MixtralAttention, + MixtralBLockSparseTop2MLP, + MixtralDecoderLayer, MixtralForCausalLM, MixtralModel, - MixtralDecoderLayer, - MixtralSparseMoeBlock, - MixtralBLockSparseTop2MLP, - MixtralRotaryEmbedding, MixtralRMSNorm, + MixtralRotaryEmbedding, + MixtralSparseMoeBlock, ) from transformers.models.mpt.modeling_mpt import MptAttention, MptBlock, MptForCausalLM, MptModel from QEfficient.customop import CustomRMSNormAIC +from QEfficient.loader.loader_factory import ( + AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, + QEFF_MODEL_TYPE, + QEFFBaseModel, +) from QEfficient.utils.logging_utils import logger from .modeling_attn_mask_utils import ( @@ -81,13 +87,13 @@ QEffMistralRotaryEmbedding, ) from .models.mixtral_moe.modeling_mixtral import ( - QEffMixtralModel, - QEffMixtralRotaryEmbedding, QEffMixtralAttention, - QEffMixtralForCausalLM, + QEffMixtralBLockSparseTop2MLP, QEffMixtralDecoderLayer, + QEffMixtralForCausalLM, + QEffMixtralModel, + QEffMixtralRotaryEmbedding, QEffMixtralSparseMoeBlock, - QEffMixtralBLockSparseTop2MLP, ) from .models.mpt.modeling_mpt import QEffMptAttention, QEffMptBlock, QEffMptForCausalLM, QEFfMptModel @@ -108,7 +114,7 @@ # Define a transformers layers to QEff layers dictionary # While onboarding new models make sure to add the new layer maps to this dictionary. -TransformersToQEffModulesDict = { +TransformersToQEffModulesDict: Dict[Type[nn.Module], Type[nn.Module]] = { # GPT model layers GPT2Model: QEffGPT2Model, GPT2Block: QEffGPT2Block, @@ -179,13 +185,12 @@ def replace_module_with_qeff_layers(model: nn.Module) -> None: replace_module_with_qeff_layers(module) -def transform(model: nn.Module, form_factor: str = "cloud") -> nn.Module: +def transform_lm(model: nn.Module) -> nn.Module: """ - Replaces some Transformers' methods for equivalent methods optimized for AI 100. + Replaces some Transformers torch.nn.Module layers for equivalent optimized modules for cloud AI 100. --------- Args: param model (torch.nn.Module): PyTorch model. - form_factor(str): form factor configuration for optmizing the model, available options=["cloud", "edge"]. Returns: torch.nn.Module: PyTorch Module with replaced QEff layers. 
@@ -196,38 +201,49 @@ def transform(model: nn.Module, form_factor: str = "cloud") -> nn.Module: print("Model is already transformed") return model - - if form_factor == "cloud": - # Get Hash of all params for checking later - prior_params_hash = get_params_hash(model) - logger.warning(f"The model {model.__class__} layers has been upadted to QEff layers in-place") - # Replace with QEff layers - replace_module_with_qeff_layers(model) - - # Check with new params hash - later_params_hash = get_params_hash(model) - assert ( - prior_params_hash == later_params_hash - ), "Weights were changed in the transform process, please report an issue" - - # Replace the modeling output classes - transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions = ( - QEffBaseModelOutputWithPastAndCrossAttentions - ) - transformers.modeling_outputs.CausalLMOutputWithCrossAttentions = QEffCausalLMOutputWithCrossAttentions - transformers.modeling_outputs.BaseModelOutputWithPast = QEffBaseModelOutputWithPast - transformers.modeling_outputs.CausalLMOutputWithPast = QEffCausalLMOutputWithPast - transformers.modeling_outputs.MoeCausalLMOutputWithPast = QEffMoeCausalLMOutputWithPast - transformers.modeling_outputs.MoeModelOutputWithPast = QEffMoeModelOutputWithPast - - # Replace the modeling attn util classes and functions - transformers.modeling_attn_mask_utils.AttentionMaskConverter = QEffAttentionMaskConverter - transformers.modeling_attn_mask_utils._prepare_4d_attention_mask = _qeff_prepare_4d_attention_mask - transformers.modeling_attn_mask_utils._prepare_4d_causal_attention_mask = _qeff_prepare_4d_causal_attention_mask - - setattr(model,'qeff_transformed',True) - return model.eval() - - elif form_factor == "edge": - # Add changes for the edge usecase - raise NotImplementedError("We currently only support cloud form factor!") + # Get Hash of all params for checking later + prior_params_hash = get_params_hash(model) + logger.warning(f"The model {model.__class__} layers has been upadted to QEff layers in-place") + # Replace with QEff layers + replace_module_with_qeff_layers(model) + + # Check with new params hash + later_params_hash = get_params_hash(model) + assert ( + prior_params_hash == later_params_hash + ), "Weights were changed in the transform process, please report an issue" + + # Replace the modeling output classes + transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions = ( + QEffBaseModelOutputWithPastAndCrossAttentions + ) + transformers.modeling_outputs.CausalLMOutputWithCrossAttentions = QEffCausalLMOutputWithCrossAttentions + transformers.modeling_outputs.BaseModelOutputWithPast = QEffBaseModelOutputWithPast + transformers.modeling_outputs.CausalLMOutputWithPast = QEffCausalLMOutputWithPast + transformers.modeling_outputs.MoeCausalLMOutputWithPast = QEffMoeCausalLMOutputWithPast + transformers.modeling_outputs.MoeModelOutputWithPast = QEffMoeModelOutputWithPast + + # Replace the modeling attn util classes and functions + transformers.modeling_attn_mask_utils.AttentionMaskConverter = QEffAttentionMaskConverter + transformers.modeling_attn_mask_utils._prepare_4d_attention_mask = _qeff_prepare_4d_attention_mask + transformers.modeling_attn_mask_utils._prepare_4d_causal_attention_mask = _qeff_prepare_4d_causal_attention_mask + + setattr(model,'qeff_transformed',True) + return model.eval() + + + +def transform(model: Type[QEFFBaseModel], form_factor="cloud"): + """ + This function serves for optimizing any kind of model (i.e. LLM, SD, AWQ etc.) for cloud AI 100. 
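The renamed module-level helper can also be driven directly on a raw HuggingFace module, which is how the test utilities use it; a minimal sketch, assuming a small example checkpoint:

from transformers import AutoModelForCausalLM

from QEfficient.transformers.modeling_utils import transform_lm

model_hf = AutoModelForCausalLM.from_pretrained("gpt2")   # example checkpoint
model_kv = transform_lm(model_hf)                         # in-place class swap; weights untouched
assert getattr(model_kv, "qeff_transformed", False)
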
+ Will replace the torch.nn.Module layers of passed QEffModel with optimized implementation of the same. + + model: object of any instance of class that is child of `QEFFBaseAutoModelFactory` + form_factor(str): form factor configuration for optmizing the model, available options=["cloud", "edge"]. + """ + assert form_factor == "cloud", "Only form_factor='cloud' is supported as of now!" + if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.LLM: + transform_lm(model.model, form_factor) # type: ignore + return model + else: + raise NotImplementedError(f"Recieved unsupported class of type {type(model)}") diff --git a/QEfficient/utils/generate_inputs.py b/QEfficient/utils/generate_inputs.py index deb7bcf32..9818f6123 100644 --- a/QEfficient/utils/generate_inputs.py +++ b/QEfficient/utils/generate_inputs.py @@ -5,44 +5,12 @@ # # ----------------------------------------------------------------------------- -from abc import ABC, abstractmethod import numpy as np import torch from QEfficient.utils.logging_utils import logger -class AwesomeInputHandler(ABC): - - def __init__(self) -> None: - super().__init__() - self.counter = 0 - - def reset(self): - self.counter = 0 - - def prepare_inputs(self, prompt, n_layer, padding_shape): - if self.counter!=0: - logger.warning("Resetting Input Handler as prepare_inputs is called even though it's in the middle of generating outputs") - self.reset() - - self._prepare_inputs(prompt, n_layer, padding_shape) - self.counter+=1 - - def update_inputs(self, outputs): - self._update_inputs(outputs) - self.counter+=1 - - @abstractmethod - def _prepare_inputs(self, prompt, n_layer, padding_shape): - pass - - @abstractmethod - def _update_inputs(self, outputs): - pass - - - class InputHandler: def __init__(self, tokenizer, input_str, prompt_len, ctx_len): """ @@ -53,7 +21,7 @@ def __init__(self, tokenizer, input_str, prompt_len, ctx_len): :param ctx_len: int """ if tokenizer.padding_side != "left": - logger.warning(f"Please use padding_side='left' while initializing the tokenizer") + logger.warning("Please use padding_side='left' while initializing the tokenizer") tokenizer.padding_side = "left" if tokenizer.pad_token_id is None: tokenizer.pad_token_id = tokenizer.eos_token_id diff --git a/QEfficient/utils/run_utils.py b/QEfficient/utils/run_utils.py index fbfc2b968..bc50df37a 100644 --- a/QEfficient/utils/run_utils.py +++ b/QEfficient/utils/run_utils.py @@ -14,26 +14,6 @@ from .generate_inputs import InputHandler -def run_hf_lm_model_with_pt(model_hf, tokenizer, prompt, gen_len): - input_ids = tokenizer.encode(prompt, return_tensors="pt") - - input_ids_len = len(input_ids[0]) - - with torch.no_grad(): - for _ in range(gen_len): - outputs = model_hf(input_ids) - logits = outputs.logits[:, -1, :] - predicted_token_id = torch.argmax(logits, dim=-1) - input_ids = torch.cat([input_ids, predicted_token_id.unsqueeze(1)], dim=-1) - - generated_ids = input_ids[0][input_ids_len:].detach().numpy() - generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True) - print("Original HF Model Outputs (Torch CPU): \n") - print("Prompt:", repr(prompt)) - print("Completion:", repr(generated_text)) - return generated_ids - - class ApiRunner: """ ApiRunner class is responsible for: @@ -71,7 +51,23 @@ def run_hf_model_on_pytorch(self, model_hf): :param model_hf: pytorch model :return generated_ids: numpy.ndarray - output tokens """ - return run_hf_lm_model_with_pt(model_hf, self.tokenizer, self.prompt[0], self.gen_len) + input_ids = 
self.tokenizer.encode(self.prompt[0], return_tensors="pt") + + input_ids_len = len(input_ids[0]) + + with torch.no_grad(): + for _ in range(self.gen_len): + outputs = model_hf(input_ids) + logits = outputs.logits[:, -1, :] + predicted_token_id = torch.argmax(logits, dim=-1) + input_ids = torch.cat([input_ids, predicted_token_id.unsqueeze(1)], dim=-1) + + generated_ids = input_ids[0][input_ids_len:].detach().numpy() + generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True) + print("Original HF Model Outputs (Torch CPU): \n") + print("Prompt:", repr(self.prompt)) + print("Completion:", repr(generated_text)) + return generated_ids def run_kv_model_on_pytorch(self, model, n_layer, padding_shape): diff --git a/tests/utils.py b/tests/utils.py index 0760e3613..3ef42f82c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -101,7 +101,7 @@ def transform_pt_model_with_qeff(model_hf): :param model_hf: pytorch model :return model_kv """ - model_kv = QEfficient.transformers.modeling_utils.transform(model_hf, form_factor="cloud") + model_kv = QEfficient.transformers.modeling_utils.transform_lm(model_hf) model_kv.eval() return model_kv From 2cd787fbe599513258f15e601620893919c6fef7 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Thu, 30 May 2024 12:07:25 +0530 Subject: [PATCH 08/20] addressed review comments, added test file for new interface Signed-off-by: Onkar Chougule --- .../exporter/export_hf_to_cloud_ai_100.py | 19 +++++----- QEfficient/loader/loader_factory.py | 33 +++++++++++++---- QEfficient/transformers/modeling_utils.py | 2 +- QEfficient/utils/_utils.py | 7 ++-- tests/test_loader.py | 36 +++++++++++++++++++ tests/utils.py | 15 +++----- 6 files changed, 81 insertions(+), 31 deletions(-) create mode 100644 tests/test_loader.py diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index e8b9a23ac..8e012cb1b 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -260,11 +260,10 @@ def export_kvstyle_transformed_model_to_onnx(model_name: str, transformed_model: else: inputs = tokenizer(input_str, return_tensors="pt") - try: - pt_outputs = transformed_model(**inputs) - output_names = list(pt_outputs.keys()) - except Exception as e: - print(f"Model {transformed_model.__class__,__name__} Execution failed in pytorch:%s", e) + + pt_outputs = transformed_model(**inputs) + output_names = list(pt_outputs.keys()) + # Raise error if expected outputs are not present assert "logits" in output_names, "logits not found in output" @@ -282,11 +281,9 @@ def export_kvstyle_transformed_model_to_onnx(model_name: str, transformed_model: inputs["past_key_values"] = tuple([(key.detach(), value.detach()) for key, value in pt_outputs.past_key_values]) # Run PyTorch inference with past - try: - pt_outputs = transformed_model(**inputs) - output_names = list(pt_outputs.keys()) - except Exception as e: - print(f"Model {transformed_model.__class__,__name__} Execution failed in pytorch:%s", e) + pt_outputs = transformed_model(**inputs) + output_names = list(pt_outputs.keys()) + # Add pkv into output_names pkv = tuple([(key.detach(), value.detach()) for key, value in pt_outputs.past_key_values]) @@ -382,7 +379,7 @@ def export_for_cloud(model_name: str, qeff_model: QEFFBaseModel, return_path: bool = True, save_fp32_onnx: bool = False, save_fp16_onnx: bool = True): - if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.LLM: # type: ignore + if 
AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: # type: ignore return export_lm_model_for_cloud(model_name=model_name, qeff_model=qeff_model, # type: ignore tokenizer=tokenizer, diff --git a/QEfficient/loader/loader_factory.py b/QEfficient/loader/loader_factory.py index a803b22fe..66ca42154 100644 --- a/QEfficient/loader/loader_factory.py +++ b/QEfficient/loader/loader_factory.py @@ -38,6 +38,14 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): def is_transformed(self) -> bool: raise NotImplementedError("Must implement for child classes") + @abstractmethod + def transform_export(self, *args, **kwargs) -> Any: + pass + + @abstractmethod + def transform_export_compile(self, *args, **kwargs) -> Any: + pass + @abstractmethod def execute(self, *args, **kwargs) -> Any: pass @@ -49,6 +57,10 @@ def transform(self, *args, **kwargs) -> Any: @abstractmethod def export(self, *args, **kwargs) -> Any: pass + + @abstractmethod + def compile(self, *args, **kwargs) -> Any: + pass class QEFFAutoModelForCausalLM(QEFFBaseModel): @@ -61,6 +73,9 @@ def __init__(self, model: nn.Module, pretrained_model_name_or_path: str) -> None self.model: nn.Module = model self.model_files_path = pretrained_model_name_or_path + def __repr__(self) -> str: + return self.model.__repr__() + @property def is_transformed(self) -> bool: return getattr(self.model, "qeff_transformed", False) @@ -70,6 +85,12 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) return cls(model=model, pretrained_model_name_or_path=pretrained_model_name_or_path) + def transform_export(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") + + def transform_export_compile(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") + def execute(self, *args, **kwargs): # type: ignore raise NotImplementedError("Reached too far!!") @@ -80,18 +101,18 @@ def transform(self): def export(self): raise NotImplementedError("Reached too far!!") - def __repr__(self) -> str: - return self.model.__repr__() + def compile(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") class QEFF_MODEL_TYPE(Enum): - LLM = "LLM" - STABLE_DIFFUSION = "STABLE_DIFFUSION" + CAUSALLM = "LLM" + DIFFUSION = "STABLE_DIFFUSION" AWQ = "AWQ" MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP: Dict[QEFF_MODEL_TYPE, Type[QEFFBaseModel]] = { - QEFF_MODEL_TYPE.LLM: QEFFAutoModelForCausalLM + QEFF_MODEL_TYPE.CAUSALLM: QEFFAutoModelForCausalLM } AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP: Dict[Type[QEFFBaseModel], QEFF_MODEL_TYPE] = {v:k for k,v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items()} @@ -116,6 +137,6 @@ def get_hf_model_type(hf_model_path: str) -> QEFF_MODEL_TYPE: else: raise NotImplementedError(f"current model type is not yet supported {type(config)}") else: - return QEFF_MODEL_TYPE.LLM + return QEFF_MODEL_TYPE.CAUSALLM else: raise NotImplementedError(f"model type {type(config)} is not yet supported") diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index 9253ae54c..592b085ff 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -242,7 +242,7 @@ def transform(model: Type[QEFFBaseModel], form_factor="cloud"): form_factor(str): form factor configuration for optmizing the model, available options=["cloud", "edge"]. 
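With the additional abstract hooks added above, a concrete wrapper now has to provide the full load/transform/export/compile surface; a bare-bones sketch of a conforming subclass (method bodies are placeholders, not a real backend):

from typing import Any

from QEfficient.loader.loader_factory import QEFFBaseModel


class MyQEFFModel(QEFFBaseModel):
    # Placeholder implementation: each hook would map to a real load/optimize/export/compile step.
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> "MyQEFFModel":
        return cls()

    @property
    def is_transformed(self) -> bool:
        return False

    def transform_export(self, *args, **kwargs) -> Any: ...
    def transform_export_compile(self, *args, **kwargs) -> Any: ...
    def execute(self, *args, **kwargs) -> Any: ...
    def transform(self, *args, **kwargs) -> Any: ...
    def export(self, *args, **kwargs) -> Any: ...
    def compile(self, *args, **kwargs) -> Any: ...


m = MyQEFFModel.from_pretrained("any-checkpoint")   # placeholder id; no real loading happens here
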
""" assert form_factor == "cloud", "Only form_factor='cloud' is supported as of now!" - if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.LLM: + if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: transform_lm(model.model, form_factor) # type: ignore return model else: diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 80f1f0c46..049462514 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -75,7 +75,10 @@ def hf_download( def qpc_exists(model_name: str, qpc_base_dir_name: str) -> Tuple[bool, str]: """ - Checks if qpc files already exists, removes the directory if files have been manipulated. + Checks if qpc dir exists. + Returns + 1. Boolean variable indicating if qpc files exist + 2. Path of the qpc dir if found. --------- :param model_name: str. HF Model card name. :param dir_path: str. Path of qpc directory. @@ -120,5 +123,5 @@ def load_hf_tokenizer(model_name: str, cache_dir: Optional[str] = None, hf_token # Download tokenizer along with model if it doesn't exist model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"]) - tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left") + tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left", trust_remote_code=True) return tokenizer diff --git a/tests/test_loader.py b/tests/test_loader.py new file mode 100644 index 000000000..56e81f666 --- /dev/null +++ b/tests/test_loader.py @@ -0,0 +1,36 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +from typing import Any, Dict + +import pytest +from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel + +import QEfficient.transformers.modeling_utils +from QEfficient import QEFFAutoModel +from QEfficient.loader.loader_factory import QEFFAutoModelForCausalLM + +model_name_to_params_dict : Dict[str, Dict[str, Any]] = { + "gpt2": { + "qeff_class": QEFFAutoModelForCausalLM, + "hf_class": GPT2LMHeadModel, + "prompt": "Equator is" + }, + +} +model_names = model_name_to_params_dict.keys() + + +@pytest.mark.parametrize("model_name", model_names) +def test_qeff_auto_model_for_causal_lm(model_name: str): + model = QEFFAutoModel.from_pretrained(model_name) + assert isinstance(model, model_name_to_params_dict[model_name]['qeff_class']) + assert isinstance(model.model, model_name_to_params_dict[model_name]['hf_class']) # type: ignore + + # Run transform + QEfficient.transform(model) + print(model) diff --git a/tests/utils.py b/tests/utils.py index 3ef42f82c..37dfd5795 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -10,15 +10,11 @@ import shutil import unittest -import transformers - -import QEfficient -import QEfficient.transformers -import QEfficient.transformers.modeling_utils from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100 from QEfficient.loader.loader_factory import QEFFAutoModelForCausalLM -from QEfficient.utils import hf_download +from QEfficient.transformers.modeling_utils import transform_lm +from QEfficient.utils import hf_download, load_hf_tokenizer from QEfficient.utils.constants import 
QEFF_MODELS_DIR, ROOT_DIR, Constants from QEfficient.utils.device_utils import get_available_device_id, is_multi_qranium_setup_available, is_qpc_size_gt_32gb from QEfficient.utils.run_utils import ApiRunner @@ -70,10 +66,7 @@ def get_tokenizer(model_name): :param model_name: str :return tokenizer """ - model_hf_path = hf_download(repo_id=model_name, allow_patterns=["*.json"]) - tokenizer = transformers.AutoTokenizer.from_pretrained(model_hf_path, padding_side="left") - if tokenizer.pad_token_id is None: - tokenizer.pad_token_id = tokenizer.eos_token_id + tokenizer = load_hf_tokenizer(model_name=model_name) return tokenizer @@ -101,7 +94,7 @@ def transform_pt_model_with_qeff(model_hf): :param model_hf: pytorch model :return model_kv """ - model_kv = QEfficient.transformers.modeling_utils.transform_lm(model_hf) + model_kv = transform_lm(model_hf) model_kv.eval() return model_kv From 1a895dc3e7c0e89b80c26d5db09a90ae3fc53962 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Thu, 30 May 2024 18:46:42 +0530 Subject: [PATCH 09/20] enabled CLI APIs Signed-off-by: Onkar Chougule --- QEfficient/__init__.py | 2 +- QEfficient/cloud/export.py | 2 +- QEfficient/cloud/infer.py | 25 +++- .../exporter/export_hf_to_cloud_ai_100.py | 11 +- QEfficient/loader/loader.py | 42 +++++- QEfficient/loader/loader_factory.py | 62 +++------ QEfficient/transformers/modeling_utils.py | 115 ---------------- QEfficient/transformers/transform.py | 123 ++++++++++++++++++ QEfficient/utils/_utils.py | 11 +- QEfficient/utils/logging_utils.py | 25 ++-- tests/test_loader.py | 2 +- tests/utils.py | 2 +- 12 files changed, 220 insertions(+), 202 deletions(-) create mode 100644 QEfficient/transformers/transform.py diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index fb4c517f7..09a6ae7fa 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -7,7 +7,7 @@ from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.loader import QEFFAutoModel # noqa: F401 -from QEfficient.transformers.modeling_utils import transform # noqa: F401 +from QEfficient.transformers.transform import transform # noqa: F401 # Users can use QEfficient.export for exporting models to ONNX export = qualcomm_efficient_converter diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index 4ac2f6a05..2b7201c8e 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -9,7 +9,7 @@ import os from typing import Optional -import QEfficient.transformers.modeling_utils +import QEfficient from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.loader import QEFFAutoModel from QEfficient.utils import load_hf_tokenizer, onnx_exists diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 14b61465b..f1a56931c 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -6,10 +6,11 @@ # ----------------------------------------------------------------------------- import argparse +import logging import os -from typing import List +from typing import List, Optional -import QEfficient.transformers.modeling_utils +import QEfficient from QEfficient.cloud.compile import main as compile from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.generation.text_generation_inference import ( @@ -32,12 +33,12 @@ def main( model_name: str, num_cores: int, - prompt: str = None, - prompts_txt_file_path: str = None, + prompt: Optional[str] = None, # type: ignore + 
prompts_txt_file_path: Optional[str] = None, aic_enable_depth_first: bool = False, mos: int = -1, cache_dir: str = Constants.CACHE_DIR, - hf_token: str = None, + hf_token: Optional[str] = None, batch_size: int = 1, prompt_len: int = 32, ctx_len: int = 128, @@ -64,8 +65,9 @@ def main( if qpc_path_exists: # execute - logger.info("Pre-compiled qpc found! Trying to execute with given prompt") + logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! Executing with given prompt") elif onnx_path_exists: + logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation") # Compile -> execute # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation generated_qpc_path = compile( @@ -96,9 +98,10 @@ def main( logger.info(f"Model after Optimized transformations {qeff_opt_model}") # Export to the Onnx - logger.info(f"Exporting to Pytorch {model_name} to ONNX...") + logger.info(f"Exporting Pytorch {model_name} model to ONNX...") # Need to split below function into two functions one which always takes QEFFAutoModel and other with same interface as below base_path, generated_onnx_path = qualcomm_efficient_converter( + model_name=model_name, model_kv=qeff_opt_model, # type: ignore tokenizer=tokenizer, onnx_dir_path=onnx_dir_path, @@ -204,6 +207,14 @@ def main( default=-1, help="Effort level to reduce the on-chip memory", ) + #FIXME: Add verbose feature + parser.add_argument( + "--verbose","-v", + action="store_true", + help="pass to print info logs", + ) args = parser.parse_args() + if args.verbose: + logger.setLevel(logging.INFO) main(**args.__dict__) diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index 8e012cb1b..d5da3f422 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -12,18 +12,17 @@ import torch from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast -import QEfficient.transformers.modeling_utils +import QEfficient from QEfficient.exporter.export_utils import export_onnx, fix_onnx_fp16, generate_input_files, run_model_on_ort -from QEfficient.loader.loader import QEFFAutoModel +from QEfficient.loader.loader import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFFAutoModel from QEfficient.loader.loader_factory import ( - AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFAutoModelForCausalLM, QEFFBaseModel, ) +from QEfficient.utils._utils import load_hf_tokenizer from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants from QEfficient.utils.logging_utils import logger -from QEfficient.utils._utils import load_hf_tokenizer def convert_to_cloud_bertstyle( @@ -442,7 +441,7 @@ def export_lm_model_for_cloud(model_name:str, qeff_model: QEFFAutoModelForCausal def qualcomm_efficient_converter( model_name: str, - model_kv: Optional[Type[QEFFBaseModel]] = None, # type: ignore + model_kv: QEFFBaseModel = None, # type: ignore tokenizer: Optional[Union[PreTrainedTokenizer, PreTrainedTokenizerFast]]=None, onnx_dir_path: Optional[str]=None, hf_token: Optional[str] = None, @@ -480,7 +479,7 @@ def qualcomm_efficient_converter( if model_kv.is_transformed and not kv: raise AttributeError("Transformed model is passed while requsting to convert non-transformed model") - model_kv: Type[QEFFBaseModel] = QEfficient.transform(model_kv) if kv else model_kv + model_kv = model_kv if model_kv.is_transformed else QEfficient.transform(model_kv) if kv else model_kv if onnx_dir_path is None: diff --git 
a/QEfficient/loader/loader.py b/QEfficient/loader/loader.py index 94ae672be..99295555f 100644 --- a/QEfficient/loader/loader.py +++ b/QEfficient/loader/loader.py @@ -6,15 +6,43 @@ # ----------------------------------------------------------------------------- import os -from typing import Any, Type +from typing import Any, Dict, Type -from QEfficient.loader.loader_factory import ( - MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP, - QEFFBaseModel, - get_hf_model_type, -) +from transformers import AutoConfig +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING + +from QEfficient.loader.loader_factory import QEFF_MODEL_TYPE, QEFFAutoModelForCausalLM, QEFFBaseModel from QEfficient.utils._utils import login_and_download_hf_lm +MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP: Dict[QEFF_MODEL_TYPE, Type[QEFFBaseModel]] = { + QEFF_MODEL_TYPE.CAUSALLM: QEFFAutoModelForCausalLM +} + +AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP: Dict[Type[QEFFBaseModel], QEFF_MODEL_TYPE] = {v:k for k,v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items()} + +def get_hf_model_type(hf_model_path: str) -> QEFF_MODEL_TYPE: + """ + Loads model config file and returns the type of the model (i.e. LLMs, SD, quantized etc.) as supported by the library. + """ + assert os.path.isdir(hf_model_path), "Pleae pass local dir path where the model is downloaded; use `QEfficient.utils.login_and_download_hf_lm` for downloading hf model" + config, kwargs = AutoConfig.from_pretrained( + hf_model_path, + return_unused_kwargs=True, + ) + + if config.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING: + # FIXME: Add logic to handle if quantization config is stored in separate quant_config.json outside of config, also create a separate function for this and below lines + quant_config = getattr(config, "quantization_config", getattr(config, "quant_config", None)) + if quant_config is not None: + if quant_config.get("quant_method", None) == "awq": + return QEFF_MODEL_TYPE.AWQ + else: + raise NotImplementedError(f"current model type is not yet supported {type(config)}") + else: + return QEFF_MODEL_TYPE.CAUSALLM + else: + raise NotImplementedError(f"model type {type(config)} is not yet supported") + class QEFFAutoModel: """ @@ -27,7 +55,7 @@ def __init__(self, *args: Any, **kwds: Any) -> None: f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)`") @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> Type[QEFFBaseModel]: + def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> QEFFBaseModel: """ Downloads HuggingFace model if already doesn't exist locally, returns QEffAutoModel object based on type of model. 
""" diff --git a/QEfficient/loader/loader_factory.py b/QEfficient/loader/loader_factory.py index 66ca42154..23d210f30 100644 --- a/QEfficient/loader/loader_factory.py +++ b/QEfficient/loader/loader_factory.py @@ -5,19 +5,24 @@ # # ---------------------------------------------------------------------------- -import os from abc import ABC, abstractmethod from enum import Enum -from typing import Any, Dict, Type +from typing import Any import torch.nn as nn -from transformers import AutoConfig, AutoModelForCausalLM +from transformers import AutoModelForCausalLM from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING -import QEfficient.transformers.modeling_utils +import QEfficient from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict +class QEFF_MODEL_TYPE(Enum): + CAUSALLM = "LLM" + DIFFUSION = "STABLE_DIFFUSION" + AWQ = "AWQ" + + class QEFFBaseModel(ABC): """ This class acts as parent class for all the varieties of model class (i.e. LLMs, SD, quantized etc.). @@ -29,11 +34,11 @@ def __init__(self) -> None: super().__init__() # Users can call generate or execute self.generate = self.execute - + @classmethod def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): raise NotImplementedError("Must implement for child classes") - + @property def is_transformed(self) -> bool: raise NotImplementedError("Must implement for child classes") @@ -49,11 +54,11 @@ def transform_export_compile(self, *args, **kwargs) -> Any: @abstractmethod def execute(self, *args, **kwargs) -> Any: pass - + @abstractmethod def transform(self, *args, **kwargs) -> Any: pass - + @abstractmethod def export(self, *args, **kwargs) -> Any: pass @@ -61,7 +66,7 @@ def export(self, *args, **kwargs) -> Any: @abstractmethod def compile(self, *args, **kwargs) -> Any: pass - + class QEFFAutoModelForCausalLM(QEFFBaseModel): """ @@ -95,7 +100,7 @@ def execute(self, *args, **kwargs): # type: ignore raise NotImplementedError("Reached too far!!") def transform(self): - QEfficient.transformers.modeling_utils.transform_lm(self.model) + QEfficient.transform(self) return self def export(self): @@ -103,40 +108,3 @@ def export(self): def compile(self, *args, **kwargs) -> Any: raise NotImplementedError("Reached too far!!") - - -class QEFF_MODEL_TYPE(Enum): - CAUSALLM = "LLM" - DIFFUSION = "STABLE_DIFFUSION" - AWQ = "AWQ" - - -MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP: Dict[QEFF_MODEL_TYPE, Type[QEFFBaseModel]] = { - QEFF_MODEL_TYPE.CAUSALLM: QEFFAutoModelForCausalLM -} - -AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP: Dict[Type[QEFFBaseModel], QEFF_MODEL_TYPE] = {v:k for k,v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items()} - - -def get_hf_model_type(hf_model_path: str) -> QEFF_MODEL_TYPE: - """ - Loads model config file and returns the type of the model (i.e. LLMs, SD, quantized etc.) as supported by the library. 
- """ - assert os.path.isdir(hf_model_path), "Pleae pass local dir path where the model is downloaded; use `QEfficient.utils.login_and_download_hf_lm` for downloading hf model" - config, kwargs = AutoConfig.from_pretrained( - hf_model_path, - return_unused_kwargs=True, - ) - - if config.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING: - # FIXME: Add logic to handle if quantization config is stored in separate quant_config.json outside of config, also create a separate function for this and below lines - quant_config = getattr(config, "quantization_config", getattr(config, "quant_config", None)) - if quant_config is not None: - if quant_config.get("quant_method", None) == "awq": - return QEFF_MODEL_TYPE.AWQ - else: - raise NotImplementedError(f"current model type is not yet supported {type(config)}") - else: - return QEFF_MODEL_TYPE.CAUSALLM - else: - raise NotImplementedError(f"model type {type(config)} is not yet supported") diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index 592b085ff..753d08204 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -5,12 +5,10 @@ # # ----------------------------------------------------------------------------- -import hashlib from collections import namedtuple from typing import Dict, Type import torch.nn as nn -import transformers from transformers.models.codegen.modeling_codegen import ( CodeGenAttention, CodeGenBlock, @@ -46,26 +44,7 @@ from transformers.models.mpt.modeling_mpt import MptAttention, MptBlock, MptForCausalLM, MptModel from QEfficient.customop import CustomRMSNormAIC -from QEfficient.loader.loader_factory import ( - AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, - QEFF_MODEL_TYPE, - QEFFBaseModel, -) -from QEfficient.utils.logging_utils import logger -from .modeling_attn_mask_utils import ( - QEffAttentionMaskConverter, - _qeff_prepare_4d_attention_mask, - _qeff_prepare_4d_causal_attention_mask, -) -from .modeling_outputs import ( - QEffBaseModelOutputWithPast, - QEffBaseModelOutputWithPastAndCrossAttentions, - QEffCausalLMOutputWithCrossAttentions, - QEffCausalLMOutputWithPast, - QEffMoeCausalLMOutputWithPast, - QEffMoeModelOutputWithPast, -) from .models.codegen.modeling_codegen import ( QEffCodeGenAttention, QEffCodeGenBlock, @@ -153,97 +132,3 @@ MixtralSparseMoeBlock: QEffMixtralSparseMoeBlock, MixtralBLockSparseTop2MLP:QEffMixtralBLockSparseTop2MLP, } - - -def get_params_hash(model: nn.Module) -> str: - """ - Creates a Hash of all the parameters values i.e. weights using SHA256 algo. - -------- - :param model: torch.nn.Module. Base PyTorch model. - :returns: str. Hash string - """ - hasher = hashlib.sha256() - for _, params in model.named_parameters(): - hasher.update(params.data.numpy().tobytes()) - - return hasher.hexdigest() - - -def replace_module_with_qeff_layers(model: nn.Module) -> None: - """ - Replaces the transformers nn.Module classes with optmized QEff classes in place. - ---------- - :param model: torch.nn.Module. Base PyTorch model. - """ - # Replace if module class is registed in TransformersToQEffModulesDict - target_module = TransformersToQEffModulesDict.get(model.__class__) - if target_module is not None: - model.__class__ = target_module - - # Iterate over child modules - for _, module in model.named_children(): - replace_module_with_qeff_layers(module) - - -def transform_lm(model: nn.Module) -> nn.Module: - """ - Replaces some Transformers torch.nn.Module layers for equivalent optimized modules for cloud AI 100. 
- --------- - Args: - param model (torch.nn.Module): PyTorch model. - - Returns: - torch.nn.Module: PyTorch Module with replaced QEff layers. - """ - - # Introducnig qeff_transformed attribue in model to check status of transform - if getattr(model, "qeff_transformed", False): - print("Model is already transformed") - return model - - # Get Hash of all params for checking later - prior_params_hash = get_params_hash(model) - logger.warning(f"The model {model.__class__} layers has been upadted to QEff layers in-place") - # Replace with QEff layers - replace_module_with_qeff_layers(model) - - # Check with new params hash - later_params_hash = get_params_hash(model) - assert ( - prior_params_hash == later_params_hash - ), "Weights were changed in the transform process, please report an issue" - - # Replace the modeling output classes - transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions = ( - QEffBaseModelOutputWithPastAndCrossAttentions - ) - transformers.modeling_outputs.CausalLMOutputWithCrossAttentions = QEffCausalLMOutputWithCrossAttentions - transformers.modeling_outputs.BaseModelOutputWithPast = QEffBaseModelOutputWithPast - transformers.modeling_outputs.CausalLMOutputWithPast = QEffCausalLMOutputWithPast - transformers.modeling_outputs.MoeCausalLMOutputWithPast = QEffMoeCausalLMOutputWithPast - transformers.modeling_outputs.MoeModelOutputWithPast = QEffMoeModelOutputWithPast - - # Replace the modeling attn util classes and functions - transformers.modeling_attn_mask_utils.AttentionMaskConverter = QEffAttentionMaskConverter - transformers.modeling_attn_mask_utils._prepare_4d_attention_mask = _qeff_prepare_4d_attention_mask - transformers.modeling_attn_mask_utils._prepare_4d_causal_attention_mask = _qeff_prepare_4d_causal_attention_mask - - setattr(model,'qeff_transformed',True) - return model.eval() - - - -def transform(model: Type[QEFFBaseModel], form_factor="cloud"): - """ - This function serves for optimizing any kind of model (i.e. LLM, SD, AWQ etc.) for cloud AI 100. - Will replace the torch.nn.Module layers of passed QEffModel with optimized implementation of the same. - - model: object of any instance of class that is child of `QEFFBaseAutoModelFactory` - form_factor(str): form factor configuration for optmizing the model, available options=["cloud", "edge"]. - """ - assert form_factor == "cloud", "Only form_factor='cloud' is supported as of now!" - if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: - transform_lm(model.model, form_factor) # type: ignore - return model - else: - raise NotImplementedError(f"Recieved unsupported class of type {type(model)}") diff --git a/QEfficient/transformers/transform.py b/QEfficient/transformers/transform.py new file mode 100644 index 000000000..413e9f6fe --- /dev/null +++ b/QEfficient/transformers/transform.py @@ -0,0 +1,123 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import hashlib + +import torch.nn as nn +import transformers + +from QEfficient.loader.loader_factory import QEFF_MODEL_TYPE +from QEfficient.loader.loader import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP +from QEfficient.loader.loader_factory import QEFFBaseModel +from QEfficient.transformers.modeling_attn_mask_utils import ( + QEffAttentionMaskConverter, + _qeff_prepare_4d_attention_mask, + _qeff_prepare_4d_causal_attention_mask, +) +from QEfficient.transformers.modeling_outputs import ( + QEffBaseModelOutputWithPast, + QEffBaseModelOutputWithPastAndCrossAttentions, + QEffCausalLMOutputWithCrossAttentions, + QEffCausalLMOutputWithPast, + QEffMoeCausalLMOutputWithPast, + QEffMoeModelOutputWithPast, +) +from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict +from QEfficient.utils.logging_utils import logger + + +def replace_module_with_qeff_layers(model: nn.Module) -> None: + """ + Replaces the transformers nn.Module classes with optmized QEff classes in place. + ---------- + :param model: torch.nn.Module. Base PyTorch model. + """ + # Replace if module class is registed in TransformersToQEffModulesDict + target_module = TransformersToQEffModulesDict.get(model.__class__) + if target_module is not None: + model.__class__ = target_module + + # Iterate over child modules + for _, module in model.named_children(): + replace_module_with_qeff_layers(module) + + +def get_params_hash(model: nn.Module) -> str: + """ + Creates a Hash of all the parameters values i.e. weights using SHA256 algo. + -------- + :param model: torch.nn.Module. Base PyTorch model. + :returns: str. Hash string + """ + hasher = hashlib.sha256() + for _, params in model.named_parameters(): + hasher.update(params.data.numpy().tobytes()) + + return hasher.hexdigest() + + +def transform_lm(model: nn.Module) -> nn.Module: + """ + Replaces some Transformers torch.nn.Module layers for equivalent optimized modules for cloud AI 100. + --------- + Args: + param model (torch.nn.Module): PyTorch model. + + Returns: + torch.nn.Module: PyTorch Module with replaced QEff layers. 
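The helper pair above encodes the safety check used by transform_lm: module classes may change, but parameter bytes must not. A short sketch of that invariant, assuming a small HF model as input:

from transformers import AutoModelForCausalLM

from QEfficient.transformers.transform import get_params_hash, transform_lm

model = AutoModelForCausalLM.from_pretrained("gpt2")    # example checkpoint
before = get_params_hash(model)
model = transform_lm(model)                             # in-place class swap, returns model.eval()
assert get_params_hash(model) == before                 # weights unchanged
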
+ """ + + # Introducnig qeff_transformed attribue in model to check status of transform + if getattr(model, "qeff_transformed", False): + print("Model is already transformed") + return model + + # Get Hash of all params for checking later + prior_params_hash = get_params_hash(model) + logger.warning(f"The model {model.__class__} layers has been upadted to QEff layers in-place") + # Replace with QEff layers + replace_module_with_qeff_layers(model) + + # Check with new params hash + later_params_hash = get_params_hash(model) + assert ( + prior_params_hash == later_params_hash + ), "Weights were changed in the transform process, please report an issue" + + # Replace the modeling output classes + transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions = ( + QEffBaseModelOutputWithPastAndCrossAttentions + ) + transformers.modeling_outputs.CausalLMOutputWithCrossAttentions = QEffCausalLMOutputWithCrossAttentions + transformers.modeling_outputs.BaseModelOutputWithPast = QEffBaseModelOutputWithPast + transformers.modeling_outputs.CausalLMOutputWithPast = QEffCausalLMOutputWithPast + transformers.modeling_outputs.MoeCausalLMOutputWithPast = QEffMoeCausalLMOutputWithPast + transformers.modeling_outputs.MoeModelOutputWithPast = QEffMoeModelOutputWithPast + + # Replace the modeling attn util classes and functions + transformers.modeling_attn_mask_utils.AttentionMaskConverter = QEffAttentionMaskConverter + transformers.modeling_attn_mask_utils._prepare_4d_attention_mask = _qeff_prepare_4d_attention_mask + transformers.modeling_attn_mask_utils._prepare_4d_causal_attention_mask = _qeff_prepare_4d_causal_attention_mask + + setattr(model,'qeff_transformed',True) + return model.eval() + + +def transform(model: QEFFBaseModel, form_factor="cloud"): + """ + This function serves for optimizing any kind of model (i.e. LLM, SD, AWQ etc.) for cloud AI 100. + Will replace the torch.nn.Module layers of passed QEffModel with optimized implementation of the same. + + model: object of any instance of class that is child of `QEFFBaseAutoModelFactory` + form_factor(str): form factor configuration for optmizing the model, available options=["cloud", "edge"]. + """ + assert form_factor == "cloud", "Only form_factor='cloud' is supported as of now!" 
+ if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: + transform_lm(model.model) # type: ignore + return model + else: + raise NotImplementedError(f"Recieved unsupported class of type {type(model)}") \ No newline at end of file diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 049462514..25eb52616 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -14,19 +14,21 @@ from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast from QEfficient.utils.constants import QEFF_MODELS_DIR +from QEfficient.utils.logging_utils import logger -def login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs): +def login_and_download_hf_lm(model_name, *args, **kwargs): + logger.info(f"loading HuggingFace model for {model_name}") hf_token = kwargs.pop("hf_token", None) cache_dir = kwargs.pop("cache_dir", None) if hf_token is not None: login(hf_token) - pretrained_model_name_or_path = hf_download( - repo_id=pretrained_model_name_or_path, + model_name = hf_download( + repo_id=model_name, cache_dir=cache_dir, ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], ) - return pretrained_model_name_or_path + return model_name def hf_download( @@ -118,6 +120,7 @@ def onnx_exists(model_name: str) -> Tuple[bool, str, str]: def load_hf_tokenizer(model_name: str, cache_dir: Optional[str] = None, hf_token: Optional[str] = None) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: + logger.info(f"Loading Tokenizer for {model_name}") if hf_token is not None: login(hf_token) diff --git a/QEfficient/utils/logging_utils.py b/QEfficient/utils/logging_utils.py index fe42d5ed9..044e6e83f 100644 --- a/QEfficient/utils/logging_utils.py +++ b/QEfficient/utils/logging_utils.py @@ -13,19 +13,20 @@ class QEffFormatter(logging.Formatter): Formatter class used to set colors for printing different logging levels of messages on console. 
""" - grey = "\x1b[38;20m" - yellow = "\x1b[33;20m" - red = "\x1b[31;20m" - bold_red = "\x1b[31;1m" - reset = "\x1b[0m" - format = "%(levelname)s - %(name)s - %(message)s (%(filename)s:%(lineno)d)" + cyan: str = "\x1b[38;5;14m" + yellow: str = "\x1b[33;20m" + red: str = "\x1b[31;20m" + bold_red: str = "\x1b[31;1m" + reset: str = "\x1b[0m" + common_format: str = "%(levelname)s - %(name)s - %(message)s" # type: ignore + format_with_line_info = "%(levelname)s - %(name)s - %(message)s (%(filename)s:%(lineno)d)" # type: ignore FORMATS = { - logging.DEBUG: grey + format + reset, - logging.INFO: grey + format + reset, - logging.WARNING: yellow + format + reset, - logging.ERROR: red + format + reset, - logging.CRITICAL: bold_red + format + reset, + logging.DEBUG: cyan + format_with_line_info + reset, + logging.INFO: cyan + common_format + reset, + logging.WARNING: yellow + common_format + reset, + logging.ERROR: red + format_with_line_info + reset, + logging.CRITICAL: bold_red + format_with_line_info + reset, } def format(self, record): @@ -45,7 +46,7 @@ def create_logger() -> logging.Logger: # create console handler and set level to debug ch = logging.StreamHandler() - ch.setLevel(logging.WARNING) + ch.setLevel(logging.INFO) # define formatter ch.setFormatter(QEffFormatter()) diff --git a/tests/test_loader.py b/tests/test_loader.py index 56e81f666..0d3df3a01 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -10,7 +10,7 @@ import pytest from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel -import QEfficient.transformers.modeling_utils +import QEfficient from QEfficient import QEFFAutoModel from QEfficient.loader.loader_factory import QEFFAutoModelForCausalLM diff --git a/tests/utils.py b/tests/utils.py index 37dfd5795..f68dd20fb 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -13,7 +13,7 @@ from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100 from QEfficient.loader.loader_factory import QEFFAutoModelForCausalLM -from QEfficient.transformers.modeling_utils import transform_lm +from QEfficient.transformers.transform import transform_lm from QEfficient.utils import hf_download, load_hf_tokenizer from QEfficient.utils.constants import QEFF_MODELS_DIR, ROOT_DIR, Constants from QEfficient.utils.device_utils import get_available_device_id, is_multi_qranium_setup_available, is_qpc_size_gt_32gb From 9f5ff0a66a818b244c0888d4c4bdee73d66e0ac0 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Fri, 31 May 2024 15:50:28 +0530 Subject: [PATCH 10/20] *Updated README, notebooks *Removed circular import *Added comments on loader files * separated cross-compile script *separated utils funcs Signed-off-by: Onkar Chougule --- QEfficient/__init__.py | 1 + QEfficient/cloud/compile.py | 89 +-------------- QEfficient/cloud/infer.py | 3 +- QEfficient/cross_compile.py | 159 +++++++++++++++++++++++++++ QEfficient/exporter/export_utils.py | 97 +--------------- QEfficient/loader/loader.py | 6 + QEfficient/loader/loader_factory.py | 20 ++++ QEfficient/transformers/transform.py | 3 +- QEfficient/utils/_utils.py | 5 +- README.md | 81 ++++++-------- notebooks/QEfficientGPT2.ipynb | 44 +++----- notebooks/QEfficientMPT.ipynb | 39 +++---- tests/test_loader.py | 2 +- tests/utils.py | 2 +- 14 files changed, 264 insertions(+), 287 deletions(-) create mode 100644 QEfficient/cross_compile.py diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 09a6ae7fa..9804c4ea1 100644 --- 
a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +from QEfficient.cross_compile import compile # noqa: F401 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.loader import QEFFAutoModel # noqa: F401 from QEfficient.transformers.transform import transform # noqa: F401 diff --git a/QEfficient/cloud/compile.py b/QEfficient/cloud/compile.py index 0171b2787..b46282da2 100644 --- a/QEfficient/cloud/compile.py +++ b/QEfficient/cloud/compile.py @@ -6,91 +6,8 @@ # ----------------------------------------------------------------------------- import argparse -import json -import os -from typing import List - -from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100 -from QEfficient.utils.logging_utils import logger - - -def create_and_dump_specializations(batch_size: int, prompt_len: int, ctx_len: int, path: str): - # Create - specializations = { - "specializations": [ - { - "batch_size": str(batch_size), - "seq_len": str(prompt_len), - "ctx_len": str(ctx_len), - }, - {"batch_size": str(batch_size), "seq_len": "1", "ctx_len": str(ctx_len)}, - ] - } - # Dump - with open(path, "w") as file: - json.dump(specializations, file, indent=4) - - -def main( - onnx_path: str, - qpc_path: str, - num_cores: int, - device_group: List[int], - aic_enable_depth_first: bool = False, - mos: int = -1, - batch_size: int = 1, - prompt_len: int = 32, - ctx_len: int = 128, - mxfp6: bool = True, - mxint8: bool = False, -) -> str: - # Dynamically create the specializations JSON - """ - Api() to compile the Onnx Model on Cloud AI 100 Platform with give config. - --------- - :param onnx_path: str. Generated Onnx Model Path. - :base_path: str. Base path for the generated models. - :batch_size: int. Batch size to compile the model for. - :prompt_len: int. prompt len for the model to compile. - :ctx_len: int. Maximum context length to compile the model. - :mxfp6: bool. Enable compilation for MXFP6 precision - :num_cores: int. Number of cores to compile model on. default: 16 available option: [1 to 16] - """ - - os.makedirs(qpc_path, exist_ok=True) - specialization_json_path = os.path.join(qpc_path, "specializations.json") - create_and_dump_specializations( - batch_size=batch_size, prompt_len=prompt_len, ctx_len=ctx_len, path=specialization_json_path - ) - - # Select the customIO config based on the mx flag. - if mxint8: - custom_io_file_name = "custom_io_int8.yaml" - else: - custom_io_file_name = "custom_io_fp16.yaml" - - custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name) - - if not os.path.isfile(custom_io_file_path): - raise FileNotFoundError( - f"file {custom_io_file_path} needs to exist in the same directory as onnx model files. 
Please rerun infer/export Api" - ) - - _, qpc_path = compile_kv_model_on_cloud_ai_100( - onnx_path=onnx_path, - specializations_json=specialization_json_path, - num_cores=num_cores, - custom_io_path=custom_io_file_path, - base_path=qpc_path, - mxfp6=mxfp6, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - ) - - logger.info(f"Compiled QPC files can be found here: {qpc_path}") - return qpc_path +import QEfficient if __name__ == "__main__": parser = argparse.ArgumentParser(description="Compilation script.") @@ -146,5 +63,7 @@ def main( default=-1, help=" Effort level to reduce the on-chip memory", ) + + # FIXME(ochougul): Allow extra compilation arguments args = parser.parse_args() - main(**vars(args)) + QEfficient.compile(**vars(args)) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index f1a56931c..e72a3329b 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -11,7 +11,6 @@ from typing import List, Optional import QEfficient -from QEfficient.cloud.compile import main as compile from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.generation.text_generation_inference import ( check_batch_size_and_num_prompts, @@ -117,7 +116,7 @@ def main( # Compile # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation - generated_qpc_path = compile( + generated_qpc_path = QEfficient.compile( onnx_path=onnx_model_path, qpc_path=os.path.dirname(qpc_dir_path), num_cores=num_cores, diff --git a/QEfficient/cross_compile.py b/QEfficient/cross_compile.py new file mode 100644 index 000000000..771d52f54 --- /dev/null +++ b/QEfficient/cross_compile.py @@ -0,0 +1,159 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import json +import os +import subprocess +from typing import List, Tuple + +from QEfficient.utils.logging_utils import logger + + +def create_and_dump_specializations(batch_size: int, prompt_len: int, ctx_len: int, path: str): + # Create + specializations = { + "specializations": [ + { + "batch_size": str(batch_size), + "seq_len": str(prompt_len), + "ctx_len": str(ctx_len), + }, + {"batch_size": str(batch_size), "seq_len": "1", "ctx_len": str(ctx_len)}, + ] + } + # Dump + with open(path, "w") as file: + json.dump(specializations, file, indent=4) + + +def compile_kv_model_on_cloud_ai_100( + onnx_path: str, + specializations_json: str, + num_cores: int, + base_path: str, + mxfp6: bool, + custom_io_path: str, + aic_enable_depth_first: bool, + mos: int = -1, + device_group: List[int] = [0], + **kwargs, +) -> Tuple[bool, str]: + import shutil + if kwargs: + # FIXME + raise NotImplementedError("Can't handle extra compilation args now!") + aic_binary_dir = os.path.join(base_path, "qpcs") + + if os.path.isdir(aic_binary_dir): + shutil.rmtree(aic_binary_dir) + + assert os.path.isfile( + specializations_json + ), f"Please use 'QEfficient.compile', as {specializations_json} file was not found" + assert os.path.isfile(custom_io_path), f"{custom_io_path} file was not found!" 
+ command = [ + "/opt/qti-aic/exec/qaic-exec", + f"-m={onnx_path}", + "-aic-hw", + "-aic-hw-version=2.0", + f"-network-specialization-config={specializations_json}", + "-convert-to-fp16", + "-retained-state", + f"-aic-num-cores={num_cores}", + f"-custom-IO-list-file={custom_io_path}", + "-compile-only", + f"-aic-binary-dir={aic_binary_dir}", + ] + if mxfp6: + command.append("-mxfp6-matmul") + if mos > 0: + command.append(f"-mos={mos}") + if aic_enable_depth_first: + command.append("-aic-enable-depth-first") + if len(device_group) > 1: + mdp_ts_config = { + "connections": [{"devices": list(range(len(device_group))), "type": "p2p"}], + "partitions": [ + { + "name": "Partition0", + "devices": [{"deviceId": device, "numCores": num_cores} for device in range(len(device_group))], + } + ], + } + mdp_ts_config_path = os.path.join(base_path, "mdp_ts_config.json") + with open(mdp_ts_config_path, "w") as file: + json.dump(mdp_ts_config, file, indent=4) + command.append(f"-mdp-load-partition-config={mdp_ts_config_path}") + print("Running AI 100 compiler:", " ".join(command)) + result = subprocess.run(command, capture_output=True, text=True) + if result.returncode != 0: + raise RuntimeError(f"Compilation Failed!!\n\nSTDOUT\n{result.stdout}\n\nSTDERR\n{result.stderr}") + + print("\n===================== Compilation Done! =====================\n") + return result.returncode == 0, aic_binary_dir + + +def compile( + onnx_path: str, + qpc_path: str, + num_cores: int, + device_group: List[int], + aic_enable_depth_first: bool = False, + mos: int = -1, + batch_size: int = 1, + prompt_len: int = 32, + ctx_len: int = 128, + mxfp6: bool = True, + mxint8: bool = False, + **kwargs +) -> str: + # Dynamically create the specializations JSON + """ + Api() to compile the Onnx Model on Cloud AI 100 Platform with give config. + --------- + :param onnx_path: str. Generated Onnx Model Path. + :base_path: str. Base path for the generated models. + :batch_size: int. Batch size to compile the model for. + :prompt_len: int. prompt len for the model to compile. + :ctx_len: int. Maximum context length to compile the model. + :mxfp6: bool. Enable compilation for MXFP6 precision + :num_cores: int. Number of cores to compile model on. default: 16 available option: [1 to 16] + """ + + os.makedirs(qpc_path, exist_ok=True) + specialization_json_path = os.path.join(qpc_path, "specializations.json") + create_and_dump_specializations( + batch_size=batch_size, prompt_len=prompt_len, ctx_len=ctx_len, path=specialization_json_path + ) + + # Select the customIO config based on the mx flag. + if mxint8: + custom_io_file_name = "custom_io_int8.yaml" + else: + custom_io_file_name = "custom_io_fp16.yaml" + + custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name) + + if not os.path.isfile(custom_io_file_path): + raise FileNotFoundError( + f"file {custom_io_file_path} needs to exist in the same directory as onnx model files. 
Please rerun infer/export Api" + ) + + _, qpc_path = compile_kv_model_on_cloud_ai_100( + onnx_path=onnx_path, + specializations_json=specialization_json_path, + num_cores=num_cores, + custom_io_path=custom_io_file_path, + base_path=qpc_path, + mxfp6=mxfp6, + aic_enable_depth_first=aic_enable_depth_first, + mos=mos, + device_group=device_group, + ) + + logger.info(f"Compiled QPC files can be found here: {qpc_path}") + return qpc_path diff --git a/QEfficient/exporter/export_utils.py b/QEfficient/exporter/export_utils.py index 8ce7f6b26..417c3a214 100644 --- a/QEfficient/exporter/export_utils.py +++ b/QEfficient/exporter/export_utils.py @@ -5,12 +5,10 @@ # # ----------------------------------------------------------------------------- -import json import os import shutil -import subprocess import sys -from logging import error, info +from logging import info from typing import Dict, List, Tuple, Union import numpy as np @@ -285,7 +283,7 @@ def generate_input_files( fp.write(",".join(filenames)) fp.write("\n") - +# FIXME(ochougul/quic-mamta): Remove duplication with APIRunner def run_model_on_ort( onnx_path: str, inputs: Dict[str, torch.Tensor], @@ -331,94 +329,3 @@ def run_model_on_ort( print(f"Failed to run the onnx {onnx_path} model in onnx runtime:%s", e) print("\n=============================================================\n") return input_names, None - - -def run_model_on_cloud_ai_100( - onnx_path: str, - onnx_symbol_defs: Dict[str, int] = {}, - **kwargs, -) -> bool: - args = [ - "/opt/qti-aic/exec/qaic-exec", - f"-m={onnx_path}", - "-aic-hw", - "-aic-hw-version=2.0", - ] - for onnx_symbol, onnx_def in onnx_symbol_defs.items(): - args.append(f"-onnx-define-symbol={onnx_symbol},{onnx_def}") - for k, v in kwargs.items(): - k = k.replace("_", "-") - if isinstance(v, bool): - if v: - args.append(f"-{k}") - continue - args.append(f"-{k}={v}") - - info("Running compiler:", " ".join(args)) - result = subprocess.run(args) - return result.returncode == 0 - - -def compile_kv_model_on_cloud_ai_100( - onnx_path: str, - specializations_json: str, - num_cores: int, - base_path: str, - mxfp6: bool, - custom_io_path: str, - aic_enable_depth_first: bool, - mos: int = -1, - device_group: List[int] = [0], - **kwargs, -) -> bool: - import shutil - - aic_binary_dir = os.path.join(base_path, "qpcs") - - if os.path.isdir(aic_binary_dir): - shutil.rmtree(aic_binary_dir) - - assert os.path.isfile( - specializations_json - ), f"Please use 'from QEfficient.cloud.compile import main as compile', as {specializations_json} file was not found" - assert os.path.isfile(custom_io_path), f"{custom_io_path} file was not found!" 
- command = [ - "/opt/qti-aic/exec/qaic-exec", - f"-m={onnx_path}", - "-aic-hw", - "-aic-hw-version=2.0", - f"-network-specialization-config={specializations_json}", - "-convert-to-fp16", - "-retained-state", - f"-aic-num-cores={num_cores}", - f"-custom-IO-list-file={custom_io_path}", - "-compile-only", - f"-aic-binary-dir={aic_binary_dir}", - ] - if mxfp6: - command.append("-mxfp6-matmul") - if mos > 0: - command.append(f"-mos={mos}") - if aic_enable_depth_first: - command.append("-aic-enable-depth-first") - if len(device_group) > 1: - mdp_ts_config = { - "connections": [{"devices": list(range(len(device_group))), "type": "p2p"}], - "partitions": [ - { - "name": "Partition0", - "devices": [{"deviceId": device, "numCores": num_cores} for device in range(len(device_group))], - } - ], - } - mdp_ts_config_path = os.path.join(base_path, "mdp_ts_config.json") - with open(mdp_ts_config_path, "w") as file: - json.dump(mdp_ts_config, file, indent=4) - command.append(f"-mdp-load-partition-config={mdp_ts_config_path}") - print("Running AI 100 compiler:", " ".join(command)) - result = subprocess.run(command, capture_output=True, text=True) - if result.returncode != 0: - raise RuntimeError(f"Compilation Failed!!\n\nSTDOUT\n{result.stdout}\n\nSTDERR\n{result.stderr}") - - print("\n===================== Compilation Done! =====================\n") - return result.returncode == 0, aic_binary_dir diff --git a/QEfficient/loader/loader.py b/QEfficient/loader/loader.py index 99295555f..c0da35421 100644 --- a/QEfficient/loader/loader.py +++ b/QEfficient/loader/loader.py @@ -5,6 +5,12 @@ # # ----------------------------------------------------------------------------- +""" +MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP dictionary defines the mapping between names of the varities of Transformer model defined in +QEFF_MODEL_TYPE and the classes that implement the methods i.e.(compile, export etc.) for those types. + +QEFFAutoModel provides a common interface for loading the HuggingFace models using either the HF card name of local path of downloaded model. +""" import os from typing import Any, Dict, Type diff --git a/QEfficient/loader/loader_factory.py b/QEfficient/loader/loader_factory.py index 23d210f30..ede3a6c85 100644 --- a/QEfficient/loader/loader_factory.py +++ b/QEfficient/loader/loader_factory.py @@ -5,6 +5,23 @@ # # ---------------------------------------------------------------------------- +""" +** This file for holds the classes that handle main functions +1.load i.e. from_pretrained +2.execute +3.transform +4.export +5.compile +For different varities of Transformer Models + +** Each variety of the Transformer model that has different way of doing any of the above functions will have it's own class i.e. +following models type will have their own class which must inherit QEFFBaseModel abstract class. +1.Causal Language Models +2.Diffusion +3.Quantized models + +** QEFFBASEModel is abstract base class that defines the basic structure of these classes. +""" from abc import ABC, abstractmethod from enum import Enum from typing import Any @@ -18,6 +35,9 @@ class QEFF_MODEL_TYPE(Enum): + """ + Defines Names of the different varities of transformer models. 
+ """ CAUSALLM = "LLM" DIFFUSION = "STABLE_DIFFUSION" AWQ = "AWQ" diff --git a/QEfficient/transformers/transform.py b/QEfficient/transformers/transform.py index 413e9f6fe..aaddd75fa 100644 --- a/QEfficient/transformers/transform.py +++ b/QEfficient/transformers/transform.py @@ -10,9 +10,8 @@ import torch.nn as nn import transformers -from QEfficient.loader.loader_factory import QEFF_MODEL_TYPE from QEfficient.loader.loader import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP -from QEfficient.loader.loader_factory import QEFFBaseModel +from QEfficient.loader.loader_factory import QEFF_MODEL_TYPE, QEFFBaseModel from QEfficient.transformers.modeling_attn_mask_utils import ( QEffAttentionMaskConverter, _qeff_prepare_4d_attention_mask, diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 25eb52616..4c2ad177c 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -119,12 +119,13 @@ def onnx_exists(model_name: str) -> Tuple[bool, str, str]: return onnx_exists_bool, onnx_dir_path, onnx_model_path -def load_hf_tokenizer(model_name: str, cache_dir: Optional[str] = None, hf_token: Optional[str] = None) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: +def load_hf_tokenizer(model_name: str, cache_dir: Optional[str] = None, hf_token: Optional[str] = None, padding_side:str = "left", **kwargs) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: logger.info(f"Loading Tokenizer for {model_name}") if hf_token is not None: login(hf_token) # Download tokenizer along with model if it doesn't exist model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"]) - tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left", trust_remote_code=True) + #FIXME(ochougul): should this always return left padded tokenizer? + tokenizer = AutoTokenizer.from_pretrained(model_hf_path, padding_side=padding_side, trust_remote_code=True, **kwargs) return tokenizer diff --git a/README.md b/README.md index 430a058c0..634e3add3 100644 --- a/README.md +++ b/README.md @@ -118,8 +118,8 @@ In summary: | High Level APIs | Sample use | Arguments | |-----------------|------------|-------------------| -| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • mxint8 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • | -| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexcute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • prompt : Optional [Default-"My name is"]
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • | +| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • mxint8 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • verbose : Optional
  • | +| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexcute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • prompt : Optional [Default-"My name is"]
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • | **One argument, prompt or prompts_txt_file_path must be passed.** @@ -187,46 +187,40 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 | Low Level APIs | Sample use | Arguments | |-----------------|------------|-------------------| -| QEfficient.transform | [click here](#1-model-download-and-transform) |
  • model : $\color{green} {Mandatory}$
  • Type : Optional [Default- "Transformers"]
  • form_factor : Optional [Default-"cloud"]
  • | -| qualcomm_efficient_converter | [click here](#2-onnx-export-of-transformed-model) |
  • mode_name : $\color{green} {Mandatory}$
  • model_kv : $\color{green} {Mandatory}$ [Optional when model_class passed]
  • model_class : $\color{green} {Mandatory}$ [Optional when model_kv passed]
  • tokenizer : Optional
  • onnx_path : Optional
  • hf_token : Optional
  • seq_length : Optional [Default-128]
  • input_str : Optional [Default-"My name is"]
  • kv : Optional [Default-$\color{green} {True}$]
  • return_path : Optional [Default-False]
  • form_factor : Optional [Default-"cloud"]
  • save_fp32_onnx : Optional [Default-False]
  • save_fp16_onnx : Optional [Default-True]
  • *Both save_fp32_onnx and save_fp16_onnx can't be false*
  • | -| compile | [click here](#3-compile-on-cloud-ai-100) |
  • onnx_path : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional [Default-True]
  • | +| QEfficient.transform | [click here](#1-model-download-and-transform) |
  • model : $\color{green} {Mandatory}$
  • form_factor : Optional [Default-"cloud"]
  • | +| QEfficient.export | [click here](#2-onnx-export-of-transformed-model) |
  • model_name : $\color{green} {Mandatory}$
  • model_kv : Optional
  • tokenizer : Optional
  • onnx_path : Optional
  • hf_token : Optional
  • seq_length : Optional [Default-128]
  • kv : Optional [Default-$\color{green} {True}$]
  • return_path : Optional [Default-False]
  • form_factor : Optional [Default-"cloud"]
  • ***save_fp32_onnx : Optional [Default-False]
  • ***save_fp16_onnx : Optional [Default-True]
  • | +| QEfficient.compile | [click here](#3-compile-on-cloud-ai-100) |
  • onnx_path : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • aic_enable_depth_first : Optional [Default-False]
  • mos : Optional [Default= -1]
  • mxint8 : Optional [Default-False]
  • mxfp6 : Optional [Default-True]
  • | |cloud_ai_100_exec_kv | [click here](#4-run-benchmark) |
  • batch_size : $\color{green} {Mandatory}$
  • tokenizer : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • | -**One argument, prompt or prompts_txt_file_path must be passed. - +**One argument, prompt or prompts_txt_file_path must be passed.
    +***Both save_fp32_onnx and save_fp16_onnx can't be false. ### 1. Model download and transform Initialize QEfficient and transform the models, Check the list of supported architectures in the repo. -```bash +```Python # Initiate the Orignal Transformer model import os -from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel + + import QEfficient -from transformers import AutoTokenizer -from QEfficient.utils import hf_download -from QEfficient.utils.constants import Constants +from QEfficient import QEFFAutoModel + # Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir. # os.environ["TRANSFORMERS_CACHE"] = "/local/mnt/workspace/hf_cache" -ROOT_DIR = os.path.dirname(os.path.abspath("")) +#ROOT_DIR = os.path.dirname(os.path.abspath("")) +#CACHE_DIR = os.path.join(ROOT_DIR, "tmp"), you can use a different location for just one model by passing this param as cache_dir in below API. # Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl +model_name = "gpt2" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib. -model_name = "gpt2" - -# Similar, we can change model name and generate corresponding models, if we have added the support in the lib. - -model_hf_path = hf_download(repo_id=model_name, cache_dir=Constants.CACHE_DIR, ignore_pattrens=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"]) -model_hf = GPT2LMHeadModel.from_pretrained(model_hf_path, use_cache=True) -model_hf.eval() -print(f"{model_name} from hugging-face \n", model_hf) +qeff_model = QEFFAutoModel.from_pretrained(model_name, cache_dir=None) +print(f"{model_name} from hugging-face \n", qeff_model) # Easy and minimal api to update the model -model_transformed = QEfficient.transform(model_hf, type="Transformers", form_factor="cloud") - -model_transformed.eval() +model_transformed = QEfficient.transform(qeff_model, form_factor="cloud") print("Model after Optimized transformations \n", model_transformed) ``` @@ -234,31 +228,27 @@ print("Model after Optimized transformations \n", model_transformed) use the qualcomm_efficient_converter API to export the KV transformed Model to ONNX and Verify on Torch. -```bash -from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter - -# We can now export the modified models to ONNX framework -# This will generate single ONNX Model for both Prefill and Decode Variations which are optimized for +```Python +from QEfficient.utils import load_hf_tokenizer +# We can now export the modified models to Onnx framework +# This will generate single Onnx Model for both Prefill and Decode Variations which are optimized for # Cloud AI 100 Platform. -# This will generate ONNX model, clip the overflow constants to fp16 -# Verify the model on ONNXRuntime vs Pytorch -# Then generate inputs and custom_io.yaml file required for compilation. +# This will generate Onnx model, clip the overflow constants to fp16 +# Verify the model on Onnxruntime vs Pytorch +# Then generate inputs and customio yaml file required for compilation. # We can generate the KV Style models with the flag "kv" # Bertstyle models do not have any optimization w.r.t KV cache changes and are unoptimized version. # It is recommended to use kv=True for better performance. - -# For custom models defined on the Hub in their own modeling files. 
We need `trust_remote_code` option -# Should be set to `True` in `AutoTokenizer` for repositories you trust. -tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left") -base_path, onnx_path = qualcomm_efficient_converter( - model_kv=model_transformed, +tokenizer = load_hf_tokenizer(model_name, use_cache=True) +base_path, onnx_path = QEfficient.export( model_name=model_name, + model_kv=model_transformed, + tokenizer=tokenizer, kv=True, form_factor="cloud", return_path=True, - tokenizer=tokenizer, ) ``` @@ -266,29 +256,28 @@ base_path, onnx_path = qualcomm_efficient_converter( Once, the model is exported, Compile the model on Cloud AI 100 and generate QPC. -```bash +```Python # Please use platform SDk to Check num_cores for your card. -from QEfficient.cloud.compile import main as compile -generated_qpc_path = compile( +generated_qpc_path = QEfficient.compile( onnx_path=onnx_path, num_cores=14, - qpc_path=base_path, + qpc_path=os.path.dirname(base_path), + mxfp6=False, device_group=[0], - mxfp6=True, ) ``` ### 4. Run Benchmark Benchmark the model on Cloud AI 100, run the infer API to print tokens and tok/sec -```bash +```Python from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size # post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100 # We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach batch_size = get_compilation_batch_size(generated_qpc_path) -cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt="My name is") +cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=["My name is"]) ``` End to End demo examples for various models are available in **notebooks** directory. Please check them out. 
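For reference, below is a minimal end-to-end sketch that simply chains the README snippets above (load → transform → export → compile → execute) into one script. It assumes the `QEFFAutoModel`, `QEfficient.transform`/`export`/`compile`, and `cloud_ai_100_exec_kv` APIs introduced in this patch series; the `num_cores` and `device_group` values are illustrative placeholders for whatever the target Cloud AI 100 card supports.

```Python
# Minimal sketch chaining the README steps from this patch series into one script.
# num_cores/device_group below are illustrative; check your card with the platform SDK.
import os

import QEfficient
from QEfficient import QEFFAutoModel
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size
from QEfficient.utils import load_hf_tokenizer

model_name = "gpt2"

# 1. Load the HF model through the new loader and apply the Cloud AI 100 transforms.
qeff_model = QEFFAutoModel.from_pretrained(model_name)
qeff_model = QEfficient.transform(qeff_model, form_factor="cloud")

# 2. Export the KV-style model to ONNX.
tokenizer = load_hf_tokenizer(model_name, use_cache=True)
base_path, onnx_path = QEfficient.export(
    model_name=model_name,
    model_kv=qeff_model,
    tokenizer=tokenizer,
    kv=True,
    form_factor="cloud",
    return_path=True,
)

# 3. Compile the exported ONNX model into a QPC.
generated_qpc_path = QEfficient.compile(
    onnx_path=onnx_path,
    qpc_path=os.path.dirname(base_path),
    num_cores=14,
    device_group=[0],
    mxfp6=False,
)

# 4. Run greedy decoding on Cloud AI 100 and print tokens plus latency stats.
batch_size = get_compilation_batch_size(generated_qpc_path)
cloud_ai_100_exec_kv(
    batch_size=batch_size,
    tokenizer=tokenizer,
    qpc_path=generated_qpc_path,
    device_id=[0],
    prompt=["My name is"],
)
```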
diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 668a3b473..37d5ebb40 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -26,28 +26,19 @@ "# Initiate the Orignal Transformer model\n", "import os\n", "\n", - "from transformers import AutoTokenizer\n", - "from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel\n", - "\n", - "from QEfficient.utils import hf_download\n", - "from QEfficient.utils.constants import Constants\n", + "from QEfficient import QEFFAutoModel\n", "\n", "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n", "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n", "\n", - "ROOT_DIR = os.path.dirname(os.path.abspath(\"\"))\n", + "#ROOT_DIR = os.path.dirname(os.path.abspath(\"\"))\n", + "#CACHE_DIR = os.path.join(ROOT_DIR, \"tmp\"), you can use a different location for just one model by passing this param as cache_dir in below API.\n", "\n", "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n", "model_name = \"gpt2\" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n", "\n", - "model_hf_path = hf_download(\n", - " repo_id=model_name,\n", - " cache_dir=Constants.CACHE_DIR,\n", - " ignore_patterns=[\"*.txt\", \"*.onnx\", \"*.ot\", \"*.md\", \"*.tflite\", \"*.pdf\"],\n", - ")\n", - "model_hf = GPT2LMHeadModel.from_pretrained(model_hf_path, use_cache=True)\n", - "model_hf.eval()\n", - "print(f\"{model_name} from hugging-face \\n\", model_hf)" + "qeff_model = QEFFAutoModel.from_pretrained(model_name, cache_dir=None)\n", + "print(f\"{model_name} from hugging-face \\n\", qeff_model)" ] }, { @@ -75,9 +66,8 @@ "import QEfficient\n", "\n", "# Easy and minimal api to update the model\n", - "model_transformed = QEfficient.transform(model_hf, type=\"Transformers\", form_factor=\"cloud\")\n", + "model_transformed = QEfficient.transform(qeff_model, form_factor=\"cloud\")\n", "\n", - "model_transformed.eval()\n", "print(\"Model after Optimized transformations \\n\", model_transformed)" ] }, @@ -96,8 +86,7 @@ "metadata": {}, "outputs": [], "source": [ - "from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter\n", - "\n", + "from QEfficient.utils import load_hf_tokenizer\n", "# We can now export the modified models to Onnx framework\n", "# This will generate single Onnx Model for both Prefill and Decode Variations which are optimized for\n", "# Cloud AI 100 Platform.\n", @@ -109,14 +98,14 @@ "# We can generate the KV Style models with the flag \"kv\"\n", "# Bertstyle models do not have any optimization w.r.t KV cache changes and are unoptimized version.\n", "# It is recommended to use kv=True for better performance.\n", - "tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side=\"left\")\n", - "base_path, onnx_path = qualcomm_efficient_converter(\n", - " model_kv=model_transformed,\n", + "tokenizer = load_hf_tokenizer(model_name, use_cache=True)\n", + "base_path, onnx_path = QEfficient.export(\n", " model_name=model_name,\n", + " model_kv=model_transformed,\n", + " tokenizer=tokenizer,\n", " kv=True,\n", " form_factor=\"cloud\",\n", " return_path=True,\n", - " tokenizer=tokenizer,\n", ")" ] }, @@ -136,13 +125,12 @@ "outputs": [], "source": [ "# Please use platform SDk to Check num_cores for your card.\n", - "from QEfficient.cloud.compile import main as 
compile\n", "\n", - "generated_qpc_path = compile(\n", + "generated_qpc_path = QEfficient.compile(\n", " onnx_path=onnx_path,\n", " num_cores=14,\n", - " qpc_path=base_path,\n", - " mxfp6=True,\n", + " qpc_path=os.path.dirname(base_path),\n", + " mxfp6=False,\n", " device_group=[0],\n", ")" ] @@ -166,8 +154,8 @@ "\n", "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", - "batch_size = get_compilation_batch_size(generated_qpc_path)\n" - "cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")" + "batch_size = get_compilation_batch_size(generated_qpc_path)\n", + "cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])" ] } ], diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index 8533eedcc..9ca0c389a 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -26,27 +26,18 @@ "# Initiate the Orignal Transformer model\n", "import os\n", "\n", - "from transformers import AutoTokenizer\n", - "from transformers.models.mpt.modeling_mpt import MptForCausalLM\n", - "\n", - "from QEfficient.utils import hf_download\n", - "from QEfficient.utils.constants import Constants\n", + "from QEfficient import QEFFAutoModel\n", "\n", "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n", "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n", "\n", - "ROOT_DIR = os.path.dirname(os.path.abspath(\"\"))\n", + "#ROOT_DIR = os.path.dirname(os.path.abspath(\"\"))\n", + "#CACHE_DIR = os.path.join(ROOT_DIR, \"tmp\"), you can use a different location for just one model by passing this param as cache_dir in below API.\n", "\n", "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n", "model_name = \"mosaicml/mpt-7b\" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n", - "model_hf_path = hf_download(\n", - " repo_id=model_name,\n", - " cache_dir=Constants.CACHE_DIR,\n", - " ignore_patterns=[\"*.txt\", \"*.onnx\", \"*.ot\", \"*.md\", \"*.tflite\", \"*.pdf\"],\n", - ")\n", - "model_hf = MptForCausalLM.from_pretrained(model_hf_path, use_cache=True)\n", - "model_hf.eval()\n", - "print(f\"{model_name} from hugging-face \\n\", model_hf)" + "qeff_model = QEFFAutoModel.from_pretrained(model_name)\n", + "print(f\"{qeff_model} from hugging-face \\n\", qeff_model)" ] }, { @@ -74,7 +65,7 @@ "import QEfficient\n", "\n", "# Easy and minimal api to update the model\n", - "model_transformed = QEfficient.transform(model_hf, type=\"Transformers\", form_factor=\"cloud\")\n", + "model_transformed = QEfficient.transform(qeff_model, form_factor=\"cloud\")\n", "\n", "model_transformed.eval()\n", "print(\"Model after Optimized transformations \\n\", model_transformed)" @@ -95,7 +86,7 @@ "metadata": {}, "outputs": [], "source": [ - "from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter\n", + "from QEfficient.utils import load_hf_tokenizer\n", "\n", "# We have the utils to export the modified models to Onnx framework\n", "# This will generate single Onnx Model for both Prefill and Decode Variations which are 
optimized for\n", @@ -107,14 +98,14 @@ "\n", "# We can generate both bertstyle and KV Style models with the flag \"kv\"\n", "# Bertstyle models do not have any optimization w.r.t KV cache changes and are unoptimized version.\n", - "tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side=\"left\")\n", - "base_path, onnx_path = qualcomm_efficient_converter(\n", - " model_kv=model_transformed,\n", + "tokenizer = load_hf_tokenizer(model_hf_path, use_cache=True, padding_side=\"left\")\n", + "base_path, onnx_path = QEfficient.export(\n", " model_name=model_name,\n", + " model_kv=model_transformed,\n", + " tokenizer=tokenizer,\n", " kv=True,\n", " form_factor=\"cloud\",\n", " return_path=True,\n", - " tokenizer=tokenizer,\n", ")" ] }, @@ -134,12 +125,11 @@ "outputs": [], "source": [ "# Please use platform SDk to Check num_cores for your card.\n", - "from QEfficient.cloud.compile import main as compile\n", "\n", - "generated_qpc_path = compile(\n", + "generated_qpc_path = QEfficient.compile(\n", " onnx_path=onnx_path,\n", " num_cores=14,\n", - " qpc_path=base_path,\n", + " qpc_path=os.path.dirname(base_path),\n", " mxfp6=True,\n", " device_group=[0],\n", ")" @@ -165,8 +155,7 @@ "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "\n", - "batch_size = get_compilation_batch_size(generated_qpc_path)" - "cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")" + "batch_size = get_compilation_batch_size(generated_qpc_path)cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])" ] } ], diff --git a/tests/test_loader.py b/tests/test_loader.py index 0d3df3a01..cc6aa050d 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -24,7 +24,7 @@ } model_names = model_name_to_params_dict.keys() - +#FIXME: Add test cases for passing cache_dir, pretrained_model_path instead of card name, etc., Passing other kwargs @pytest.mark.parametrize("model_name", model_names) def test_qeff_auto_model_for_causal_lm(model_name: str): model = QEFFAutoModel.from_pretrained(model_name) diff --git a/tests/utils.py b/tests/utils.py index f68dd20fb..0eeb94cd8 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -11,7 +11,7 @@ import unittest from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.exporter.export_utils import compile_kv_model_on_cloud_ai_100 +from QEfficient.cross_compile import compile_kv_model_on_cloud_ai_100 from QEfficient.loader.loader_factory import QEFFAutoModelForCausalLM from QEfficient.transformers.transform import transform_lm from QEfficient.utils import hf_download, load_hf_tokenizer From f00226103de85b7007da7fd9fb27b94cd7e97b13 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Fri, 31 May 2024 16:18:32 +0530 Subject: [PATCH 11/20] bug-fix infer Signed-off-by: Onkar Chougule --- QEfficient/cloud/infer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index e72a3329b..537ce2663 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -216,4 +216,5 @@ def main( args = parser.parse_args() if args.verbose: logger.setLevel(logging.INFO) + del args.verbose # type: ignore main(**args.__dict__) From 
9eed62feb0ec3a43a851b7d254667ef1f60e20c7 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Fri, 31 May 2024 16:20:52 +0530 Subject: [PATCH 12/20] using QEfficient.export, compile in cloud APIs Signed-off-by: Onkar Chougule --- QEfficient/cloud/export.py | 3 +-- QEfficient/cloud/infer.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index 2b7201c8e..30f93680b 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -10,7 +10,6 @@ from typing import Optional import QEfficient -from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.loader import QEFFAutoModel from QEfficient.utils import load_hf_tokenizer, onnx_exists from QEfficient.utils.constants import Constants @@ -46,7 +45,7 @@ def main( # Export to the Onnx print(f"Exporting to Pytorch {model_name} to Onnx") - base_path, onnx_path = qualcomm_efficient_converter( + base_path, onnx_path = QEfficient.export( model_kv=qeff_model, model_name=model_name, tokenizer=tokenizer, diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 537ce2663..ce05e39fa 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -69,7 +69,7 @@ def main( logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation") # Compile -> execute # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation - generated_qpc_path = compile( + generated_qpc_path = QEfficient.compile( onnx_path=onnx_model_path, qpc_path=os.path.dirname(qpc_dir_path), num_cores=num_cores, From 605ff865f528f29a3ec4b886320cee642fdb51fc Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Mon, 3 Jun 2024 23:56:17 +0530 Subject: [PATCH 13/20] cleaner infer,epxport APIs Signed-off-by: Onkar Chougule --- QEfficient/cloud/export.py | 63 ++++++----- QEfficient/cloud/infer.py | 107 +++++------------- .../exporter/export_hf_to_cloud_ai_100.py | 26 ++--- QEfficient/utils/__init__.py | 1 + QEfficient/utils/_utils.py | 11 ++ 5 files changed, 86 insertions(+), 122 deletions(-) diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index 30f93680b..51d66570c 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -7,11 +7,12 @@ import argparse import os -from typing import Optional +from typing import Optional, Union -import QEfficient -from QEfficient.loader import QEFFAutoModel -from QEfficient.utils import load_hf_tokenizer, onnx_exists +from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast + +from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter +from QEfficient.utils import onnx_exists from QEfficient.utils.constants import Constants from QEfficient.utils.logging_utils import logger @@ -19,6 +20,36 @@ ROOT_DIR = os.path.dirname(os.path.abspath("")) +def get_onnx_model_path(model_name: str, cache_dir: str, tokenizer: Optional[Union[PreTrainedTokenizerFast, PreTrainedTokenizer]]=None, hf_token: Optional[str] = None): + """ + exports the model to onnx if pre-exported file is not found and returns onnx_model_path + """ + onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name) + if onnx_path_exists: + logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! 
Jumping to Compilation") + else: + ################### + # hf model -> export + #################### + # Export to the Onnx + logger.info(f"Exporting Pytorch {model_name} model to ONNX...") + _, generated_onnx_model_path = qualcomm_efficient_converter( + model_name=model_name, + tokenizer=tokenizer, + onnx_dir_path=onnx_dir_path, + kv=True, + form_factor="cloud", + return_path=True, + hf_token=hf_token, + cache_dir=cache_dir + ) # type: ignore + logger.info(f"Generated Onnx_path {generated_onnx_model_path} \nOnnx_model_path {onnx_model_path} \nand Onnx_dir_path is {onnx_dir_path}") + assert ( + generated_onnx_model_path == onnx_model_path + ), f"ONNX files were generated at an unusual location, expected {onnx_model_path}, got {generated_onnx_model_path}" + return onnx_model_path + + def main( model_name: str, cache_dir: str, @@ -31,29 +62,7 @@ def main( :cache_dir: str. Cache dir to store the downloaded huggingface files. :hf_token: str. HuggingFace login token to access private repos. """ - onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name) - if onnx_path_exists: - logger.warning(f"Generated Onnx files found {onnx_model_path}! Please use Infer/Compile Apis()") - return - - tokenizer = load_hf_tokenizer(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token) - qeff_model = QEFFAutoModel.from_pretrained(pretrained_model_name_or_path=model_name, cache_dir=cache_dir, hf_token=hf_token) - - # Easy and minimal api to update the model to QEff. - QEfficient.transform(qeff_model, form_factor="cloud") - print(f"Model after Optimized transformations {qeff_model}") - - # Export to the Onnx - print(f"Exporting to Pytorch {model_name} to Onnx") - base_path, onnx_path = QEfficient.export( - model_kv=qeff_model, - model_name=model_name, - tokenizer=tokenizer, - kv=True, - form_factor="cloud", - return_path=True, - ) # type: ignore - print(f"Base Path is {base_path} and Onnx Model Path is : {onnx_path}") + get_onnx_model_path(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token) if __name__ == "__main__": diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index ce05e39fa..c3ce86a98 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -11,13 +11,12 @@ from typing import List, Optional import QEfficient -from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter +from QEfficient.cloud.export import get_onnx_model_path from QEfficient.generation.text_generation_inference import ( check_batch_size_and_num_prompts, cloud_ai_100_exec_kv, ) -from QEfficient.loader import QEFFAutoModel -from QEfficient.utils import load_hf_tokenizer, onnx_exists, qpc_exists +from QEfficient.utils import get_qpc_dir_name_infer, load_hf_tokenizer, qpc_exists from QEfficient.utils.constants import Constants from QEfficient.utils.logging_utils import logger @@ -47,95 +46,43 @@ def main( 0, ], ) -> None: - qpc_base_dir_name = ( - f"qpc_{num_cores}cores_{batch_size}BS_{prompt_len}PL_{ctx_len}CL_{mos}MOS_" - + f"{len(device_group)}" - + "devices" - + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else "_fp16_mxint8" if mxint8 else "_fp16") - ) - + qpc_base_dir_name = get_qpc_dir_name_infer(num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group) prompt: List[str] = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) - - # Get tokenizer tokenizer = load_hf_tokenizer(model_name=model_name, cache_dir=cache_dir, hf_token=hf_token) qpc_path_exists, qpc_dir_path = 
qpc_exists(model_name, qpc_base_dir_name) - onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name) - + # Handle qpc generation if qpc_path_exists: - # execute logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! Executing with given prompt") - elif onnx_path_exists: - logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation") - # Compile -> execute - # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation - generated_qpc_path = QEfficient.compile( - onnx_path=onnx_model_path, - qpc_path=os.path.dirname(qpc_dir_path), - num_cores=num_cores, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, - mxfp6=mxfp6, - mxint8=mxint8, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - ) - assert ( - generated_qpc_path == qpc_dir_path - ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" else: - ############################################# - # hf model -> export -> compile -> execute - ############################################# - # Load hf model - qeff_model = QEFFAutoModel.from_pretrained(pretrained_model_name_or_path=model_name, cache_dir=cache_dir, hf_token=hf_token) - - # Easy and minimal api to update the model to QEff. - qeff_opt_model = QEfficient.transform(qeff_model, form_factor="cloud") - logger.info(f"Model after Optimized transformations {qeff_opt_model}") - - # Export to the Onnx - logger.info(f"Exporting Pytorch {model_name} model to ONNX...") - # Need to split below function into two functions one which always takes QEFFAutoModel and other with same interface as below - base_path, generated_onnx_path = qualcomm_efficient_converter( - model_name=model_name, - model_kv=qeff_opt_model, # type: ignore - tokenizer=tokenizer, - onnx_dir_path=onnx_dir_path, - kv=True, - form_factor="cloud", - return_path=True, - ) # type: ignore - print(f"Generated Onnx_path {generated_onnx_path} and Onnx_model_path {onnx_model_path} and Onnx_dir_path is {onnx_dir_path}") - assert ( - generated_onnx_path == onnx_model_path - ), f"ONNX files were generated at an unusual location, expected {onnx_model_path}, got {generated_onnx_path}" - logger.info(f"Base Path is {base_path} and Onnx Model Path is : {generated_onnx_path}") + # ################## + # HF model -> export + #################### + onnx_model_path = get_onnx_model_path(model_name, cache_dir, tokenizer, hf_token) + ######### # Compile - # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation + ######### generated_qpc_path = QEfficient.compile( - onnx_path=onnx_model_path, - qpc_path=os.path.dirname(qpc_dir_path), - num_cores=num_cores, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, - mxfp6=mxfp6, - mxint8=mxint8, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - ) + onnx_path=onnx_model_path, + qpc_path=os.path.dirname(qpc_dir_path), # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation + num_cores=num_cores, + batch_size=batch_size, + prompt_len=prompt_len, + ctx_len=ctx_len, + mxfp6=mxfp6, + mxint8=mxint8, + aic_enable_depth_first=aic_enable_depth_first, + mos=mos, + device_group=device_group, + ) assert ( - qpc_dir_path == generated_qpc_path - ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" - 
logger.info(f"Compiled qpc files can be found at : {generated_qpc_path}") - - + generated_qpc_path == qpc_dir_path + ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" + + ######### # Execute + ######### cloud_ai_100_exec_kv( batch_size, tokenizer=tokenizer, diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index d5da3f422..38e0e34d5 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -365,19 +365,12 @@ def export_kvstyle_transformed_model_to_onnx(model_name: str, transformed_model: return fp32_model_name, fp16_model_name -def export_for_edge() -> None: - # [TODO]: Apply the class transformation to make changes for the KV models in edge use cases - # model = QEfficient.transform(model_hf, type="Transformers", form_factor="edge") - # model.eval() - raise NotImplementedError("Oops...reached too far!!") - - def export_for_cloud(model_name: str, qeff_model: QEFFBaseModel, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], onnx_dir_path: str, seq_length: int = Constants.seq_length, return_path: bool = True, save_fp32_onnx: bool = False, - save_fp16_onnx: bool = True): + save_fp16_onnx: bool = True)-> Tuple[str, str]: if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: # type: ignore return export_lm_model_for_cloud(model_name=model_name, qeff_model=qeff_model, # type: ignore @@ -443,6 +436,7 @@ def qualcomm_efficient_converter( model_name: str, model_kv: QEFFBaseModel = None, # type: ignore tokenizer: Optional[Union[PreTrainedTokenizer, PreTrainedTokenizerFast]]=None, + cache_dir: Optional[str] = None, onnx_dir_path: Optional[str]=None, hf_token: Optional[str] = None, seq_length: int = Constants.seq_length, @@ -451,17 +445,17 @@ def qualcomm_efficient_converter( form_factor: str="cloud", save_fp32_onnx: bool = False, save_fp16_onnx: bool = True, -) -> Union[Tuple[str, str], None]: +) -> Tuple[str, str]: """ Function to convert the input string using the specified model and returns the result. Args: model_name (str): The name of the model to be used. - model_class (type): The class of the model. model_kv (torch.nn.Module): Transformed KV torch model to be used tokenizer (HF AutoTokenizer): Tokenzier to prepare inputs. + cache_dir (str): Path to cache dir if not specified, default HF cache_dir will be used. onnx_dir_path (str, optional): The path where the model is stored. If None, the model is loaded from the default location. - token (bool): If True, an authentication token will be used. Default is False. + hf_token (bool): If True, an authentication token will be used. Default is False. seq_len (int, optional): The length of the sequence. Default is 128. kv (bool): If True, key-value pairs will be used. Default is True. 
return_path (bool): If True, return the base path for models and exported onnx model path @@ -473,7 +467,7 @@ def qualcomm_efficient_converter( """ # Get model_kv first - model_kv = model_kv if model_kv else QEFFAutoModel.from_pretrained(pretrained_model_name_or_path=model_name, hf_token=hf_token) + model_kv = model_kv if model_kv else QEFFAutoModel.from_pretrained(pretrained_model_name_or_path=model_name, hf_token=hf_token, cache_dir=cache_dir) # Transform if required if model_kv.is_transformed and not kv: @@ -481,13 +475,12 @@ def qualcomm_efficient_converter( model_kv = model_kv if model_kv.is_transformed else QEfficient.transform(model_kv) if kv else model_kv - if onnx_dir_path is None: model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) onnx_dir_path = os.path.join(model_card_dir, "onnx") # Load tokenizer if not passed - tokenizer = load_hf_tokenizer(model_name=model_name, hf_token=hf_token) if tokenizer is None else tokenizer + tokenizer = tokenizer if tokenizer else load_hf_tokenizer(model_name=model_name, hf_token=hf_token, cache_dir=cache_dir) if form_factor == "cloud": return export_for_cloud( @@ -500,4 +493,7 @@ def qualcomm_efficient_converter( save_fp16_onnx=save_fp16_onnx, save_fp32_onnx=save_fp32_onnx) else: - return export_for_edge() + # [TODO]: Apply the class transformation to make changes for the KV models in edge use cases + # model = QEfficient.transform(model_hf, type="Transformers", form_factor="edge") + # model.eval() + raise NotImplementedError("Oops! Reached too far!!") diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 7a3cd4959..bd6b59120 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -6,6 +6,7 @@ # ----------------------------------------------------------------------------- from QEfficient.utils._utils import ( # noqa: F401 + get_qpc_dir_name_infer, hf_download, load_hf_tokenizer, login_and_download_hf_lm, diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 4c2ad177c..7a0d85828 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -129,3 +129,14 @@ def load_hf_tokenizer(model_name: str, cache_dir: Optional[str] = None, hf_token #FIXME(ochougul): should this always return left padded tokenizer? 
tokenizer = AutoTokenizer.from_pretrained(model_hf_path, padding_side=padding_side, trust_remote_code=True, **kwargs) return tokenizer + + +def get_qpc_dir_name_infer(num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group): + qpc_base_dir_name = ( + f"qpc_{num_cores}cores_{batch_size}BS_{prompt_len}PL_{ctx_len}CL_{mos}MOS_" + + f"{len(device_group)}" + + "devices" + + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else "_fp16_mxint8" if mxint8 else "_fp16") + ) + + return qpc_base_dir_name From b2c5fc7275b5efed3ba1c149fd281b48c96f631f Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 4 Jun 2024 19:37:53 +0530 Subject: [PATCH 14/20] addressed review comments Signed-off-by: Onkar Chougule --- QEfficient/__init__.py | 4 +- QEfficient/cloud/infer.py | 4 +- QEfficient/{ => compile}/cross_compile.py | 0 .../exporter/export_hf_to_cloud_ai_100.py | 14 +- QEfficient/loader/loader_factory.py | 130 ------------------ QEfficient/{loader => src}/__init__.py | 3 +- QEfficient/src/_transformers/__init__.py | 0 QEfficient/src/_transformers/auto.py | 111 +++++++++++++++ QEfficient/src/base.py | 47 +++++++ .../{loader/loader.py => src/common.py} | 16 ++- QEfficient/transformers/transform.py | 5 +- tests/test_loader.py | 5 +- tests/utils.py | 4 +- 13 files changed, 190 insertions(+), 153 deletions(-) rename QEfficient/{ => compile}/cross_compile.py (100%) delete mode 100644 QEfficient/loader/loader_factory.py rename QEfficient/{loader => src}/__init__.py (63%) create mode 100644 QEfficient/src/_transformers/__init__.py create mode 100644 QEfficient/src/_transformers/auto.py create mode 100644 QEfficient/src/base.py rename QEfficient/{loader/loader.py => src/common.py} (91%) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 9804c4ea1..aac0bcd29 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -5,9 +5,9 @@ # # ----------------------------------------------------------------------------- -from QEfficient.cross_compile import compile # noqa: F401 +from QEfficient.compile.cross_compile import compile # noqa: F401 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.loader import QEFFAutoModel # noqa: F401 +from QEfficient.src import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader # noqa: F401 from QEfficient.transformers.transform import transform # noqa: F401 # Users can use QEfficient.export for exporting models to ONNX diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index c3ce86a98..58fe96660 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -55,9 +55,7 @@ def main( if qpc_path_exists: logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! 
Executing with given prompt") else: - # ################## - # HF model -> export - #################### + # Handle onnx model generation onnx_model_path = get_onnx_model_path(model_name, cache_dir, tokenizer, hf_token) ######### diff --git a/QEfficient/cross_compile.py b/QEfficient/compile/cross_compile.py similarity index 100% rename from QEfficient/cross_compile.py rename to QEfficient/compile/cross_compile.py diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index 38e0e34d5..d55fedffc 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -7,19 +7,16 @@ import os import shutil -from typing import Optional, Tuple, Type, Union +from typing import Optional, Tuple, Union import torch from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast import QEfficient from QEfficient.exporter.export_utils import export_onnx, fix_onnx_fp16, generate_input_files, run_model_on_ort -from QEfficient.loader.loader import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFFAutoModel -from QEfficient.loader.loader_factory import ( - QEFF_MODEL_TYPE, - QEFFAutoModelForCausalLM, - QEFFBaseModel, -) +from QEfficient.src._transformers.auto import QEFFAutoModelForCausalLM +from QEfficient.src.base import QEFFBaseModel +from QEfficient.src.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFCommonLoader from QEfficient.utils._utils import load_hf_tokenizer from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants from QEfficient.utils.logging_utils import logger @@ -371,6 +368,7 @@ def export_for_cloud(model_name: str, qeff_model: QEFFBaseModel, return_path: bool = True, save_fp32_onnx: bool = False, save_fp16_onnx: bool = True)-> Tuple[str, str]: + # FIXME: move all this to class instead of here, and just call qeff_model.export here. if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: # type: ignore return export_lm_model_for_cloud(model_name=model_name, qeff_model=qeff_model, # type: ignore @@ -467,7 +465,7 @@ def qualcomm_efficient_converter( """ # Get model_kv first - model_kv = model_kv if model_kv else QEFFAutoModel.from_pretrained(pretrained_model_name_or_path=model_name, hf_token=hf_token, cache_dir=cache_dir) + model_kv = model_kv if model_kv else QEFFCommonLoader.from_pretrained(pretrained_model_name_or_path=model_name, hf_token=hf_token, cache_dir=cache_dir) # Transform if required if model_kv.is_transformed and not kv: diff --git a/QEfficient/loader/loader_factory.py b/QEfficient/loader/loader_factory.py deleted file mode 100644 index ede3a6c85..000000000 --- a/QEfficient/loader/loader_factory.py +++ /dev/null @@ -1,130 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# ---------------------------------------------------------------------------- - -""" -** This file for holds the classes that handle main functions -1.load i.e. from_pretrained -2.execute -3.transform -4.export -5.compile -For different varities of Transformer Models - -** Each variety of the Transformer model that has different way of doing any of the above functions will have it's own class i.e. -following models type will have their own class which must inherit QEFFBaseModel abstract class. 
-1.Causal Language Models -2.Diffusion -3.Quantized models - -** QEFFBASEModel is abstract base class that defines the basic structure of these classes. -""" -from abc import ABC, abstractmethod -from enum import Enum -from typing import Any - -import torch.nn as nn -from transformers import AutoModelForCausalLM -from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING - -import QEfficient -from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict - - -class QEFF_MODEL_TYPE(Enum): - """ - Defines Names of the different varities of transformer models. - """ - CAUSALLM = "LLM" - DIFFUSION = "STABLE_DIFFUSION" - AWQ = "AWQ" - - -class QEFFBaseModel(ABC): - """ - This class acts as parent class for all the varieties of model class (i.e. LLMs, SD, quantized etc.). - Enforces certain methods to be implemented by child classes. - - All the child classes must provide way to load, transform(optimize), exoprt to ONNX etc. capabilities. - """ - def __init__(self) -> None: - super().__init__() - # Users can call generate or execute - self.generate = self.execute - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): - raise NotImplementedError("Must implement for child classes") - - @property - def is_transformed(self) -> bool: - raise NotImplementedError("Must implement for child classes") - - @abstractmethod - def transform_export(self, *args, **kwargs) -> Any: - pass - - @abstractmethod - def transform_export_compile(self, *args, **kwargs) -> Any: - pass - - @abstractmethod - def execute(self, *args, **kwargs) -> Any: - pass - - @abstractmethod - def transform(self, *args, **kwargs) -> Any: - pass - - @abstractmethod - def export(self, *args, **kwargs) -> Any: - pass - - @abstractmethod - def compile(self, *args, **kwargs) -> Any: - pass - - -class QEFFAutoModelForCausalLM(QEFFBaseModel): - """ - QEFF class for manipulating any causal language model from HuggingFace hub. - """ - def __init__(self, model: nn.Module, pretrained_model_name_or_path: str) -> None: - assert (model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values() or - model.__class__ in TransformersToQEffModulesDict.values()), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. 
{MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore - self.model: nn.Module = model - self.model_files_path = pretrained_model_name_or_path - - def __repr__(self) -> str: - return self.model.__repr__() - - @property - def is_transformed(self) -> bool: - return getattr(self.model, "qeff_transformed", False) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): - model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) - return cls(model=model, pretrained_model_name_or_path=pretrained_model_name_or_path) - - def transform_export(self, *args, **kwargs) -> Any: - raise NotImplementedError("Reached too far!!") - - def transform_export_compile(self, *args, **kwargs) -> Any: - raise NotImplementedError("Reached too far!!") - - def execute(self, *args, **kwargs): # type: ignore - raise NotImplementedError("Reached too far!!") - - def transform(self): - QEfficient.transform(self) - return self - - def export(self): - raise NotImplementedError("Reached too far!!") - - def compile(self, *args, **kwargs) -> Any: - raise NotImplementedError("Reached too far!!") diff --git a/QEfficient/loader/__init__.py b/QEfficient/src/__init__.py similarity index 63% rename from QEfficient/loader/__init__.py rename to QEfficient/src/__init__.py index a17f497b5..854686567 100644 --- a/QEfficient/loader/__init__.py +++ b/QEfficient/src/__init__.py @@ -5,4 +5,5 @@ # # ----------------------------------------------------------------------------- -from QEfficient.loader.loader import QEFFAutoModel # noqa: F401 +from QEfficient.src._transformers.auto import QEffAutoModel, QEFFAutoModelForCausalLM # noqa: F401 +from QEfficient.src.common import QEFFCommonLoader # noqa: F401 diff --git a/QEfficient/src/_transformers/__init__.py b/QEfficient/src/_transformers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/QEfficient/src/_transformers/auto.py b/QEfficient/src/_transformers/auto.py new file mode 100644 index 000000000..68f039060 --- /dev/null +++ b/QEfficient/src/_transformers/auto.py @@ -0,0 +1,111 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- + +""" +** This file for holds the classes that handle main functions +1.load i.e. from_pretrained +2.execute +3.transform +4.export +5.compile +For different varities of Transformer Models + +Representation of class inheritence followed keeping in line with transformers/diffusers repos -> + + QEFFBaseModel + ________________________________________________|________________________________________________________________ + | | + QEFFTransformersBase QEFFDiffusersBase + | | + ____________|________________________________________________________ ________________ _________________|______________ + _____ | | | | | | + | QEFFAutoModel QEFFAutoModelForCausalLM QEFFAWQModelForCausalLM ... ... ... +QEFFCommonLoader -| [Provides way to [Provides way to do 1-5 on [Supports 1-5 for +[Provides | do steps 1-5 on transformers.AutoModelForCausalLM] AWQ Models] +interface to |_____ transformers.AutoModel] +Load any of +These models +by automatically +detecting the type +of the model] + +** QEFFBASEModel is abstract base class that defines the basic structure of these classes. 
+** QEFFPipeline classes will stay at the same level as QEFFAutoModel in this hierarchy in future. +""" +from typing import Any + +import torch.nn as nn +from transformers.models.auto import AutoModel, AutoModelForCausalLM +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING + +import QEfficient +from QEfficient.src.base import QEFFBaseModel +from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict + +# Dictionary that defines the interface from transformers to be used underneath the QEFF interface +QEFFAutoModelToTransformersAutoModelMap = { + "QEFFAutoModelForCausalLM": AutoModelForCausalLM, + "QEFFAutoModel": AutoModel, +} + + +class QEFFTransformersBase(QEFFBaseModel): + """ + Parent class for models QEFF provides from transformers i.e. (AutoModel, AutoModelForCausalLM, AutoModelForAudioClassification etc.) from src/transformers/models/auto/modeling_auto.py file. + """ + def __init__(self, model: nn.Module) -> None: + assert (model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values() or + # FIXME: Use model architectures here instead of complete dictionary TransformersToQEffModulesDict + model.__class__ in TransformersToQEffModulesDict.values()), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. {MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore + self.model: nn.Module = model + + def __repr__(self) -> str: + return self.model.__repr__() + + @property + def is_transformed(self) -> bool: + return getattr(self.model, "qeff_transformed", False) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): + model = QEFFAutoModelToTransformersAutoModelMap[cls.__name__].from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + return cls(model) + + def transform_export(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") + + def transform_export_compile(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") + + def transform(self): + QEfficient.transform(self) + return self + + +class QEFFAutoModelForCausalLM(QEFFTransformersBase): + """ + QEFF class for manipulating any causal language model from HuggingFace hub. + """ + def execute(self, *args, **kwargs): # type: ignore + raise NotImplementedError("Reached too far!!") + + def export(self): + raise NotImplementedError("Reached too far!!") + + def compile(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") + + +class QEffAutoModel(QEFFTransformersBase): + def execute(self, *args, **kwargs): # type: ignore + raise NotImplementedError("Reached too far!!") + + def export(self): + raise NotImplementedError("Reached too far!!") + + def compile(self, *args, **kwargs) -> Any: + raise NotImplementedError("Reached too far!!") diff --git a/QEfficient/src/base.py b/QEfficient/src/base.py new file mode 100644 index 000000000..bb96fc075 --- /dev/null +++ b/QEfficient/src/base.py @@ -0,0 +1,47 @@ +from abc import ABC, abstractmethod +from typing import Any + + +class QEFFBaseModel(ABC): + """ + This class acts as parent class for all the varieties of model class (i.e. LLMs, SD, quantized etc.). + Enforces certain methods to be implemented by child classes. + + All the child classes must provide way to load, transform(optimize), exoprt to ONNX etc. capabilities. 
+ """ + def __init__(self) -> None: + super().__init__() + # Users can call generate or execute + self.generate = self.execute + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): + raise NotImplementedError("Must implement for child classes") + + @property + def is_transformed(self) -> bool: + raise NotImplementedError("Must implement for child classes") + + @abstractmethod + def transform_export(self, *args, **kwargs) -> Any: + pass + + @abstractmethod + def transform_export_compile(self, *args, **kwargs) -> Any: + pass + + @abstractmethod + def execute(self, *args, **kwargs) -> Any: + pass + + @abstractmethod + def transform(self, *args, **kwargs) -> Any: + pass + + @abstractmethod + def export(self, *args, **kwargs) -> Any: + pass + + @abstractmethod + def compile(self, *args, **kwargs) -> Any: + pass \ No newline at end of file diff --git a/QEfficient/loader/loader.py b/QEfficient/src/common.py similarity index 91% rename from QEfficient/loader/loader.py rename to QEfficient/src/common.py index c0da35421..bca391097 100644 --- a/QEfficient/loader/loader.py +++ b/QEfficient/src/common.py @@ -12,14 +12,26 @@ QEFFAutoModel provides a common interface for loading the HuggingFace models using either the HF card name of local path of downloaded model. """ import os +from enum import Enum from typing import Any, Dict, Type from transformers import AutoConfig from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING -from QEfficient.loader.loader_factory import QEFF_MODEL_TYPE, QEFFAutoModelForCausalLM, QEFFBaseModel +from QEfficient.src._transformers.auto import QEFFAutoModelForCausalLM +from QEfficient.src.base import QEFFBaseModel from QEfficient.utils._utils import login_and_download_hf_lm + +class QEFF_MODEL_TYPE(Enum): + """ + Defines Names of the different varities of transformer models. + """ + CAUSALLM = "LLM" + DIFFUSION = "STABLE_DIFFUSION" + AWQ = "AWQ" + + MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP: Dict[QEFF_MODEL_TYPE, Type[QEFFBaseModel]] = { QEFF_MODEL_TYPE.CAUSALLM: QEFFAutoModelForCausalLM } @@ -50,7 +62,7 @@ def get_hf_model_type(hf_model_path: str) -> QEFF_MODEL_TYPE: raise NotImplementedError(f"model type {type(config)} is not yet supported") -class QEFFAutoModel: +class QEFFCommonLoader: """ Provides HuggingFace model loading interface same as transformers APIs. Supports loading any model on HuggingFace. diff --git a/QEfficient/transformers/transform.py b/QEfficient/transformers/transform.py index aaddd75fa..3a520d0ed 100644 --- a/QEfficient/transformers/transform.py +++ b/QEfficient/transformers/transform.py @@ -10,8 +10,8 @@ import torch.nn as nn import transformers -from QEfficient.loader.loader import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP -from QEfficient.loader.loader_factory import QEFF_MODEL_TYPE, QEFFBaseModel +from QEfficient.src.base import QEFFBaseModel +from QEfficient.src.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFCommonLoader from QEfficient.transformers.modeling_attn_mask_utils import ( QEffAttentionMaskConverter, _qeff_prepare_4d_attention_mask, @@ -115,6 +115,7 @@ def transform(model: QEFFBaseModel, form_factor="cloud"): form_factor(str): form factor configuration for optmizing the model, available options=["cloud", "edge"]. """ assert form_factor == "cloud", "Only form_factor='cloud' is supported as of now!" 
+ #FIXME: move this to class and use model.transform() if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: transform_lm(model.model) # type: ignore return model diff --git a/tests/test_loader.py b/tests/test_loader.py index cc6aa050d..5c626361b 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -11,8 +11,7 @@ from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel import QEfficient -from QEfficient import QEFFAutoModel -from QEfficient.loader.loader_factory import QEFFAutoModelForCausalLM +from QEfficient import QEFFAutoModelForCausalLM, QEFFCommonLoader model_name_to_params_dict : Dict[str, Dict[str, Any]] = { "gpt2": { @@ -27,7 +26,7 @@ #FIXME: Add test cases for passing cache_dir, pretrained_model_path instead of card name, etc., Passing other kwargs @pytest.mark.parametrize("model_name", model_names) def test_qeff_auto_model_for_causal_lm(model_name: str): - model = QEFFAutoModel.from_pretrained(model_name) + model = QEFFCommonLoader.from_pretrained(model_name) assert isinstance(model, model_name_to_params_dict[model_name]['qeff_class']) assert isinstance(model.model, model_name_to_params_dict[model_name]['hf_class']) # type: ignore diff --git a/tests/utils.py b/tests/utils.py index 0eeb94cd8..18de39622 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -10,9 +10,9 @@ import shutil import unittest +from QEfficient import QEFFAutoModelForCausalLM +from QEfficient.compile.cross_compile import compile_kv_model_on_cloud_ai_100 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.cross_compile import compile_kv_model_on_cloud_ai_100 -from QEfficient.loader.loader_factory import QEFFAutoModelForCausalLM from QEfficient.transformers.transform import transform_lm from QEfficient.utils import hf_download, load_hf_tokenizer from QEfficient.utils.constants import QEFF_MODELS_DIR, ROOT_DIR, Constants From 0e1bb53cd281a3d86f3903eec8c7ec8f71543673 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 4 Jun 2024 20:01:02 +0530 Subject: [PATCH 15/20] *updated notebooks, readme *moved class desc to base.py *Added Runtime Enum Signed-off-by: Onkar Chougule --- QEfficient/src/_transformers/auto.py | 31 ----------------- QEfficient/src/base.py | 52 ++++++++++++++++++++++++++++ README.md | 4 +-- notebooks/QEfficientGPT2.ipynb | 8 ++--- notebooks/QEfficientMPT.ipynb | 4 +-- 5 files changed, 60 insertions(+), 39 deletions(-) diff --git a/QEfficient/src/_transformers/auto.py b/QEfficient/src/_transformers/auto.py index 68f039060..40877ea99 100644 --- a/QEfficient/src/_transformers/auto.py +++ b/QEfficient/src/_transformers/auto.py @@ -5,37 +5,6 @@ # # ---------------------------------------------------------------------------- -""" -** This file for holds the classes that handle main functions -1.load i.e. from_pretrained -2.execute -3.transform -4.export -5.compile -For different varities of Transformer Models - -Representation of class inheritence followed keeping in line with transformers/diffusers repos -> - - QEFFBaseModel - ________________________________________________|________________________________________________________________ - | | - QEFFTransformersBase QEFFDiffusersBase - | | - ____________|________________________________________________________ ________________ _________________|______________ - _____ | | | | | | - | QEFFAutoModel QEFFAutoModelForCausalLM QEFFAWQModelForCausalLM ... ... ... 
-QEFFCommonLoader -| [Provides way to [Provides way to do 1-5 on [Supports 1-5 for -[Provides | do steps 1-5 on transformers.AutoModelForCausalLM] AWQ Models] -interface to |_____ transformers.AutoModel] -Load any of -These models -by automatically -detecting the type -of the model] - -** QEFFBASEModel is abstract base class that defines the basic structure of these classes. -** QEFFPipeline classes will stay at the same level as QEFFAutoModel in this hierarchy in future. -""" from typing import Any import torch.nn as nn diff --git a/QEfficient/src/base.py b/QEfficient/src/base.py index bb96fc075..ddc23fc87 100644 --- a/QEfficient/src/base.py +++ b/QEfficient/src/base.py @@ -1,7 +1,54 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- + +""" +** This file for holds the classes that handle main functions +1.load i.e. from_pretrained +2.execute +3.transform +4.export +5.compile +For different varities of Transformer Models + +Representation of class inheritence followed keeping in line with transformers/diffusers repos -> + + QEFFBaseModel + ________________________________________________|________________________________________________________________ + | | + QEFFTransformersBase QEFFDiffusersBase + | | + ____________|________________________________________________________ ________________ _________________|______________ + _____ | | | | | | + | QEFFAutoModel QEFFAutoModelForCausalLM QEFFAWQModelForCausalLM ... ... ... +QEFFCommonLoader -| [Provides way to [Provides way to do 1-5 on [Supports 1-5 for +[Provides | do steps 1-5 on transformers.AutoModelForCausalLM] AWQ Models] +interface to |_____ transformers.AutoModel] +Load any of +These models +by automatically +detecting the type +of the model] + +** QEFFBASEModel is abstract base class that defines the basic structure of these classes. +** QEFFPipeline classes will stay at the same level as QEFFAutoModel in this hierarchy in future. +""" + from abc import ABC, abstractmethod +from enum import Enum from typing import Any +#Defining placeholder ENUM for execute function +class Runtime(Enum): + CPU_ORT = "CPU ONNX Runtime" + CPU_PT = "CPU PyTorch Runtime" + AI_100 = "AI_100" + + class QEFFBaseModel(ABC): """ This class acts as parent class for all the varieties of model class (i.e. LLMs, SD, quantized etc.). @@ -13,6 +60,11 @@ def __init__(self) -> None: super().__init__() # Users can call generate or execute self.generate = self.execute + self._runtime = Runtime.CPU_PT + + @property + def runtime(self) -> Runtime: + return self._runtime @classmethod def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): diff --git a/README.md b/README.md index 634e3add3..1d830f0b8 100644 --- a/README.md +++ b/README.md @@ -205,7 +205,7 @@ import os import QEfficient -from QEfficient import QEFFAutoModel +from QEfficient import QEFFAutoModelForCausalLM # Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir. 
# os.environ["TRANSFORMERS_CACHE"] = "/local/mnt/workspace/hf_cache" @@ -216,7 +216,7 @@ from QEfficient import QEFFAutoModel # Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl model_name = "gpt2" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib. -qeff_model = QEFFAutoModel.from_pretrained(model_name, cache_dir=None) +qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, cache_dir=None) print(f"{model_name} from hugging-face \n", qeff_model) # Easy and minimal api to update the model diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 37d5ebb40..bedb14049 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -26,18 +26,18 @@ "# Initiate the Orignal Transformer model\n", "import os\n", "\n", - "from QEfficient import QEFFAutoModel\n", + "from QEfficient import QEFFAutoModelForCausalLM\n", "\n", "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n", "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n", "\n", - "#ROOT_DIR = os.path.dirname(os.path.abspath(\"\"))\n", - "#CACHE_DIR = os.path.join(ROOT_DIR, \"tmp\"), you can use a different location for just one model by passing this param as cache_dir in below API.\n", + "# ROOT_DIR = os.path.dirname(os.path.abspath(\"\"))\n", + "# CACHE_DIR = os.path.join(ROOT_DIR, \"tmp\") #, you can use a different location for just one model by passing this param as cache_dir in below API.\n", "\n", "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n", "model_name = \"gpt2\" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n", "\n", - "qeff_model = QEFFAutoModel.from_pretrained(model_name, cache_dir=None)\n", + "qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, cache_dir=CACHE_DIR)\n", "print(f\"{model_name} from hugging-face \\n\", qeff_model)" ] }, diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index 9ca0c389a..6d5204f55 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -26,7 +26,7 @@ "# Initiate the Orignal Transformer model\n", "import os\n", "\n", - "from QEfficient import QEFFAutoModel\n", + "from QEfficient import QEFFAutoModelForCausalLM\n", "\n", "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n", "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n", @@ -36,7 +36,7 @@ "\n", "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n", "model_name = \"mosaicml/mpt-7b\" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n", - "qeff_model = QEFFAutoModel.from_pretrained(model_name)\n", + "qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)\n", "print(f\"{qeff_model} from hugging-face \\n\", qeff_model)" ] }, From 5f751f57b15c079936af48679eb962e099aa2a05 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 4 Jun 2024 20:16:04 +0530 Subject: [PATCH 16/20] updated cloud_ai_100_exec_kv to be callable from QEfficient package Signed-off-by: Onkar Chougule --- QEfficient/__init__.py | 1 + README.md | 2 +- notebooks/QEfficientGPT2.ipynb | 212 +++++++++++++++++++++++++++++++-- 
notebooks/QEfficientMPT.ipynb | 5 +- 4 files changed, 204 insertions(+), 16 deletions(-) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index aac0bcd29..04e0f825c 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -7,6 +7,7 @@ from QEfficient.compile.cross_compile import compile # noqa: F401 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter +from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv # noqa: F401 from QEfficient.src import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader # noqa: F401 from QEfficient.transformers.transform import transform # noqa: F401 diff --git a/README.md b/README.md index 1d830f0b8..89a02859c 100644 --- a/README.md +++ b/README.md @@ -190,7 +190,7 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 | QEfficient.transform | [click here](#1-model-download-and-transform) |
  • model : $\color{green} {Mandatory}$
  • form_factor : Optional [Default-"cloud"]
  • | | QEfficient.export | [click here](#2-onnx-export-of-transformed-model) |
  • model_name : $\color{green} {Mandatory}$
  • model_kv : Optional
  • tokenizer : Optional
  • onnx_path : Optional
  • hf_token : Optional
  • seq_length : Optional [Default-128]
  • kv : Optional [Default-$\color{green} {True}$]
  • return_path : Optional [Default-False]
  • form_factor : Optional [Default-"cloud"]
  • ***save_fp32_onnx : Optional [Default-False]
  • ***save_fp16_onnx : Optional [Default-True]
  • | | QEfficient.compile | [click here](#3-compile-on-cloud-ai-100) |
  • onnx_path : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • aic_enable_depth_first : Optional [Default-False]
  • mos : Optional [Default= -1]
  • mxint8 : Optional [Default-False]
  • mxfp6 : Optional [Default-True]
  • | -|cloud_ai_100_exec_kv | [click here](#4-run-benchmark) |
  • batch_size : $\color{green} {Mandatory}$
  • tokenizer : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • | +|QEfficient.cloud_ai_100_exec_kv | [click here](#4-run-benchmark) |
  • batch_size : $\color{green} {Mandatory}$
  • tokenizer : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • | **One argument, prompt or prompts_txt_file_path must be passed.
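For reference, a minimal sketch of calling the benchmark API from the table above with the file-based prompt input — the model card, QPC path, and prompt file below are placeholders, not values from this patch:

```Python
import QEfficient
from QEfficient.utils import load_hf_tokenizer

tokenizer = load_hf_tokenizer(model_name="gpt2")  # placeholder model card
QEfficient.cloud_ai_100_exec_kv(
    batch_size=1,
    tokenizer=tokenizer,
    qpc_path="qeff_models/gpt2/qpcs",        # placeholder path to the compiled QPC
    device_id=[0],
    prompts_txt_file_path="prompts.txt",     # or pass prompt=["My name is"] instead
)
```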
    ***Both save_fp32_onnx and save_fp16_onnx can't be false. diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index bedb14049..40e7b6a21 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -18,10 +18,53 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "c21f82d5-17df-4fc9-a180-05edd032f02d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr2/ochougul/.pyenv/versions/3.8.19/envs/py38/lib/python3.8/site-packages/transformers/utils/hub.py:123: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "gpt2 from hugging-face \n", + " GPT2LMHeadModel(\n", + " (transformer): GPT2Model(\n", + " (wte): Embedding(50257, 768)\n", + " (wpe): Embedding(1024, 768)\n", + " (drop): Dropout(p=0.1, inplace=False)\n", + " (h): ModuleList(\n", + " (0-11): 12 x GPT2Block(\n", + " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (attn): GPT2Attention(\n", + " (c_attn): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (attn_dropout): Dropout(p=0.1, inplace=False)\n", + " (resid_dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): GPT2MLP(\n", + " (c_fc): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (act): NewGELUActivation()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " (lm_head): Linear(in_features=768, out_features=50257, bias=False)\n", + ")\n" + ] + } + ], "source": [ "# Initiate the Orignal Transformer model\n", "import os\n", @@ -37,7 +80,7 @@ "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n", "model_name = \"gpt2\" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n", "\n", - "qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, cache_dir=CACHE_DIR)\n", + "qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)\n", "print(f\"{model_name} from hugging-face \\n\", qeff_model)" ] }, @@ -58,10 +101,52 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "a4543b94-9b50-4bcc-90c6-484ab694c9a6", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[33;20mWARNING - QEfficient - The model layers has been upadted to QEff layers in-place\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model after Optimized transformations \n", + " QEffGPT2LMHeadModel(\n", + " (transformer): QEffGPT2Model(\n", + " (wte): Embedding(50257, 768)\n", + " (wpe): Embedding(1024, 768)\n", + " (drop): Dropout(p=0.1, inplace=False)\n", + " (h): ModuleList(\n", + " (0-11): 12 x QEffGPT2Block(\n", + " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (attn): QEffGPT2Attention(\n", + " (c_attn): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (attn_dropout): Dropout(p=0.1, inplace=False)\n", + " (resid_dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): GPT2MLP(\n", + " (c_fc): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (act): 
NewGELUActivation()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " (lm_head): Linear(in_features=768, out_features=50257, bias=False)\n", + ")\n" + ] + } + ], "source": [ "import QEfficient\n", "\n", @@ -81,10 +166,69 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "3fb4d6dd-9973-4608-b68b-ec6825cfef0e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5e7fe36d84a24006ba52887588e9935a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fetching 11 files: 0%| | 0/11 [00:00 0 else None,\n", + "[W export.cpp:565] Warning: Custom opset domain: 'com.qti.aisw.onnx' provided is not used in the model. Please verify custom opset domain names. (function GraphEncoder)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============== Diagnostic Run torch.onnx.export version 2.0.0+cpu ==============\n", + "verbose: False, log level: Level.ERROR\n", + "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n", + "\n", + "\n", + "=============== PyTorch vs. fp32 ONNXRT (MAD) ===============\n", + "\n", + "logits \t\t 7.62939453125e-05\n", + "attention_mask_RetainedState \t\t 0.0\n", + "past_keys (mean) \t\t 2.635022004445394e-06\n", + "past_value (mean) \t\t 5.5730342864990234e-06\n", + "\n", + "=============================================================\n", + "\n", + "\n", + "=============== PyTorch vs. fp16 ONNXRT (MAD) ===============\n", + "\n", + "logits \t\t 7.62939453125e-05\n", + "attention_mask_RetainedState \t\t 0.0\n", + "past_keys (mean) \t\t 2.635022004445394e-06\n", + "past_value (mean) \t\t 5.5730342864990234e-06\n", + "\n", + "=============================================================\n", + "\n" + ] + } + ], "source": [ "from QEfficient.utils import load_hf_tokenizer\n", "# We can now export the modified models to Onnx framework\n", @@ -119,10 +263,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "e48be5da-02a1-4d7e-9b5f-a6dcca141d4b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running AI 100 compiler: /opt/qti-aic/exec/qaic-exec -m=/local/mnt/workspace/open-source/myown/efficient-transformers/qeff_models/gpt2/onnx/gpt2_kv_clipped_fp16.onnx -aic-hw -aic-hw-version=2.0 -network-specialization-config=/local/mnt/workspace/open-source/myown/efficient-transformers/qeff_models/gpt2/specializations.json -convert-to-fp16 -retained-state -aic-num-cores=14 -custom-IO-list-file=/local/mnt/workspace/open-source/myown/efficient-transformers/qeff_models/gpt2/onnx/custom_io_fp16.yaml -compile-only -aic-binary-dir=/local/mnt/workspace/open-source/myown/efficient-transformers/qeff_models/gpt2/qpcs\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "===================== Compilation Done! 
=====================\n", + "\n" + ] + } + ], "source": [ "# Please use platform SDk to Check num_cores for your card.\n", "\n", @@ -145,17 +316,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "4711fc74-aa5d-4e20-af0e-0d461d2e19bb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 My name is John . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man \n", + "\n", + "===================== Performance Stats =====================\n", + "Prefill time a.k.a TTFT is= 0.01 s\n", + "Decode: 220.31 tok/s\n", + "E2E: 216.88 tok/s\n", + "Total (E2E) inference time is= 0.44 s\n", + "=============================================================\n" + ] + } + ], "source": [ - "from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size\n", + "from QEfficient.generation.text_generation_inference import get_compilation_batch_size\n", "\n", "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "batch_size = get_compilation_batch_size(generated_qpc_path)\n", - "cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])" + "QEfficient.cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])" ] } ], diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index 6d5204f55..023369a0f 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -150,12 +150,13 @@ "metadata": {}, "outputs": [], "source": [ - "from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size\n", + "from QEfficient.generation.text_generation_inference import get_compilation_batch_size\n", "\n", "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "\n", - "batch_size = get_compilation_batch_size(generated_qpc_path)cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])" + "batch_size = get_compilation_batch_size(generated_qpc_path)\n", + "QEfficient.cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=[\"My name is\"])" ] } ], From 0c9dc74970f472c3f1f81255304c099e747edc80 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 4 Jun 2024 20:20:02 +0530 Subject: [PATCH 17/20] fixed tests Signed-off-by: Onkar Chougule --- tests/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils.py b/tests/utils.py index 18de39622..b7fb8a2a9 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -109,7 +109,7 @@ def export_onnx(model_kv, tokenizer, model_name, model_class): onnx_dir_path = os.path.join(QEFF_MODELS_DIR, model_name) base_path, onnx_model_path = 
qualcomm_efficient_converter( model_name=model_name, - model_kv=QEFFAutoModelForCausalLM(model=model_kv, pretrained_model_name_or_path=None), # type: ignore + model_kv=QEFFAutoModelForCausalLM(model=model_kv), # type: ignore tokenizer=tokenizer, onnx_dir_path=onnx_dir_path, kv=True, From df303fb34029a753cc5cf9ef9b87331ad13cf280 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 4 Jun 2024 20:43:12 +0530 Subject: [PATCH 18/20] clenaed notebook Signed-off-by: Onkar Chougule --- notebooks/QEfficientGPT2.ipynb | 206 ++------------------------------- 1 file changed, 10 insertions(+), 196 deletions(-) diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 40e7b6a21..a8661844a 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -18,53 +18,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "c21f82d5-17df-4fc9-a180-05edd032f02d", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr2/ochougul/.pyenv/versions/3.8.19/envs/py38/lib/python3.8/site-packages/transformers/utils/hub.py:123: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "gpt2 from hugging-face \n", - " GPT2LMHeadModel(\n", - " (transformer): GPT2Model(\n", - " (wte): Embedding(50257, 768)\n", - " (wpe): Embedding(1024, 768)\n", - " (drop): Dropout(p=0.1, inplace=False)\n", - " (h): ModuleList(\n", - " (0-11): 12 x GPT2Block(\n", - " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (attn): GPT2Attention(\n", - " (c_attn): Conv1D()\n", - " (c_proj): Conv1D()\n", - " (attn_dropout): Dropout(p=0.1, inplace=False)\n", - " (resid_dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (mlp): GPT2MLP(\n", - " (c_fc): Conv1D()\n", - " (c_proj): Conv1D()\n", - " (act): NewGELUActivation()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " )\n", - " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " )\n", - " (lm_head): Linear(in_features=768, out_features=50257, bias=False)\n", - ")\n" - ] - } - ], + "outputs": [], "source": [ "# Initiate the Orignal Transformer model\n", "import os\n", @@ -101,52 +58,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "a4543b94-9b50-4bcc-90c6-484ab694c9a6", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[33;20mWARNING - QEfficient - The model layers has been upadted to QEff layers in-place\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model after Optimized transformations \n", - " QEffGPT2LMHeadModel(\n", - " (transformer): QEffGPT2Model(\n", - " (wte): Embedding(50257, 768)\n", - " (wpe): Embedding(1024, 768)\n", - " (drop): Dropout(p=0.1, inplace=False)\n", - " (h): ModuleList(\n", - " (0-11): 12 x QEffGPT2Block(\n", - " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (attn): QEffGPT2Attention(\n", - " (c_attn): Conv1D()\n", - " (c_proj): Conv1D()\n", - " (attn_dropout): Dropout(p=0.1, inplace=False)\n", - " (resid_dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " (mlp): GPT2MLP(\n", - " (c_fc): Conv1D()\n", - " (c_proj): 
Conv1D()\n", - " (act): NewGELUActivation()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " )\n", - " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", - " )\n", - " (lm_head): Linear(in_features=768, out_features=50257, bias=False)\n", - ")\n" - ] - } - ], + "outputs": [], "source": [ "import QEfficient\n", "\n", @@ -166,69 +81,10 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "3fb4d6dd-9973-4608-b68b-ec6825cfef0e", "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5e7fe36d84a24006ba52887588e9935a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Fetching 11 files: 0%| | 0/11 [00:00 0 else None,\n", - "[W export.cpp:565] Warning: Custom opset domain: 'com.qti.aisw.onnx' provided is not used in the model. Please verify custom opset domain names. (function GraphEncoder)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "============== Diagnostic Run torch.onnx.export version 2.0.0+cpu ==============\n", - "verbose: False, log level: Level.ERROR\n", - "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n", - "\n", - "\n", - "=============== PyTorch vs. fp32 ONNXRT (MAD) ===============\n", - "\n", - "logits \t\t 7.62939453125e-05\n", - "attention_mask_RetainedState \t\t 0.0\n", - "past_keys (mean) \t\t 2.635022004445394e-06\n", - "past_value (mean) \t\t 5.5730342864990234e-06\n", - "\n", - "=============================================================\n", - "\n", - "\n", - "=============== PyTorch vs. fp16 ONNXRT (MAD) ===============\n", - "\n", - "logits \t\t 7.62939453125e-05\n", - "attention_mask_RetainedState \t\t 0.0\n", - "past_keys (mean) \t\t 2.635022004445394e-06\n", - "past_value (mean) \t\t 5.5730342864990234e-06\n", - "\n", - "=============================================================\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "from QEfficient.utils import load_hf_tokenizer\n", "# We can now export the modified models to Onnx framework\n", @@ -263,37 +119,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "e48be5da-02a1-4d7e-9b5f-a6dcca141d4b", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Running AI 100 compiler: /opt/qti-aic/exec/qaic-exec -m=/local/mnt/workspace/open-source/myown/efficient-transformers/qeff_models/gpt2/onnx/gpt2_kv_clipped_fp16.onnx -aic-hw -aic-hw-version=2.0 -network-specialization-config=/local/mnt/workspace/open-source/myown/efficient-transformers/qeff_models/gpt2/specializations.json -convert-to-fp16 -retained-state -aic-num-cores=14 -custom-IO-list-file=/local/mnt/workspace/open-source/myown/efficient-transformers/qeff_models/gpt2/onnx/custom_io_fp16.yaml -compile-only -aic-binary-dir=/local/mnt/workspace/open-source/myown/efficient-transformers/qeff_models/gpt2/qpcs\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "===================== Compilation Done! 
=====================\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# Please use platform SDk to Check num_cores for your card.\n", "\n", @@ -316,25 +145,10 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "4711fc74-aa5d-4e20-af0e-0d461d2e19bb", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 My name is John . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man of God . I 'm a man \n", - "\n", - "===================== Performance Stats =====================\n", - "Prefill time a.k.a TTFT is= 0.01 s\n", - "Decode: 220.31 tok/s\n", - "E2E: 216.88 tok/s\n", - "Total (E2E) inference time is= 0.44 s\n", - "=============================================================\n" - ] - } - ], + "outputs": [], "source": [ "from QEfficient.generation.text_generation_inference import get_compilation_batch_size\n", "\n", From abea97dafb62bc8c16216b67f68facc9ee845694 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Wed, 5 Jun 2024 15:27:22 +0530 Subject: [PATCH 19/20] *Added transfrom call within init *reanmed cross_compile *updated notebooks *updated README Signed-off-by: Onkar Chougule --- QEfficient/__init__.py | 2 +- .../{cross_compile.py => compile_helper.py} | 0 QEfficient/src/_transformers/auto.py | 16 +++++- QEfficient/transformers/transform.py | 2 +- README.md | 27 ++++------ notebooks/QEfficientGPT2.ipynb | 50 ++++++------------ notebooks/QEfficientMPT.ipynb | 52 ++++++------------- tests/utils.py | 2 +- 8 files changed, 58 insertions(+), 93 deletions(-) rename QEfficient/compile/{cross_compile.py => compile_helper.py} (100%) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 04e0f825c..ac6c1b629 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from QEfficient.compile.cross_compile import compile # noqa: F401 +from QEfficient.compile.compile_helper import compile # noqa: F401 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv # noqa: F401 from QEfficient.src import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader # noqa: F401 diff --git a/QEfficient/compile/cross_compile.py b/QEfficient/compile/compile_helper.py similarity index 100% rename from QEfficient/compile/cross_compile.py rename to QEfficient/compile/compile_helper.py diff --git a/QEfficient/src/_transformers/auto.py b/QEfficient/src/_transformers/auto.py index 40877ea99..de01a0840 100644 --- a/QEfficient/src/_transformers/auto.py +++ b/QEfficient/src/_transformers/auto.py @@ -26,11 +26,13 @@ class QEFFTransformersBase(QEFFBaseModel): """ Parent class for models QEFF provides from transformers i.e. (AutoModel, AutoModelForCausalLM, AutoModelForAudioClassification etc.) from src/transformers/models/auto/modeling_auto.py file. 
""" - def __init__(self, model: nn.Module) -> None: + def __init__(self, model: nn.Module, transform:bool = True) -> None: assert (model.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING.values() or # FIXME: Use model architectures here instead of complete dictionary TransformersToQEffModulesDict model.__class__ in TransformersToQEffModulesDict.values()), f"Given model{model.__class__.__name__} could not be found in transformers library i.e. {MODEL_FOR_CAUSAL_LM_MAPPING.values()}" # type: ignore self.model: nn.Module = model + if transform: + self.transform() def __repr__(self) -> str: return self.model.__repr__() @@ -41,8 +43,17 @@ def is_transformed(self) -> bool: @classmethod def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): + """ + This method accepts All the parameters that are acceptable by transformers.AutoModelForCausalLM. + There are few additional parameters that this method can take: + :param transform:bool. Whether to optimize model for KV retention; default is True. Pass False to get BertStyle model. + """ + transform: bool = kwargs.get("transform", True) + kwargs.update({"use_cache": True}) # Always pass use_cache = True, to get KV values as output during ONNX export + model = QEFFAutoModelToTransformersAutoModelMap[cls.__name__].from_pretrained(pretrained_model_name_or_path, *args, **kwargs) - return cls(model) + return cls(model, transform=transform) + def transform_export(self, *args, **kwargs) -> Any: raise NotImplementedError("Reached too far!!") @@ -51,6 +62,7 @@ def transform_export_compile(self, *args, **kwargs) -> Any: raise NotImplementedError("Reached too far!!") def transform(self): + # FIXME: break down transform into optmization passes i.e. HW specific optimization(RMSNorm), KV retention pass etc. QEfficient.transform(self) return self diff --git a/QEfficient/transformers/transform.py b/QEfficient/transformers/transform.py index 3a520d0ed..dfd0de5c7 100644 --- a/QEfficient/transformers/transform.py +++ b/QEfficient/transformers/transform.py @@ -11,7 +11,7 @@ import transformers from QEfficient.src.base import QEFFBaseModel -from QEfficient.src.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFCommonLoader +from QEfficient.src.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE from QEfficient.transformers.modeling_attn_mask_utils import ( QEffAttentionMaskConverter, _qeff_prepare_4d_attention_mask, diff --git a/README.md b/README.md index 89a02859c..fc429f88e 100644 --- a/README.md +++ b/README.md @@ -195,7 +195,7 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 **One argument, prompt or prompts_txt_file_path must be passed.
    ***Both save_fp32_onnx and save_fp16_onnx can't be false.
 
-### 1. Model download and transform
+### 1. Model download and optimization for Cloud AI 100
 
 Initialize QEfficient and transform the models, Check the list of supported architectures in the repo.
 
@@ -203,25 +203,19 @@ Initialize QEfficient and transform the models, Check the list of supported arch
 # Initiate the Orignal Transformer model
 import os
 
-
-import QEfficient
-from QEfficient import QEFFAutoModelForCausalLM
+from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM
 
 # Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.
 # os.environ["TRANSFORMERS_CACHE"] = "/local/mnt/workspace/hf_cache"
 
-#ROOT_DIR = os.path.dirname(os.path.abspath(""))
-#CACHE_DIR = os.path.join(ROOT_DIR, "tmp"), you can use a different location for just one model by passing this param as cache_dir in below API.
+# ROOT_DIR = os.path.dirname(os.path.abspath(""))
+# CACHE_DIR = os.path.join(ROOT_DIR, "tmp")  # You can use a different location for just one model by passing this param as cache_dir in the below API.
 
 # Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl
 model_name = "gpt2" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.
 
-qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, cache_dir=None)
-print(f"{model_name} from hugging-face \n", qeff_model)
-
-# Easy and minimal api to update the model
-model_transformed = QEfficient.transform(qeff_model, form_factor="cloud")
-print("Model after Optimized transformations \n", model_transformed)
+qeff_model = AutoModelForCausalLM.from_pretrained(model_name)
+print(f"{model_name} optimized for AI 100 \n", qeff_model)
 ```
 
 ### 2. ONNX export of transformed model
@@ -229,6 +223,7 @@ print("Model after Optimized transformations \n", model_transformed)
 use the qualcomm_efficient_converter API to export the KV transformed Model to ONNX and Verify on Torch.
 
 ```Python
+import QEfficient
 from QEfficient.utils import load_hf_tokenizer
 # We can now export the modified models to Onnx framework
 # This will generate single Onnx Model for both Prefill and Decode Variations which are optimized for
@@ -244,7 +239,7 @@ from QEfficient.utils import load_hf_tokenizer
 tokenizer = load_hf_tokenizer(model_name, use_cache=True)
 base_path, onnx_path = QEfficient.export(
     model_name=model_name,
-    model_kv=model_transformed,
+    model_kv=qeff_model,
     tokenizer=tokenizer,
     kv=True,
     form_factor="cloud",
@@ -261,7 +256,7 @@ Once, the model is exported, Compile the model on Cloud AI 100 and generate QPC.
 
 generated_qpc_path = QEfficient.compile(
     onnx_path=onnx_path,
-    num_cores=14,
+    num_cores=14,  # You can use `/opt/qti-aic/tools/qaic-util | grep "Nsp Total"` from the Apps SDK to check this.
    qpc_path=os.path.dirname(base_path),
     mxfp6=False,
     device_group=[0],
@@ -272,12 +267,12 @@ generated_qpc_path = QEfficient.compile(
 Benchmark the model on Cloud AI 100, run the infer API to print tokens and tok/sec
 
 ```Python
-from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size
+from QEfficient.generation.text_generation_inference import get_compilation_batch_size
 
 # post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100
 # We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach
 
 batch_size = get_compilation_batch_size(generated_qpc_path)
-cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=["My name is"])
+QEfficient.cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=["My name is"])
 ```
 
 End to End demo examples for various models are available in **notebooks** directory. Please check them out.
diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb
index a8661844a..3095c7044 100644
--- a/notebooks/QEfficientGPT2.ipynb
+++ b/notebooks/QEfficientGPT2.ipynb
@@ -13,7 +13,16 @@
    "id": "88eef7ea-3488-414c-9e36-e960abba30c9",
    "metadata": {},
    "source": [
-    "##### Download the OpenSource GPT2 based HuggingFace Model and Save in local *Cache* directory"
+    "##### Download the OpenSource GPT2 based HuggingFace Model and Save in local *Cache* directory\n",
+    "###### We Modify the GPT2 Classes using the Optimized Software Library to generate model for Cloud AI 100.\n",
+    "###### User can disable this optimization by passing `transform=False` in the `from_pretrained` call.\n",
+    "###### Here we generate models with below Optimizations:\n",
+    "\n",
+    "* RMS Norm Fixes for FP16 Overflows and Underflow\n",
+    "* Causal Mask Fix\n",
+    "* Handling FP16 Overflows.\n",
+    "* KV Cache (Retention Changes).\n",
+    "* Triu/Tril Ops support."
    ]
   },
   {
@@ -26,7 +35,7 @@
     "# Initiate the Orignal Transformer model\n",
     "import os\n",
     "\n",
-    "from QEfficient import QEFFAutoModelForCausalLM\n",
+    "from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n",
     "\n",
     "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n",
     "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n",
@@ -37,38 +46,8 @@
     "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n",
     "model_name = \"gpt2\" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n",
     "\n",
-    "qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)\n",
-    "print(f\"{model_name} from hugging-face \\n\", qeff_model)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a89dfa0a-d8fe-4472-bf00-55e563ae9058",
-   "metadata": {},
-   "source": [
-    "##### Now we Modify the GPT2 Classes using the Optimized Software Library to generate model for Cloud AI 100.\n",
-    "##### Here we generate models with below Optimizations:\n",
-    "\n",
-    "* RMS Norm Fixes for FP16 Overflows and Underflow\n",
-    "* Causal Mask Fix\n",
-    "* Handling FP16 Overflows.\n",
-    "* KV Cache (Retention Changes).\n",
-    "* Triu/Tril Ops support."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a4543b94-9b50-4bcc-90c6-484ab694c9a6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import QEfficient\n",
-    "\n",
-    "# Easy and minimal api to update the model\n",
-    "model_transformed = QEfficient.transform(qeff_model, form_factor=\"cloud\")\n",
-    "\n",
-    "print(\"Model after Optimized transformations \\n\", model_transformed)"
+    "qeff_model = AutoModelForCausalLM.from_pretrained(model_name)\n",
+    "print(f\"{model_name} optimized for AI 100 \\n\", qeff_model)"
    ]
   },
   {
@@ -86,6 +65,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import QEfficient\n",
     "from QEfficient.utils import load_hf_tokenizer\n",
     "# We can now export the modified models to Onnx framework\n",
     "# This will generate single Onnx Model for both Prefill and Decode Variations which are optimized for\n",
@@ -101,7 +81,7 @@
     "tokenizer = load_hf_tokenizer(model_name, use_cache=True)\n",
     "base_path, onnx_path = QEfficient.export(\n",
     "    model_name=model_name,\n",
-    "    model_kv=model_transformed,\n",
+    "    model_kv=qeff_model,\n",
     "    tokenizer=tokenizer,\n",
     "    kv=True,\n",
     "    form_factor=\"cloud\",\n",
diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb
index 023369a0f..15e84399a 100644
--- a/notebooks/QEfficientMPT.ipynb
+++ b/notebooks/QEfficientMPT.ipynb
@@ -13,7 +13,15 @@
    "id": "88eef7ea-3488-414c-9e36-e960abba30c9",
    "metadata": {},
    "source": [
-    "##### Download the OpenSource MPT based HuggingFace Model and Save in local *Cache* directory"
+    "##### Download the OpenSource MPT based HuggingFace Model and Save in local *Cache* directory\n",
+    "###### Now we Modify the MPT Classes using the Optimized Software Library to generate model for Cloud AI 100.\n",
+    "###### Here we generate models with below Optimizations:\n",
+    "\n",
+    "* RMS Norm Fixes for FP16 Overflows and Underflow\n",
+    "* Causal Mask Fix\n",
+    "* Handling FP16 Overflows.\n",
+    "* KV Cache (Retention Changes).\n",
+    "* Triu/Tril Ops support."
    ]
   },
   {
@@ -26,7 +34,7 @@
     "# Initiate the Orignal Transformer model\n",
     "import os\n",
     "\n",
-    "from QEfficient import QEFFAutoModelForCausalLM\n",
+    "from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n",
     "\n",
     "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n",
     "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n",
@@ -36,39 +44,8 @@
     "\n",
     "# Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl\n",
     "model_name = \"mosaicml/mpt-7b\" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.\n",
-    "qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)\n",
-    "print(f\"{qeff_model} from hugging-face \\n\", qeff_model)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a89dfa0a-d8fe-4472-bf00-55e563ae9058",
-   "metadata": {},
-   "source": [
-    "##### Now we Modify the MPT Classes using the Optimized Software Library to generate model for Cloud AI 100.\n",
-    "##### Here we generate models with below Optimizations:\n",
-    "\n",
-    "* RMS Norm Fixes for FP16 Overflows and Underflow\n",
-    "* Causal Mask Fix\n",
-    "* Handling FP16 Overflows.\n",
-    "* KV Cache (Retention Changes).\n",
-    "* Triu/Tril Ops support."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a4543b94-9b50-4bcc-90c6-484ab694c9a6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import QEfficient\n",
-    "\n",
-    "# Easy and minimal api to update the model\n",
-    "model_transformed = QEfficient.transform(qeff_model, form_factor=\"cloud\")\n",
-    "\n",
-    "model_transformed.eval()\n",
-    "print(\"Model after Optimized transformations \\n\", model_transformed)"
+    "qeff_model = AutoModelForCausalLM.from_pretrained(model_name)\n",
+    "print(f\"{model_name} optimized for AI 100 \\n\", qeff_model)"
    ]
   },
   {
@@ -86,6 +63,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import QEfficient\n",
     "from QEfficient.utils import load_hf_tokenizer\n",
     "\n",
     "# We have the utils to export the modified models to Onnx framework\n",
@@ -98,10 +76,10 @@
     "\n",
     "# We can generate both bertstyle and KV Style models with the flag \"kv\"\n",
     "# Bertstyle models do not have any optimization w.r.t KV cache changes and are unoptimized version.\n",
-    "tokenizer = load_hf_tokenizer(model_hf_path, use_cache=True, padding_side=\"left\")\n",
+    "tokenizer = load_hf_tokenizer(model_name, use_cache=True, padding_side=\"left\")\n",
     "base_path, onnx_path = QEfficient.export(\n",
     "    model_name=model_name,\n",
-    "    model_kv=model_transformed,\n",
+    "    model_kv=qeff_model,\n",
     "    tokenizer=tokenizer,\n",
     "    kv=True,\n",
     "    form_factor=\"cloud\",\n",
diff --git a/tests/utils.py b/tests/utils.py
index b7fb8a2a9..ace803f8f 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -11,7 +11,7 @@
 import unittest
 
 from QEfficient import QEFFAutoModelForCausalLM
-from QEfficient.compile.cross_compile import compile_kv_model_on_cloud_ai_100
+from QEfficient.compile.compile_helper import compile_kv_model_on_cloud_ai_100
 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
 from QEfficient.transformers.transform import transform_lm
 from QEfficient.utils import hf_download, load_hf_tokenizer

From 4ae5825a36d5116197632a434ffcc2eaa6605c23 Mon Sep 17 00:00:00 2001
From: Onkar Chougule
Date: Wed, 5 Jun 2024 23:31:54 +0530
Subject: [PATCH 20/20] addressed review comments

Signed-off-by: Onkar Chougule
---
 QEfficient/compile/compile_helper.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/QEfficient/compile/compile_helper.py b/QEfficient/compile/compile_helper.py
index 771d52f54..8b5272e8d 100644
--- a/QEfficient/compile/compile_helper.py
+++ b/QEfficient/compile/compile_helper.py
@@ -7,6 +7,7 @@
 
 import json
 import os
+import shutil
 import subprocess
 from typing import List, Tuple
 
@@ -42,7 +43,6 @@ def compile_kv_model_on_cloud_ai_100(
     device_group: List[int] = [0],
     **kwargs,
 ) -> Tuple[bool, str]:
-    import shutil
     if kwargs:
         # FIXME
         raise NotImplementedError("Can't handle extra compilation args now!")
@@ -101,7 +101,7 @@ def compile(
     onnx_path: str,
     qpc_path: str,
     num_cores: int,
-    device_group: List[int],
+    device_group: List[int],  # FIXME: use num_devices instead
     aic_enable_depth_first: bool = False,
     mos: int = -1,
     batch_size: int = 1,
@@ -116,12 +116,16 @@ def compile(
     Api() to compile the Onnx Model on Cloud AI 100 Platform with give config.
     ---------
     :param onnx_path: str. Generated Onnx Model Path.
-    :base_path: str. Base path for the generated models.
+    :param qpc_path: str. Path for saving compiled qpc binaries.
+    :num_cores: int. Number of cores to compile model on.
+    :device_group: List[int]. Used for finding number of devices to compile for.
+    :aic_enable_depth_first: bool.
Enables DFS with default memory size, disabled by default. + :mos: int. Effort level to reduce the on-chip memory. :batch_size: int. Batch size to compile the model for. :prompt_len: int. prompt len for the model to compile. :ctx_len: int. Maximum context length to compile the model. :mxfp6: bool. Enable compilation for MXFP6 precision - :num_cores: int. Number of cores to compile model on. default: 16 available option: [1 to 16] + :mxint8: Compress Present/Past KV to MXINT8 using CustomIO config, default is False. """ os.makedirs(qpc_path, exist_ok=True)
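
For quick reference, a call that exercises the parameters documented in the updated `compile()` docstring might look like the sketch below. The ONNX path, QPC directory, and parameter values are illustrative assumptions only, not part of the patch; substitute the paths produced by `QEfficient.export` on your setup.

```Python
import QEfficient

# Illustrative values only; replace the paths with the ones generated by QEfficient.export.
generated_qpc_path = QEfficient.compile(
    onnx_path="qeff_models/gpt2/onnx/gpt2_kv_clipped_fp16.onnx",  # assumed example path
    qpc_path="qeff_models/gpt2",     # compiled qpc binaries are written under this directory
    num_cores=14,                    # check available cores with qaic-util from the Apps SDK
    device_group=[0],                # compile for a single device
    batch_size=1,
    prompt_len=32,                   # assumed prefill length
    ctx_len=128,                     # assumed maximum context length
    mxfp6=True,                      # enable MXFP6 precision for weights
    mxint8=False,                    # keep present/past KV in FP16 rather than MXINT8
    aic_enable_depth_first=False,    # DFS with default memory size stays disabled
    mos=-1,                          # default: no extra effort to reduce on-chip memory
)
print(f"Compiled QPC available at: {generated_qpc_path}")
```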