Skip to content

Commit

Permalink
QNN Compilation support in High Level APIs of QEFFAutoModelForCausalLM (
Browse files Browse the repository at this point in the history
#187)

* QNN Compilation support in QEFFAutoModelForCausalLM High Level APIs

	1. Modified qnn_compiler.py to include qnn_binary_dir path to support hash suffix in qpc directory name.
	2. Added tests/qnn_tests/test_causal_lm_models_qnn.py for unit testing.
	3. Modified qnn_config.json to enable compiler_enable_depth_first if qnn_config file is passed.
	4. Added _qnn_compile function in QEFFBaseModel to support QNN Compilation.

Signed-off-by: Shubham Agrawal <[email protected]>

* Increased Non-CLI Non-QAIC Tests timeout

Signed-off-by: Rishin Raj <[email protected]>

* Added sudo for executing QNN Docker commands

Signed-off-by: Rishin Raj <[email protected]>

---------

Signed-off-by: Shubham Agrawal <[email protected]>
Signed-off-by: Rishin Raj <[email protected]>
Co-authored-by: Rishin Raj <[email protected]>
  • Loading branch information
shubhagr-quic and quic-rishinr authored Jan 7, 2025
1 parent 20ed2a8 commit ed4cbdf
Show file tree
Hide file tree
Showing 8 changed files with 375 additions and 49 deletions.
98 changes: 98 additions & 0 deletions QEfficient/base/modeling_qeff.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@

from QEfficient.base.onnx_transforms import OnnxTransform
from QEfficient.base.pytorch_transforms import PytorchTransform
from QEfficient.compile.qnn_compiler import compile as qnn_compile
from QEfficient.generation.cloud_infer import QAICInferenceSession
from QEfficient.utils import constants
from QEfficient.utils._utils import load_json
from QEfficient.utils.cache import QEFF_HOME, to_hashable

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -319,3 +321,99 @@ def _compile(

self.qpc_path = qpc_path
return qpc_path

def _qnn_compile(
    self,
    onnx_path: Optional[str] = None,
    compile_dir: Optional[str] = None,
    *,
    specializations: Optional[List[Dict[str, int]]] = None,
    prefill_seq_len: int = 32,
    ctx_len: int = 128,
    batch_size: int = 1,
    full_batch_size: Optional[int] = None,
    mdp_ts_num_devices: int = 1,
    num_cores: int = 16,
    mxfp6_matmul: bool = False,
    mxint8_kv_cache: bool = False,
    qnn_config: Optional[str] = None,
) -> str:
    """
    Interface for QNN compiler

    Compiles the ONNX model with the QNN compiler into a ``qpc-<hash>`` directory placed under
    ``compile_dir``. The hash is derived from the compile parameters, so an existing directory that
    already contains ``programqpc.bin`` is reused instead of compiling again.

    Args:
        :onnx_path (str): Onnx file to compile. If neither this nor ``self.onnx_path`` is set, the model is exported first.
        :compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters. ``Defaults to the parent directory of the ONNX file.``
        :specializations (list): List of specializations to compile for
        :prefill_seq_len (int, optional): The length of the Prefill prompt should be less than ``prefill_seq_len``. ``Defaults to 32``.
        :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``.
        :batch_size (int, optional): Batch size. ``Defaults to 1``.
        :full_batch_size (int, optional): Continuous batching batch size.
        :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
        :num_cores (int): Number of cores used to compile the model.
        :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``.
        :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
        :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``

    Returns:
        Path of the ``qpc-<hash>`` directory (a ``pathlib.Path`` built from ``compile_dir``); the same value is stored in ``self.qpc_path``.

    Raises:
        :FileNotFoundError: If the resolved ONNX path is not an existing file.
    """
    if onnx_path is None and self.onnx_path is None:
        self.export()

    onnx_path = Path(onnx_path or self.onnx_path)
    compile_dir = Path(compile_dir or onnx_path.parent)
    qpc_path = compile_dir / "qpc"
    if not onnx_path.is_file():
        raise FileNotFoundError(f"ONNX file not found at: {onnx_path}")

    # Build the hash that distinguishes qpc directories compiled with different parameters.
    # NOTE: the update order and contents determine the directory name; changing them
    # would stop previously compiled qpcs from being found.
    compile_hash = hashlib.sha256(to_hashable("qnn"))

    if specializations is not None:
        compile_hash.update(to_hashable(specializations))

    if qnn_config is not None:
        # Hash the contents of the config file rather than its path.
        qnn_config_values = load_json(qnn_config)
        compile_hash.update(to_hashable(qnn_config_values))

    # The device count only contributes to the hash for multi-device compilation.
    if mdp_ts_num_devices > 1:
        compile_hash.update(to_hashable({"mdp_ts_num_devices": mdp_ts_num_devices}))

    compile_hash.update(to_hashable({"num_cores": num_cores}))
    compile_hash.update(to_hashable({"mxfp6_matmul": mxfp6_matmul}))
    compile_hash.update(to_hashable({"mxint8_kv_cache": mxint8_kv_cache}))

    # Check if already compiled
    compile_hash = compile_hash.hexdigest()[:16]
    qpc_path = qpc_path.with_name(qpc_path.name + "-" + compile_hash)
    if qpc_path.is_dir():
        if (qpc_path / "programqpc.bin").is_file():
            self.qpc_path = qpc_path
            return qpc_path
        # Probably compilation failure last time, delete directory to start over
        shutil.rmtree(qpc_path)

    # A caller-supplied compile_dir may not exist yet; create it so that writing
    # specializations.json below does not fail before the compiler is even invoked.
    compile_dir.mkdir(parents=True, exist_ok=True)

    # Write specializations.json file
    if specializations is not None:
        specializations_json = compile_dir / "specializations.json"
        with open(specializations_json, "w") as fp:
            json.dump(
                {"specializations": [{k: str(v) for k, v in spec.items()} for spec in specializations]},
                fp,
                indent=4,
            )

    # compile_dir holds the intermediate compilation files; the binaries go to qpc_path.
    qnn_compile(
        onnx_path=onnx_path,
        qpc_base_path=compile_dir,
        num_cores=num_cores,
        device_group=list(range(mdp_ts_num_devices)),
        batch_size=batch_size,
        prompt_len=prefill_seq_len,
        ctx_len=ctx_len,
        mxfp6=mxfp6_matmul,
        mxint8=mxint8_kv_cache,
        full_batch_size=full_batch_size,
        qnn_config=qnn_config,
        qnn_binary_dir=qpc_path,
    )

    self.qpc_path = qpc_path
    return qpc_path
2 changes: 1 addition & 1 deletion QEfficient/compile/compile_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def compile(
if enable_qnn:
qpc_path = qnn_compile(
onnx_path=onnx_path,
qpc_path=qpc_path,
qpc_base_path=qpc_path,
num_cores=num_cores,
batch_size=batch_size,
prompt_len=prompt_len,
Expand Down
48 changes: 27 additions & 21 deletions QEfficient/compile/qnn_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class QNN:
def __init__(
self,
onnx_path: str,
qpc_path: str,
qpc_base_path: str,
num_cores: int,
custom_io_path: str,
device_group: Optional[List[int]] = None,
Expand All @@ -37,10 +37,11 @@ def __init__(
compiler_mxfp6_matmul_weights: bool = True,
qnn_target: str = QnnConstants.TARGET,
qnn_config_path: Optional[str] = None,
qnn_binary_dir: Optional[str] = None,
**kwargs,
) -> None:
self.onnx_path = onnx_path
self.qpc_path = qpc_path
self.qpc_base_path = qpc_base_path
self.num_cores = num_cores
self.device_group = device_group
self.compiler_enable_depth_first = compiler_enable_depth_first
Expand All @@ -50,8 +51,9 @@ def __init__(
self.ctx_len = ctx_len
self.compiler_mxfp6_matmul_weights = compiler_mxfp6_matmul_weights
self.qnn_config_path = qnn_config_path
self.qnn_binary_dir = qnn_binary_dir
self.custom_io_path = custom_io_path
self.dlc_model_path = os.path.join(qpc_path, f"{QnnConstants.MODEL_NAME}.dlc")
self.dlc_model_path = os.path.join(qpc_base_path, f"{QnnConstants.MODEL_NAME}.dlc")
self.qnn_target = qnn_target
self.qnn_sdk_path = os.getenv(QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME)
if not self.qnn_sdk_path:
Expand Down Expand Up @@ -118,7 +120,7 @@ def create_qnn_tensor_slicing_json(self) -> str:
}
],
}
tensor_slicing_json_path = os.path.join(self.qpc_path, "tensor_slicing.json")
tensor_slicing_json_path = os.path.join(self.qpc_base_path, "tensor_slicing.json")
create_json(tensor_slicing_json_path, tensor_slicing)
return tensor_slicing_json_path

Expand Down Expand Up @@ -157,7 +159,7 @@ def create_qnn_compile_backend_json(self) -> str:
for key, value in self.qnn_config[QnnConstants.QNN_COMPILATION_BACKEND_STR].items():
qnn_compile_backend[key] = value

qnn_compile_backend_json_path = os.path.join(self.qpc_path, "qnn_compile_backend.json")
qnn_compile_backend_json_path = os.path.join(self.qpc_base_path, "qnn_compile_backend.json")
create_json(qnn_compile_backend_json_path, qnn_compile_backend)
return qnn_compile_backend_json_path

Expand All @@ -177,13 +179,13 @@ def create_qnn_compiler_config_json(self) -> str:
),
}
}
qnn_compiler_config_json_path = os.path.join(self.qpc_path, "qnn_compiler_config.json")
qnn_compiler_config_json_path = os.path.join(self.qpc_base_path, "qnn_compiler_config.json")
create_json(qnn_compiler_config_json_path, qnn_compiler_config)
return qnn_compiler_config_json_path

def compile(self) -> str:
"""
Compiles the given ``ONNX`` model during object creation using QNN compiler and saves the compiled ``qpc`` package at ``qpc_path``.
Compiles the given ``ONNX`` model during object creation using QNN compiler and saves the compiled ``qpc`` package at ``qnn_binary_dir``.
- Creates convertor command and convert onnx model to model.dlc using qairt-convertor
- command line arguments and qnn_config.json (if provided) are used to create qnn_compiler_config.json for context-binary-generator
- model.dlc from convertor stage is passed into context-binary-generator command to create programqpc.bin.
Expand All @@ -197,20 +199,21 @@ def compile(self) -> str:
and self.qnn_config[QnnConstants.SKIP_QNN_CONVERTOR_STEP_STR]
):
converter_cmd = self.converter()
execute_command("convertor", converter_cmd, self.qpc_path)
execute_command("convertor", converter_cmd, self.qpc_base_path)

if not os.path.isfile(self.dlc_model_path):
raise FileNotFoundError(
f"file {self.dlc_model_path} needs to exist in the qpc_path{self.qpc_path}. Please rerun infer/compile Api"
f"file {self.dlc_model_path} needs to exist in the qpc_base_path{self.qpc_base_path}. Please rerun infer/compile Api"
)

self.qnn_binary_dir = os.path.join(self.qpc_path, "qpcs")
if self.qnn_binary_dir is None:
self.qnn_binary_dir = os.path.join(self.qpc_base_path, "qpcs")
if os.path.isdir(self.qnn_binary_dir):
shutil.rmtree(self.qnn_binary_dir)
os.makedirs(self.qnn_binary_dir)

ctx_bin_cmd = self.generate_context_binary()
execute_command("context_binary", ctx_bin_cmd, self.qpc_path)
execute_command("context_binary", ctx_bin_cmd, self.qpc_base_path)

print("\n===================== Compilation Done! =====================\n")
return self.qnn_binary_dir
Expand All @@ -221,7 +224,7 @@ def converter(self) -> str:
IMMUTABLE parameters which can not be overridden by the user using qnn_config.json:
:input_network (str): Generated ``ONNX`` Model Path.
:output_path (str): Path to generated DLC file, which is provided qpc_path/model.dlc
:output_path (str): Path to generated DLC file, which is provided qpc_base_path/model.dlc
:io_config (str): Path to custom_io_config.yaml file created using GenerateQNNnetworkSpecializationconfig.py
:float_bias_bitwidth (int): Bitwidth to use for float bias tensor
:float_bitwidth (int): Converts the graph to the specified float bitwidth, either 32 or 16(Default).
Expand Down Expand Up @@ -255,8 +258,8 @@ def generate_context_binary(self) -> str:
IMMUTABLE parameters which can not be modified by the user using qnn_config.json:
:binary_file (str): QNN Binary Graph name to be generated (qnngraph.serialized).
:backend_binary (str): Path to generated QPC binary file, which is provided qpc_path/qpcs/programqpc.bin
:output_dir (str): Path to store generated Binaries (qpc_path/qpcs/).
:backend_binary (str): Generated QPC binary file name, which is provided programqpc.bin
:output_dir (str): Path to store generated Binaries (qnn_binary_dir).
:model (str): Path to the <qnn_model_name.so> file containing a QNN network.
:dlc_path (str): Path to DLC file generated by QNN-Convertor.
:config_file(str): Path to created qnn_compiler_config.json containing qnn_compile_backend.json & shared_library_path.
Expand Down Expand Up @@ -305,7 +308,7 @@ def generate_profiling(self):

def compile(
onnx_path: str,
qpc_path: str,
qpc_base_path: str,
num_cores: int,
device_group: Optional[List[int]] = None,
aic_enable_depth_first: bool = False,
Expand All @@ -318,16 +321,17 @@ def compile(
allow_mxint8_mdp_io: Optional[bool] = False,
full_batch_size=None,
qnn_config: Optional[str] = None,
qnn_binary_dir: Optional[str] = None,
**kwargs,
) -> str:
"""
Compiles the given ``ONNX`` model using QNN compiler and saves the compiled ``qpc`` package at ``qpc_path``.
Compiles the given ``ONNX`` model using QNN compiler and saves the compiled ``qpc`` package at ``qnn_binary_dir``.
Generates model.dlc during convertor stage, qnn_compile_backend.json for backend parameters of context-binary-generator.
Generates tensor-slicing configuration if multiple devices are passed in ``device_group``.
``Mandatory`` Args:
:onnx_path (str): Generated ``ONNX`` Model Path.
:qpc_path (str): Path for saving compiled qpc binaries.
:qpc_base_path (str): base directory for QNN compilation config & binary file.
:num_cores (int): Number of cores to compile the model on.
``Optional`` Args:
:device_group (List[int]): Used for finding the number of devices to compile for.
Expand All @@ -341,6 +345,7 @@ def compile(
:allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic ``Defaults to False.``
:mxint8 (bool): Compress Present/Past KV to ``MXINT8`` using ``CustomIO`` config. ``Defaults to False.``
:qnn_config (str): Path to ``qnn_config.json`` file (formatted as a string). ``Defaults to None.``
:qnn_binary_dir (str): Path for saving qnn binaries.
Returns:
:str: Path to compiled ``qpc`` package.
Expand All @@ -357,11 +362,11 @@ def compile(
if mxint8:
logger.warning("QNN doesn't support mxint8. Bypassing the value passed for mxint8")

os.makedirs(qpc_path, exist_ok=True)
os.makedirs(qpc_base_path, exist_ok=True)

# Created custom_io_config.yaml file for QNN-Convertor stage.
# TODO To make custom_io_config.yaml configurable as not all models need it.
custom_io_file_path = os.path.join(qpc_path, "custom_io_config.yaml")
custom_io_file_path = os.path.join(qpc_base_path, "custom_io_config.yaml")
fetch_nodes_info(
onnx_graph_path=onnx_path,
batch_size=batch_size,
Expand All @@ -373,12 +378,12 @@ def compile(

if not os.path.isfile(custom_io_file_path):
raise FileNotFoundError(
f"file {custom_io_file_path} needs to exist in the qpc_path for Compilation. Please rerun infer/compile Api"
f"file {custom_io_file_path} needs to exist in the qpc_base_path for Compilation. Please rerun infer/compile Api"
)

qnn_obj = QNN(
onnx_path=onnx_path,
qpc_path=qpc_path,
qpc_base_path=qpc_base_path,
num_cores=num_cores,
device_group=device_group,
qnn_config_path=qnn_config,
Expand All @@ -389,6 +394,7 @@ def compile(
prompt_len=prompt_len,
ctx_len=ctx_len,
compiler_mxfp6_matmul_weights=mxfp6,
qnn_binary_dir=qnn_binary_dir,
)

compiled_binary_path = qnn_obj.compile()
Expand Down
1 change: 1 addition & 0 deletions QEfficient/compile/qnn_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"context_binary_generator_args_extension": "--log_level debug",
"qnn_compilation_backend":
{
"compiler_enable_depth_first": true,
"compiler_printDDRStats": false,
"compiler_printPerfMetrics": false,
"compiler_stat_level": 10
Expand Down
Loading

0 comments on commit ed4cbdf

Please sign in to comment.