From 28c666cb1ba96eb8ae58b555f4b13ea096001df2 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 2 Nov 2023 12:16:40 -0600 Subject: [PATCH 1/2] Add hf: stub support to model_to_path (#1378) * Add hf: stub support to model_to_path * Refactor to use model_to_path in transformers/helpers --- src/deepsparse/transformers/helpers.py | 26 ++++++++------------------ src/deepsparse/utils/onnx.py | 15 +++++++++++++++ 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index d7acc71a99..cb3f43e484 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -28,8 +28,11 @@ from onnx import ModelProto from deepsparse.log import get_main_logger -from deepsparse.utils.onnx import _MODEL_DIR_ONNX_NAME, truncate_onnx_model -from sparsezoo import Model +from deepsparse.utils.onnx import ( + _MODEL_DIR_ONNX_NAME, + model_to_path, + truncate_onnx_model, +) from sparsezoo.utils import save_onnx @@ -71,22 +74,9 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]: ) return model_path, os.path.join(model_path, _MODEL_DIR_ONNX_NAME) - elif model_path.startswith("zoo:"): - zoo_model = Model(model_path) - deployment_path = zoo_model.deployment_directory_path - return deployment_path, os.path.join(deployment_path, _MODEL_DIR_ONNX_NAME) - elif model_path.startswith("hf:"): - from huggingface_hub import snapshot_download - - deployment_path = snapshot_download(repo_id=model_path.replace("hf:", "", 1)) - onnx_path = os.path.join(deployment_path, _MODEL_DIR_ONNX_NAME) - if not os.path.isfile(onnx_path): - raise ValueError( - f"{_MODEL_DIR_ONNX_NAME} not found in transformers model directory " - f"{deployment_path}. Be sure that an export of the model is written to " - f"{onnx_path}" - ) - return deployment_path, onnx_path + elif model_path.startswith("zoo:") or model_path.startswith("hf:"): + onnx_model_path = model_to_path(model_path) + return os.path.dirname(onnx_model_path), onnx_model_path else: raise ValueError( f"model_path {model_path} is not a valid file, directory, or zoo stub" diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index 86fcb1cef6..a3358c8f41 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -138,6 +138,21 @@ def model_to_path(model: Union[str, Model, File]) -> str: # get the downloaded_path -- will auto download if not on local system model = model.path + if isinstance(model, str) and model.startswith("hf:"): + # load Hugging Face model from stub + from huggingface_hub import snapshot_download + + deployment_path = snapshot_download(repo_id=model.replace("hf:", "", 1)) + onnx_path = os.path.join(deployment_path, _MODEL_DIR_ONNX_NAME) + if not os.path.isfile(onnx_path): + raise ValueError( + f"Could not find the ONNX model file '{_MODEL_DIR_ONNX_NAME}' in the " + f"Hugging Face Hub repository located at {deployment_path}. Please " + f"ensure the model has been correctly exported to ONNX format and " + f"exists in the repository." + ) + return onnx_path + if not isinstance(model, str): raise ValueError("unsupported type for model: {}".format(type(model))) From 02348f2310c376595e5046646a3f69a9ecdef388 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Thu, 2 Nov 2023 22:57:26 +0100 Subject: [PATCH 2/2] [Fix] The benchmark logic when internal kv cache is involved (#1377) * initial commit * fix indentation of logging --- src/deepsparse/benchmark/benchmark_model.py | 65 +++++++++++++-------- 1 file changed, 40 insertions(+), 25 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index d41488a306..2b32dc69af 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -364,37 +364,52 @@ def benchmark_model( model_path = model_to_path(model_path) cached_outputs = None - if not disable_kv_cache_overrides and has_model_kv_cache(model_path): - if not sequence_length: - sequence_length = infer_sequence_length(model_path) - if input_ids_length > sequence_length: + if has_model_kv_cache(model_path): + if not disable_kv_cache_overrides: + if not sequence_length: + sequence_length = infer_sequence_length(model_path) + if input_ids_length > sequence_length: + raise ValueError( + f"input_ids_length: {input_ids_length} " + f"must be less than sequence_length: {sequence_length}" + ) + + _LOGGER.info( + "Found model with KV cache support. " + "Benchmarking the autoregressive model with " + f"input_ids_length: {input_ids_length} and " + f"sequence length: {sequence_length}." + ) + + ( + model_path, + cached_outputs, + _, + ) = overwrite_onnx_model_inputs_for_kv_cache_models( + onnx_file_path=model_path, + input_ids_length=input_ids_length, + sequence_length=sequence_length, + batch_size=batch_size, + ) + + if internal_kv_cache and engine != DEEPSPARSE_ENGINE: raise ValueError( - f"input_ids_length: {input_ids_length} " - f"must be less than sequence_length: {sequence_length}" + "Attempting to benchmark a model using engine: " + f"{engine} and internal_kv_cache set to True. " + "The use of internal_kv_cache is only " + f"supported for the engine: {DEEPSPARSE_ENGINE}. " + f"To disable the use of the internal_kv_cache, " + f"set the flag: --no-internal-kv-cache" ) _LOGGER.info( - "Found model with KV cache support. " - "Benchmarking the autoregressive model with " - f"input_ids_length: {input_ids_length} and " - f"sequence length: {sequence_length}." + f"Benchmarking Engine: {engine} with " + f"{'internal' if internal_kv_cache else 'external'} KV cache management" ) - - model_path, cached_outs, _ = overwrite_onnx_model_inputs_for_kv_cache_models( - onnx_file_path=model_path, - input_ids_length=input_ids_length, - sequence_length=sequence_length, - batch_size=batch_size, - ) - - if internal_kv_cache: - _LOGGER.info( - "Benchmarking DeepSparse Engine with internal KV Cache management" - ) - cached_outputs = cached_outs else: input_ids_length = None sequence_length = None + internal_kv_cache = False num_streams = parse_num_streams(num_streams, num_cores, scenario) @@ -407,7 +422,7 @@ def benchmark_model( num_streams=num_streams, scheduler=scheduler, input_shapes=input_shapes, - cached_outputs=cached_outputs, + cached_outputs=cached_outputs if internal_kv_cache else None, ) elif engine == ORT_ENGINE: model = ORTEngine( @@ -450,7 +465,7 @@ def benchmark_model( seconds_to_run=time, seconds_to_warmup=warmup_time, num_streams=num_streams, - internal_kv_cache=cached_outputs, + internal_kv_cache=internal_kv_cache, ) export_dict = { "engine": str(model),