Merge branch 'main' into feature/damian/llm-harness
dbogunowicz authored Nov 3, 2023
2 parents edead87 + 02348f2 commit cea51e1
Showing 3 changed files with 63 additions and 43 deletions.
65 changes: 40 additions & 25 deletions src/deepsparse/benchmark/benchmark_model.py
@@ -364,37 +364,52 @@ def benchmark_model(
     model_path = model_to_path(model_path)

     cached_outputs = None
-    if not disable_kv_cache_overrides and has_model_kv_cache(model_path):
-        if not sequence_length:
-            sequence_length = infer_sequence_length(model_path)
-        if input_ids_length > sequence_length:
+    if has_model_kv_cache(model_path):
+        if not disable_kv_cache_overrides:
+            if not sequence_length:
+                sequence_length = infer_sequence_length(model_path)
+            if input_ids_length > sequence_length:
+                raise ValueError(
+                    f"input_ids_length: {input_ids_length} "
+                    f"must be less than sequence_length: {sequence_length}"
+                )
+
+            _LOGGER.info(
+                "Found model with KV cache support. "
+                "Benchmarking the autoregressive model with "
+                f"input_ids_length: {input_ids_length} and "
+                f"sequence length: {sequence_length}."
+            )
+
+            (
+                model_path,
+                cached_outputs,
+                _,
+            ) = overwrite_onnx_model_inputs_for_kv_cache_models(
+                onnx_file_path=model_path,
+                input_ids_length=input_ids_length,
+                sequence_length=sequence_length,
+                batch_size=batch_size,
+            )
+
+        if internal_kv_cache and engine != DEEPSPARSE_ENGINE:
             raise ValueError(
-                f"input_ids_length: {input_ids_length} "
-                f"must be less than sequence_length: {sequence_length}"
+                "Attempting to benchmark a model using engine: "
+                f"{engine} and internal_kv_cache set to True. "
+                "The use of internal_kv_cache is only "
+                f"supported for the engine: {DEEPSPARSE_ENGINE}. "
+                f"To disable the use of the internal_kv_cache, "
+                f"set the flag: --no-internal-kv-cache"
             )

         _LOGGER.info(
-            "Found model with KV cache support. "
-            "Benchmarking the autoregressive model with "
-            f"input_ids_length: {input_ids_length} and "
-            f"sequence length: {sequence_length}."
+            f"Benchmarking Engine: {engine} with "
+            f"{'internal' if internal_kv_cache else 'external'} KV cache management"
         )
-
-        model_path, cached_outs, _ = overwrite_onnx_model_inputs_for_kv_cache_models(
-            onnx_file_path=model_path,
-            input_ids_length=input_ids_length,
-            sequence_length=sequence_length,
-            batch_size=batch_size,
-        )
-
-        if internal_kv_cache:
-            _LOGGER.info(
-                "Benchmarking DeepSparse Engine with internal KV Cache management"
-            )
-            cached_outputs = cached_outs
+    else:
+        input_ids_length = None
+        sequence_length = None
+        internal_kv_cache = False

     num_streams = parse_num_streams(num_streams, num_cores, scenario)
@@ -407,7 +422,7 @@ def benchmark_model(
             num_streams=num_streams,
             scheduler=scheduler,
             input_shapes=input_shapes,
-            cached_outputs=cached_outputs,
+            cached_outputs=cached_outputs if internal_kv_cache else None,
         )
     elif engine == ORT_ENGINE:
         model = ORTEngine(
@@ -450,7 +465,7 @@ def benchmark_model(
         seconds_to_run=time,
         seconds_to_warmup=warmup_time,
         num_streams=num_streams,
-        internal_kv_cache=cached_outputs,
+        internal_kv_cache=internal_kv_cache,
     )
     export_dict = {
         "engine": str(model),
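Taken together, the benchmark_model changes do three things: gate all KV-cache handling on has_model_kv_cache, reject internal_kv_cache up front for any engine other than DeepSparse, and neutralize the sequence settings for models without a cache. A minimal standalone sketch of the resulting control flow (illustrative only, not part of the commit; the value of DEEPSPARSE_ENGINE is an assumption):

# Illustrative sketch only -- mirrors the merged gating logic, not the real API.
DEEPSPARSE_ENGINE = "deepsparse"  # assumed value of the engine constant

def resolve_kv_cache_settings(
    has_kv_cache: bool,
    internal_kv_cache: bool,
    engine: str,
    input_ids_length: int,
    sequence_length: int,
):
    """Return (input_ids_length, sequence_length, internal_kv_cache) after gating."""
    if not has_kv_cache:
        # No KV cache: neutralize all autoregressive settings,
        # as the new else-branch does.
        return None, None, False
    if internal_kv_cache and engine != DEEPSPARSE_ENGINE:
        # Internal cache management is only supported by the DeepSparse engine.
        raise ValueError(
            f"internal_kv_cache is only supported for engine: {DEEPSPARSE_ENGINE}"
        )
    if input_ids_length > sequence_length:
        raise ValueError(
            f"input_ids_length: {input_ids_length} "
            f"must be less than sequence_length: {sequence_length}"
        )
    return input_ids_length, sequence_length, internal_kv_cache

print(resolve_kv_cache_settings(False, True, "onnxruntime", 1, 2048))  # (None, None, False)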
26 changes: 8 additions & 18 deletions src/deepsparse/transformers/helpers.py
@@ -28,8 +28,11 @@
 from onnx import ModelProto

 from deepsparse.log import get_main_logger
-from deepsparse.utils.onnx import _MODEL_DIR_ONNX_NAME, truncate_onnx_model
-from sparsezoo import Model
+from deepsparse.utils.onnx import (
+    _MODEL_DIR_ONNX_NAME,
+    model_to_path,
+    truncate_onnx_model,
+)
 from sparsezoo.utils import save_onnx
@@ -71,22 +74,9 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]:
         )
         return model_path, os.path.join(model_path, _MODEL_DIR_ONNX_NAME)

-    elif model_path.startswith("zoo:"):
-        zoo_model = Model(model_path)
-        deployment_path = zoo_model.deployment_directory_path
-        return deployment_path, os.path.join(deployment_path, _MODEL_DIR_ONNX_NAME)
-    elif model_path.startswith("hf:"):
-        from huggingface_hub import snapshot_download
-
-        deployment_path = snapshot_download(repo_id=model_path.replace("hf:", "", 1))
-        onnx_path = os.path.join(deployment_path, _MODEL_DIR_ONNX_NAME)
-        if not os.path.isfile(onnx_path):
-            raise ValueError(
-                f"{_MODEL_DIR_ONNX_NAME} not found in transformers model directory "
-                f"{deployment_path}. Be sure that an export of the model is written to "
-                f"{onnx_path}"
-            )
-        return deployment_path, onnx_path
+    elif model_path.startswith("zoo:") or model_path.startswith("hf:"):
+        onnx_model_path = model_to_path(model_path)
+        return os.path.dirname(onnx_model_path), onnx_model_path
     else:
         raise ValueError(
             f"model_path {model_path} is not a valid file, directory, or zoo stub"
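After this change both stub schemes funnel through model_to_path, so get_deployment_path only needs the directory of the resolved ONNX file. A hedged usage sketch (the stub strings below are placeholders, not verified models; substitute real SparseZoo or Hugging Face stubs):

# Hypothetical usage; both stubs are placeholders and will not resolve as written.
from deepsparse.transformers.helpers import get_deployment_path

for stub in ("zoo:some/model/stub", "hf:some-org/some-onnx-repo"):
    deployment_dir, onnx_file = get_deployment_path(stub)
    print(deployment_dir, onnx_file)  # onnx_file ends in model.onnx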
15 changes: 15 additions & 0 deletions src/deepsparse/utils/onnx.py
@@ -138,6 +138,21 @@ def model_to_path(model: Union[str, Model, File]) -> str:
         # get the downloaded_path -- will auto download if not on local system
         model = model.path

+    if isinstance(model, str) and model.startswith("hf:"):
+        # load Hugging Face model from stub
+        from huggingface_hub import snapshot_download
+
+        deployment_path = snapshot_download(repo_id=model.replace("hf:", "", 1))
+        onnx_path = os.path.join(deployment_path, _MODEL_DIR_ONNX_NAME)
+        if not os.path.isfile(onnx_path):
+            raise ValueError(
+                f"Could not find the ONNX model file '{_MODEL_DIR_ONNX_NAME}' in the "
+                f"Hugging Face Hub repository located at {deployment_path}. Please "
+                f"ensure the model has been correctly exported to ONNX format and "
+                f"exists in the repository."
+            )
+        return onnx_path
+
     if not isinstance(model, str):
         raise ValueError("unsupported type for model: {}".format(type(model)))
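The new branch makes any "hf:"-prefixed string resolve to a local model.onnx fetched with huggingface_hub's snapshot_download. A sketch, assuming huggingface_hub is installed and the repository (a placeholder id here) actually contains a model.onnx export:

# Hypothetical usage; the repo id is a placeholder.
from deepsparse.utils.onnx import model_to_path

onnx_path = model_to_path("hf:some-org/some-onnx-model")
print(onnx_path)  # .../<snapshot-dir>/model.onnx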
