
Commit

Merge branch 'main' into bug/onnx_params
danielholanda authored Mar 28, 2024
2 parents 1aa3e7d + b814d0b commit 7d48583
Showing 20 changed files with 367 additions and 54 deletions.
@@ -28,7 +28,7 @@ def __init__(
inputs=None,
delay_before_benchmarking: str = "0",
):
- # Custom runtime args always arive as strings, so we need to convert them
+ # Custom runtime args always arrive as strings, so we need to convert them
# to the appropriate data type here
self.delay_before_benchmarking = int(delay_before_benchmarking)

@@ -86,7 +86,7 @@ def benchmark(self):
return MeasuredPerformance(
mean_latency=self.mean_latency,
throughput=self.throughput,
- device=self.device_name,
+ device=self.device_name(),
device_type=self.device_type,
runtime=self.runtime,
runtime_version=self.runtime_version,
@@ -111,9 +111,6 @@ def throughput(self) -> float:
"Queried throughput before self.benchmark() was called"
)

- @property
- def device_name(self) -> str:
- return (
- f"Device Family {self.device_type.family}, Device Part {self.device_type.part}, "
- f"Device Configuration {self.device_type.config}"
- )
+ @staticmethod
+ def device_name() -> str:
+ return "Example Device"
@@ -57,7 +57,7 @@ def benchmark(self) -> MeasuredPerformance:
return MeasuredPerformance(
mean_latency=self.mean_latency,
throughput=self.throughput,
- device=self.device_name,
+ device=self.device_name(),
device_type=self.device_type,
runtime=self.runtime,
runtime_version=self.runtime_version,
@@ -82,6 +82,6 @@ def throughput(self) -> float:
"Queried throughput before self.benchmark() was called"
)

- @property
- def device_name(self) -> str:
+ @staticmethod
+ def device_name() -> str:
return "the x86 cpu of your dreams"
48 changes: 48 additions & 0 deletions models/transformers/llama2_13b.py
@@ -0,0 +1,48 @@
# labels: name::llama2_13b author::transformers task::Generative_AI license::apache-2.0
from turnkeyml.parser import parse
from transformers import LlamaConfig, LlamaForCausalLM
import torch

torch.manual_seed(0)

# Parsing command-line arguments
pretrained, batch_size, max_seq_length, model_path = parse(
["pretrained", "batch_size", "max_seq_length", "model_path"]
)

# Model and input configurations
if pretrained:
if not model_path:
raise ValueError(
"TurnkeyML does not include pretrained weights for LLaMA2 "
"because it has special licensing terms. See for details: "
"https://huggingface.co/docs/transformers/model_doc/llama2"
)

model = LlamaForCausalLM.from_pretrained(model_path)
else:
config = LlamaConfig(
architectures=["LlamaForCausalLM"],
hidden_size=5120,
intermediate_size=13824,
max_position_embeddings=4096,
num_attention_heads=40,
num_hidden_layers=40,
num_key_value_heads=40,
pad_token_id=0,
vocab_size=32000,
use_cache=True,
)
model = LlamaForCausalLM(config)

inputs = {
"input_ids": torch.ones(batch_size, max_seq_length, dtype=torch.long),
"attention_mask": torch.ones(batch_size, max_seq_length, dtype=torch.float),
}

# Call model
# Generate two tokens so that we can instrument both the prefill
# and token generation stages.
# The token generation stage is the invocation that has "past_key_values"
# in the input shape.
model.generate(**inputs, max_length=max_seq_length + 2)
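As the comment at the end of this script notes, generating two tokens exercises both stages that TurnkeyML instruments: the prefill pass, which consumes the full prompt, and the token-generation pass, which is recognizable by past_key_values appearing among its inputs. A rough sketch of what those two invocations look like when written out by hand against the standard transformers forward API, continuing from the model and inputs defined above (the greedy next-token choice is illustrative):

import torch

with torch.no_grad():
    # Prefill: the whole prompt runs through the model once and the
    # key/value cache is returned because use_cache=True
    prefill = model(**inputs, use_cache=True)
    next_token = prefill.logits[:, -1:].argmax(dim=-1)

    # Token generation: only the new token is passed in, together with
    # past_key_values; this is the invocation whose input shapes
    # include "past_key_values"
    decode = model(
        input_ids=next_token,
        past_key_values=prefill.past_key_values,
        use_cache=True,
    )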
48 changes: 48 additions & 0 deletions models/transformers/llama2_34b.py
@@ -0,0 +1,48 @@
# labels: name::llama2_34b author::transformers task::Generative_AI license::apache-2.0
from turnkeyml.parser import parse
from transformers import LlamaConfig, LlamaForCausalLM
import torch

torch.manual_seed(0)

# Parsing command-line arguments
pretrained, batch_size, max_seq_length, model_path = parse(
["pretrained", "batch_size", "max_seq_length", "model_path"]
)

# Model and input configurations
if pretrained:
if not model_path:
raise ValueError(
"TurnkeyML does not include pretrained weights for LLaMA2 "
"because it has special licensing terms. See for details: "
"https://huggingface.co/docs/transformers/model_doc/llama2"
)

model = LlamaForCausalLM.from_pretrained(model_path)
else:
config = LlamaConfig(
architectures=["LlamaForCausalLM"],
hidden_size=8192,
intermediate_size=22016,
max_position_embeddings=4096,
num_attention_heads=64,
num_hidden_layers=48,
num_key_value_heads=8,
pad_token_id=0,
vocab_size=32000,
use_cache=True,
)
model = LlamaForCausalLM(config)

inputs = {
"input_ids": torch.ones(batch_size, max_seq_length, dtype=torch.long),
"attention_mask": torch.ones(batch_size, max_seq_length, dtype=torch.float),
}

# Call model
# Generate two tokens so that we can instrument both the prefill
# and token generation stages.
# The token generation stage is the invocation that has "past_key_values"
# in the input shape.
model.generate(**inputs, max_length=max_seq_length + 2)
48 changes: 48 additions & 0 deletions models/transformers/llama2_70b.py
@@ -0,0 +1,48 @@
# labels: name::llama2_70b author::transformers task::Generative_AI license::apache-2.0
from turnkeyml.parser import parse
from transformers import LlamaConfig, LlamaForCausalLM
import torch

torch.manual_seed(0)

# Parsing command-line arguments
pretrained, batch_size, max_seq_length, model_path = parse(
["pretrained", "batch_size", "max_seq_length", "model_path"]
)

# Model and input configurations
if pretrained:
if not model_path:
raise ValueError(
"TurnkeyML does not include pretrained weights for LLaMA2 "
"because it has special licensing terms. See for details: "
"https://huggingface.co/docs/transformers/model_doc/llama2"
)

model = LlamaForCausalLM.from_pretrained(model_path)
else:
config = LlamaConfig(
architectures=["LlamaForCausalLM"],
hidden_size=8192,
intermediate_size=28672,
max_position_embeddings=4096,
num_attention_heads=64,
num_hidden_layers=80,
num_key_value_heads=8,
pad_token_id=0,
vocab_size=32000,
use_cache=True,
)
model = LlamaForCausalLM(config)

inputs = {
"input_ids": torch.ones(batch_size, max_seq_length, dtype=torch.long),
"attention_mask": torch.ones(batch_size, max_seq_length, dtype=torch.float),
}

# Call model
# Generate two tokens so that we can instrument both the prefill
# and token generation stages.
# The token generation stage is the invocation that has "past_key_values"
# in the input shape.
model.generate(**inputs, max_length=max_seq_length + 2)
48 changes: 48 additions & 0 deletions models/transformers/llama2_7b.py
@@ -0,0 +1,48 @@
# labels: name::llama2_7b author::transformers task::Generative_AI license::apache-2.0
from turnkeyml.parser import parse
from transformers import LlamaConfig, LlamaForCausalLM
import torch

torch.manual_seed(0)

# Parsing command-line arguments
pretrained, batch_size, max_seq_length, model_path = parse(
["pretrained", "batch_size", "max_seq_length", "model_path"]
)

# Model and input configurations
if pretrained:
if not model_path:
raise ValueError(
"TurnkeyML does not include pretrained weights for LLaMA2 "
"because it has special licensing terms. See for details: "
"https://huggingface.co/docs/transformers/model_doc/llama2"
)

model = LlamaForCausalLM.from_pretrained(model_path)
else:
config = LlamaConfig(
architectures=["LlamaForCausalLM"],
hidden_size=4096,
intermediate_size=11008,
max_position_embeddings=4096,
num_attention_heads=32,
num_hidden_layers=32,
num_key_value_heads=32,
pad_token_id=0,
vocab_size=32000,
use_cache=True,
)
model = LlamaForCausalLM(config)

inputs = {
"input_ids": torch.ones(batch_size, max_seq_length, dtype=torch.long),
"attention_mask": torch.ones(batch_size, max_seq_length, dtype=torch.float),
}

# Call model
# Generate two tokens so that we can instrument both the prefill
# and token generation stages.
# The token generation stage is the invocation that has "past_key_values"
# in the input shape.
model.generate(**inputs, max_length=max_seq_length + 2)
35 changes: 35 additions & 0 deletions models/transformers/phi2.py
@@ -0,0 +1,35 @@
# labels: name::phi2 author::transformers task::Generative_AI license::mit
from turnkeyml.parser import parse
from transformers import AutoModelForCausalLM
import torch

torch.manual_seed(0)

# Parsing command-line arguments
pretrained, batch_size, max_seq_length = parse(
["pretrained", "batch_size", "max_seq_length"]
)

# Model and input configurations
if pretrained:
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2")
else:
raise ValueError(
"This model is only supported with pretrained weights, try again with --pretrained"
)

# Make sure the user's sequence length fits within the model's maximum
assert max_seq_length <= model.config.max_position_embeddings


inputs = {
"input_ids": torch.ones(batch_size, max_seq_length, dtype=torch.long),
"attention_mask": torch.ones(batch_size, max_seq_length, dtype=torch.float),
}

# Call model
# Generate two tokens so that we can instrument both the prefill
# and token generation stages.
# The token generation stage is the invocation that has "past_key_values"
# in the input shape.
model.generate(**inputs, max_length=max_seq_length + 2)
9 changes: 7 additions & 2 deletions src/turnkeyml/analyze/script.py
@@ -371,6 +371,11 @@ def explore_invocation(
fs.Keys.BENCHMARK_STATUS, build.FunctionStatus.NOT_STARTED.value
)

+ # Save the device name that will be used for the benchmark
+ stats.save_model_eval_stat(
+ fs.Keys.DEVICE, runtime_info["RuntimeClass"].device_name()
+ )

build_state = None
perf = None
benchmark_logfile_path = ""
@@ -667,7 +672,7 @@ def explore_frame(

# Starting in version 2.2.0, torch dynamo added wrappers to callbacks
# while tracing frames, which conflicts with TurnkeyML's analysis. Here,
- # we supress errors caused by those callback wrappers and only raise an
+ # we suppress errors caused by those callback wrappers and only raise an
# error if the compiled model actually tries to execute within TurnkeyML.
td = torch._dynamo # pylint: disable=protected-access
td.config.suppress_errors = True
@@ -788,7 +793,7 @@ def forward_spy(*args, **kwargs):
and invocation_info.is_target
and (model_info.build_model)
):
- # Disable all modifications while we evalute the model
+ # Disable all modifications while we evaluate the model
# This is needed in case a tool called during evaluation wants to
# trace the model. There are some scenarios (e.g., ipex.quantization.prepare),
# that raise an exception when they encounter forward_spy()
2 changes: 1 addition & 1 deletion src/turnkeyml/build/ignition.py
@@ -117,7 +117,7 @@ def validate_cached_model(
msg = (
f"Your build {state.config.build_name} was previously built against "
f"turnkey version {state.turnkey_version}, "
f"however you are now using onxxflow version {turnkey_version}. The previous build is "
f"however you are now using turnkey version {turnkey_version}. The previous build is "
f"incompatible with this version of turnkey, as indicated by the {out_of_date} "
"version number changing. See **docs/versioning.md** for details."
)
21 changes: 15 additions & 6 deletions src/turnkeyml/cli/spawn.py
@@ -13,6 +13,7 @@
import getpass
from typing import List, Optional, Dict, Union
from enum import Enum
+ import psutil
import turnkeyml.common.filesystem as filesystem
import turnkeyml.common.printing as printing
import turnkeyml.common.build as build
@@ -22,7 +23,7 @@

class WatchdogTimer(Thread):
"""
- Run *callback* in *timeout* seconds unless the timer is restarted.
+ Kill process in *timeout* seconds unless the timer is restarted.
This is needed because Popen natively supports streaming output to the terminal,
checking that output, and timeouts--but not all 3 at the same time.
@@ -31,11 +32,10 @@ class WatchdogTimer(Thread):
to stream and check output.
"""

- def __init__(self, timeout, callback, *args, timer=monotonic, **kwargs):
+ def __init__(self, timeout, pid, timer=monotonic, **kwargs):
super().__init__(**kwargs)
self.timeout = timeout
- self.callback = callback
- self.args = args
+ self.pid = pid
self.timer = timer
self.cancelled = Event()
self.blocked = Lock()
@@ -50,14 +50,20 @@ def run(self):
with self.blocked:
if self.deadline <= self.timer() and not self.cancelled.is_set():
self.timeout_reached = True
- return self.callback(*self.args)
+ return self.kill_process_tree()

def restart(self):
self.deadline = self.timer() + self.timeout

def cancel(self):
self.cancelled.set()

+ def kill_process_tree(self):
+ parent = psutil.Process(self.pid)
+ for child in parent.children(recursive=True):
+ child.kill()
+ parent.kill()


def parse_evaluation_id(line: str, current_value: str) -> Optional[str]:
"""
@@ -284,7 +290,7 @@ def run_turnkey(
# Create our own watchdog timer in a thread
# This is needed because the `for line in p.stdout` is a blocking
# call that is incompatible with Popen's native timeout features
- watchdog = WatchdogTimer(timeout, callback=p.kill, daemon=True)
+ watchdog = WatchdogTimer(timeout, p.pid)
watchdog.start()

# Print the subprocess's output to the command line as it comes in,
@@ -374,6 +380,9 @@
):
stats.save_model_eval_stat(key, evaluation_status.value)

+ # Save the exception into the error log stat
+ stats.save_model_eval_stat(filesystem.Keys.ERROR_LOG, str(e))

except Exception as stats_exception: # pylint: disable=broad-except
printing.log_info(
"Stats file found, but unable to perform cleanup due to "
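The WatchdogTimer changes above swap the simple callback=p.kill for a psutil-based kill of the entire process tree, so that any children started by the spawned subprocess are cleaned up when the timeout fires. A condensed usage sketch, with an illustrative stand-in command and timeout (the real command line and restart handling are built inside run_turnkey()):

import subprocess
from turnkeyml.cli.spawn import WatchdogTimer  # module path as in this commit

# Stand-in for the turnkey command that run_turnkey() assembles
cmd = ["python", "-u", "long_running_job.py"]
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)

# If the deadline passes, the watchdog kills p and, via psutil,
# every child process p has started
watchdog = WatchdogTimer(timeout=3600, pid=p.pid, daemon=True)
watchdog.start()

# Stream the subprocess output while keeping the watchdog fed
for line in p.stdout:
    print(line, end="")
    watchdog.restart()  # fresh output pushes the deadline back

watchdog.cancel()
p.wait()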