
Commit

Merge branch 'main' into bug/onnx_params
danielholanda authored Mar 28, 2024
2 parents 1aa3e7d + b814d0b commit 7d48583
Showing 20 changed files with 367 additions and 54 deletions.
@@ -28,7 +28,7 @@ def __init__(
inputs=None,
delay_before_benchmarking: str = "0",
):
- # Custom runtime args always arive as strings, so we need to convert them
+ # Custom runtime args always arrive as strings, so we need to convert them
# to the appropriate data type here
self.delay_before_benchmarking = int(delay_before_benchmarking)

@@ -86,7 +86,7 @@ def benchmark(self):
return MeasuredPerformance(
mean_latency=self.mean_latency,
throughput=self.throughput,
- device=self.device_name,
+ device=self.device_name(),
device_type=self.device_type,
runtime=self.runtime,
runtime_version=self.runtime_version,
@@ -111,9 +111,6 @@ def throughput(self) -> float:
"Queried throughput before self.benchmark() was called"
)

- @property
- def device_name(self) -> str:
- return (
- f"Device Family {self.device_type.family}, Device Part {self.device_type.part}, "
- f"Device Configuration {self.device_type.config}"
- )
+ @staticmethod
+ def device_name() -> str:
+ return "Example Device"
@@ -57,7 +57,7 @@ def benchmark(self) -> MeasuredPerformance:
return MeasuredPerformance(
mean_latency=self.mean_latency,
throughput=self.throughput,
- device=self.device_name,
+ device=self.device_name(),
device_type=self.device_type,
runtime=self.runtime,
runtime_version=self.runtime_version,
@@ -82,6 +82,6 @@ def throughput(self) -> float:
"Queried throughput before self.benchmark() was called"
)

- @property
- def device_name(self) -> str:
+ @staticmethod
+ def device_name() -> str:
return "the x86 cpu of your dreams"
48 changes: 48 additions & 0 deletions models/transformers/llama2_13b.py
@@ -0,0 +1,48 @@
# labels: name::llama2_13b author::transformers task::Generative_AI license::apache-2.0
from turnkeyml.parser import parse
from transformers import LlamaConfig, LlamaForCausalLM
import torch

torch.manual_seed(0)

# Parsing command-line arguments
pretrained, batch_size, max_seq_length, model_path = parse(
["pretrained", "batch_size", "max_seq_length", "model_path"]
)

# Model and input configurations
if pretrained:
if not model_path:
raise ValueError(
"TurnkeyML does not include pretrained weights for LLaMA2 "
"because it has special licensing terms. See for details: "
"https://huggingface.co/docs/transformers/model_doc/llama2"
)

model = LlamaForCausalLM.from_pretrained(model_path)
else:
config = LlamaConfig(
architectures=["LlamaForCausalLM"],
hidden_size=5120,
intermediate_size=13824,
max_position_embeddings=4096,
num_attention_heads=40,
num_hidden_layers=40,
num_key_value_heads=40,
pad_token_id=0,
vocab_size=32000,
use_cache=True,
)
model = LlamaForCausalLM(config)

inputs = {
"input_ids": torch.ones(batch_size, max_seq_length, dtype=torch.long),
"attention_mask": torch.ones(batch_size, max_seq_length, dtype=torch.float),
}

# Call model
# Generate two tokens so that we can instrument both the prefill
# and token generation stages.
# The token generation stage is the invocation that has "past_key_values"
# in the input shape.
model.generate(**inputs, max_length=max_seq_length + 2)
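As the comment at the end of this script notes, generating two tokens exercises both stages that TurnkeyML instruments: the prefill pass, which consumes the full prompt, and the token-generation pass, which is recognizable by past_key_values appearing among its inputs. A rough sketch of what those two invocations look like when written out by hand against the standard transformers forward API, continuing from the model and inputs defined above (the greedy next-token choice is illustrative):

import torch

with torch.no_grad():
    # Prefill: the whole prompt runs through the model once and the
    # key/value cache is returned because use_cache=True
    prefill = model(**inputs, use_cache=True)
    next_token = prefill.logits[:, -1:].argmax(dim=-1)

    # Token generation: only the new token is passed in, together with
    # past_key_values; this is the invocation whose input shapes
    # include "past_key_values"
    decode = model(
        input_ids=next_token,
        past_key_values=prefill.past_key_values,
        use_cache=True,
    )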
48 changes: 48 additions & 0 deletions models/transformers/llama2_34b.py
@@ -0,0 +1,48 @@
# labels: name::llama2_34b author::transformers task::Generative_AI license::apache-2.0
from turnkeyml.parser import parse
from transformers import LlamaConfig, LlamaForCausalLM
import torch

torch.manual_seed(0)

# Parsing command-line arguments
pretrained, batch_size, max_seq_length, model_path = parse(
["pretrained", "batch_size", "max_seq_length", "model_path"]
)

# Model and input configurations
if pretrained:
if not model_path:
raise ValueError(
"TurnkeyML does not include pretrained weights for LLaMA2 "
"because it has special licensing terms. See for details: "
"https://huggingface.co/docs/transformers/model_doc/llama2"
)

model = LlamaForCausalLM.from_pretrained(model_path)
else:
config = LlamaConfig(
architectures=["LlamaForCausalLM"],
hidden_size=8192,
intermediate_size=22016,
max_position_embeddings=4096,
num_attention_heads=64,
num_hidden_layers=48,
num_key_value_heads=8,
pad_token_id=0,
vocab_size=32000,
use_cache=True,
)
model = LlamaForCausalLM(config)

inputs = {
"input_ids": torch.ones(batch_size, max_seq_length, dtype=torch.long),
"attention_mask": torch.ones(batch_size, max_seq_length, dtype=torch.float),
}

# Call model
# Generate two tokens so that we can instrument both the prefill
# and token generation stages.
# The token generation stage is the invocation that has "past_key_values"
# in the input shape.
model.generate(**inputs, max_length=max_seq_length + 2)
48 changes: 48 additions & 0 deletions models/transformers/llama2_70b.py
@@ -0,0 +1,48 @@
# labels: name::llama2_70b author::transformers task::Generative_AI license::apache-2.0
from turnkeyml.parser import parse
from transformers import LlamaConfig, LlamaForCausalLM
import torch

torch.manual_seed(0)

# Parsing command-line arguments
pretrained, batch_size, max_seq_length, model_path = parse(
["pretrained", "batch_size", "max_seq_length", "model_path"]
)

# Model and input configurations
if pretrained:
if not model_path:
raise ValueError(
"TurnkeyML does not include pretrained weights for LLaMA2 "
"because it has special licensing terms. See for details: "
"https://huggingface.co/docs/transformers/model_doc/llama2"
)

model = LlamaForCausalLM.from_pretrained(model_path)
else:
config = LlamaConfig(
architectures=["LlamaForCausalLM"],
hidden_size=8192,
intermediate_size=28672,
max_position_embeddings=4096,
num_attention_heads=64,
num_hidden_layers=80,
num_key_value_heads=8,
pad_token_id=0,
vocab_size=32000,
use_cache=True,
)
model = LlamaForCausalLM(config)

inputs = {
"input_ids": torch.ones(batch_size, max_seq_length, dtype=torch.long),
"attention_mask": torch.ones(batch_size, max_seq_length, dtype=torch.float),
}

# Call model
# Generate two tokens so that we can instrument both the prefill
# and token generation stages.
# The token generation stage is the invocation that has "past_key_values"
# in the input shape.
model.generate(**inputs, max_length=max_seq_length + 2)
48 changes: 48 additions & 0 deletions models/transformers/llama2_7b.py
@@ -0,0 +1,48 @@
# labels: name::llama2_7b author::transformers task::Generative_AI license::apache-2.0
from turnkeyml.parser import parse
from transformers import LlamaConfig, LlamaForCausalLM
import torch

torch.manual_seed(0)

# Parsing command-line arguments
pretrained, batch_size, max_seq_length, model_path = parse(
["pretrained", "batch_size", "max_seq_length", "model_path"]
)

# Model and input configurations
if pretrained:
if not model_path:
raise ValueError(
"TurnkeyML does not include pretrained weights for LLaMA2 "
"because it has special licensing terms. See for details: "
"https://huggingface.co/docs/transformers/model_doc/llama2"
)

model = LlamaForCausalLM.from_pretrained(model_path)
else:
config = LlamaConfig(
architectures=["LlamaForCausalLM"],
hidden_size=4096,
intermediate_size=11008,
max_position_embeddings=4096,
num_attention_heads=32,
num_hidden_layers=32,
num_key_value_heads=32,
pad_token_id=0,
vocab_size=32000,
use_cache=True,
)
model = LlamaForCausalLM(config)

inputs = {
"input_ids": torch.ones(batch_size, max_seq_length, dtype=torch.long),
"attention_mask": torch.ones(batch_size, max_seq_length, dtype=torch.float),
}

# Call model
# Generate two tokens so that we can instrument both the prefill
# and token generation stages.
# The token generation stage is the invocation that has "past_key_values"
# in the input shape.
model.generate(**inputs, max_length=max_seq_length + 2)
35 changes: 35 additions & 0 deletions models/transformers/phi2.py
@@ -0,0 +1,35 @@
# labels: name::phi2 author::transformers task::Generative_AI license::mit
from turnkeyml.parser import parse
from transformers import AutoModelForCausalLM
import torch

torch.manual_seed(0)

# Parsing command-line arguments
pretrained, batch_size, max_seq_length = parse(
["pretrained", "batch_size", "max_seq_length"]
)

# Model and input configurations
if pretrained:
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2")
else:
raise ValueError(
"This model is only supported with pretrained weights, try again with --pretrained"
)

# Make sure the user's sequence length fits within the model's maximum
assert max_seq_length <= model.config.max_position_embeddings


inputs = {
"input_ids": torch.ones(batch_size, max_seq_length, dtype=torch.long),
"attention_mask": torch.ones(batch_size, max_seq_length, dtype=torch.float),
}

# Call model
# Generate two tokens so that we can instrument both the prefill
# and token generation stages.
# The token generation stage is the invocation that has "past_key_values"
# in the input shape.
model.generate(**inputs, max_length=max_seq_length + 2)
9 changes: 7 additions & 2 deletions src/turnkeyml/analyze/script.py
@@ -371,6 +371,11 @@ def explore_invocation(
fs.Keys.BENCHMARK_STATUS, build.FunctionStatus.NOT_STARTED.value
)

+ # Save the device name that will be used for the benchmark
+ stats.save_model_eval_stat(
+ fs.Keys.DEVICE, runtime_info["RuntimeClass"].device_name()
+ )

build_state = None
perf = None
benchmark_logfile_path = ""
@@ -667,7 +672,7 @@ def explore_frame(

# Starting in version 2.2.0, torch dynamo added wrappers to callbacks
# while tracing frames, which conflicts with TurnkeyML's analysis. Here,
- # we supress errors caused by those callback wrappers and only raise an
+ # we suppress errors caused by those callback wrappers and only raise an
# error if the compiled model actually tries to execute within TurnkeyML.
td = torch._dynamo # pylint: disable=protected-access
td.config.suppress_errors = True
@@ -788,7 +793,7 @@ def forward_spy(*args, **kwargs):
and invocation_info.is_target
and (model_info.build_model)
):
- # Disable all modifications while we evalute the model
+ # Disable all modifications while we evaluate the model
# This is needed in case a tool called during evaluation wants to
# trace the model. There are some scenarios (e.g., ipex.quantization.prepare),
# that raise an exception when they encounter forward_spy()
2 changes: 1 addition & 1 deletion src/turnkeyml/build/ignition.py
@@ -117,7 +117,7 @@ def validate_cached_model(
msg = (
f"Your build {state.config.build_name} was previously built against "
f"turnkey version {state.turnkey_version}, "
f"however you are now using onxxflow version {turnkey_version}. The previous build is "
f"however you are now using turnkey version {turnkey_version}. The previous build is "
f"incompatible with this version of turnkey, as indicated by the {out_of_date} "
"version number changing. See **docs/versioning.md** for details."
)
21 changes: 15 additions & 6 deletions src/turnkeyml/cli/spawn.py
@@ -13,6 +13,7 @@
import getpass
from typing import List, Optional, Dict, Union
from enum import Enum
+ import psutil
import turnkeyml.common.filesystem as filesystem
import turnkeyml.common.printing as printing
import turnkeyml.common.build as build
@@ -22,7 +23,7 @@

class WatchdogTimer(Thread):
"""
- Run *callback* in *timeout* seconds unless the timer is restarted.
+ Kill process in *timeout* seconds unless the timer is restarted.
This is needed because Popen natively supports streaming output to the terminal,
checking that output, and timeouts--but not all 3 at the same time.
@@ -31,11 +32,10 @@ class WatchdogTimer(Thread):
to stream and check output.
"""

- def __init__(self, timeout, callback, *args, timer=monotonic, **kwargs):
+ def __init__(self, timeout, pid, timer=monotonic, **kwargs):
super().__init__(**kwargs)
self.timeout = timeout
- self.callback = callback
- self.args = args
+ self.pid = pid
self.timer = timer
self.cancelled = Event()
self.blocked = Lock()
@@ -50,14 +50,20 @@ def run(self):
with self.blocked:
if self.deadline <= self.timer() and not self.cancelled.is_set():
self.timeout_reached = True
- return self.callback(*self.args)
+ return self.kill_process_tree()

def restart(self):
self.deadline = self.timer() + self.timeout

def cancel(self):
self.cancelled.set()

+ def kill_process_tree(self):
+ parent = psutil.Process(self.pid)
+ for child in parent.children(recursive=True):
+ child.kill()
+ parent.kill()


def parse_evaluation_id(line: str, current_value: str) -> Optional[str]:
"""
@@ -284,7 +290,7 @@ def run_turnkey(
# Create our own watchdog timer in a thread
# This is needed because the `for line in p.stdout` is a blocking
# call that is incompatible with Popen's native timeout features
- watchdog = WatchdogTimer(timeout, callback=p.kill, daemon=True)
+ watchdog = WatchdogTimer(timeout, p.pid)
watchdog.start()

# Print the subprocess's output to the command line as it comes in,
@@ -374,6 +380,9 @@
):
stats.save_model_eval_stat(key, evaluation_status.value)

+ # Save the exception into the error log stat
+ stats.save_model_eval_stat(filesystem.Keys.ERROR_LOG, str(e))

except Exception as stats_exception: # pylint: disable=broad-except
printing.log_info(
"Stats file found, but unable to perform cleanup due to "
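The WatchdogTimer changes above swap the simple callback=p.kill for a psutil-based kill of the entire process tree, so that any children started by the spawned subprocess are cleaned up when the timeout fires. A condensed usage sketch, with an illustrative stand-in command and timeout (the real command line and restart handling are built inside run_turnkey()):

import subprocess
from turnkeyml.cli.spawn import WatchdogTimer  # module path as in this commit

# Stand-in for the turnkey command that run_turnkey() assembles
cmd = ["python", "-u", "long_running_job.py"]
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)

# If the deadline passes, the watchdog kills p and, via psutil,
# every child process p has started
watchdog = WatchdogTimer(timeout=3600, pid=p.pid, daemon=True)
watchdog.start()

# Stream the subprocess output while keeping the watchdog fed
for line in p.stdout:
    print(line, end="")
    watchdog.restart()  # fresh output pushes the deadline back

watchdog.cancel()
p.wait()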