Commit: Merge branch 'dev' into onnx-mistral
Showing 12 changed files with 552 additions and 662 deletions.
@@ -1,144 +1,132 @@
import argparse
import logging
import os
import sys
import time
from collections import defaultdict
from typing import Optional

import numpy as np
import torch
from optimum.nvidia import AutoModelForCausalLM
from transformers import AutoTokenizer

logging.getLogger("transformers").setLevel(logging.ERROR)
logging.basicConfig(
    stream=sys.stdout,
    level=print,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
sys.path.append("/mnt")
sys.path.append("/mnt/benchmarks/")

from common.base import BaseBenchmarkClass  # noqa
from common.utils import launch_cli, make_report  # noqa


def log_and_print(message: str) -> None:
    print(message)
    logging.info(message)


class LlamaOptimumNvidiaBenchmark:
class OptimumBenchmark(BaseBenchmarkClass):
    def __init__(
        self, model_path: str, precision: str, device: Optional[str] = "cuda"
        self,
        model_path: str,
        model_name: str,
        benchmark_name: str,
        precision: str,
        device: str,
        experiment_name: str,
    ) -> None:
        self.model_path = model_path
        self.precision = precision
        self.results = []
        self.precision_to_dtype_map = {
            "fp16": torch.float16,
            "fp32": torch.float32,
        }

        # some of the conditions where things can not be supported
        assert precision in ["fp16", "fp32"], ValueError(
            "Supported precisions are: fp16', 'fp32'"
        assert precision in ["float32", "float16"], ValueError(
            "Supported precision: 'float32' and 'float16'"
        )
        super().__init__(
            model_name=model_name,
            model_path=model_path,
            benchmark_name=benchmark_name,
            experiment_name=experiment_name,
            precision=precision,
            device=device,
            root_folder="/mnt/benchmarks",
        )
        assert device in ["cuda"], ValueError("Supported devices are: 'cuda'")

        self.model_args = {
            "torch_dtype": self.precision_to_dtype_map[self.precision],
        }
        self.device = device
        if model_name == "llama":
            self.tokenizer_folder = os.path.join(
                self.root_folder, "models", "llama-2-7b-chat-hf"
            )
        else:
            self.tokenizer_folder = os.path.join(
                self.root_folder, "models", "mistral-7b-v0.1-instruct-hf"
            )

    def load_model(self):
        """Loads the model into various formats and device"""
    def load_model_and_tokenizer(self):
        dtype_mapper = {"float16": torch.float16, "float32": torch.float32}
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_path, **self.model_args
            pretrained_model_name_or_path=self.model_path,
            torch_dtype=dtype_mapper[self.precision],
        )

        # Hardcoding this for now.
        self.tokenizer = AutoTokenizer.from_pretrained("/mnt/models/llama-2-7b-hf")
        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder)
        return self

    def run_model(self, prompt: str, max_tokens: int) -> float:
        tokenized_input = self.tokenizer.encode(prompt, return_tensors="pt").to(
            self.device
        )
        start = time.time()
        generated = self.model.generate(
            input_ids=tokenized_input, max_new_tokens=max_tokens
        )[0]
        delta = time.time() - start

        output = generated.detach().cpu().numpy()
        decoded = self.tokenizer.decode(output[0][0], skip_special_tokens=True)
        return len(self.tokenizer.encode(decoded)) / delta

    def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
        for i in range(repetitions):
            log_and_print(
                f"Running repetition [{str(i+1).zfill(len(str(repetitions)))}/{repetitions}]"
    def preprocess(self, prompt: str, chat_mode: bool = True, for_benchmarks=True):
        if chat_mode:
            template = self.get_chat_template_with_instruction(
                prompt=prompt, for_benchmarks=for_benchmarks
            )
            tokens_per_second = self.run_model(prompt, max_tokens)
            self.results.append(tokens_per_second)
        del self.model
        if self.device == "cuda":
            prompt = self.tokenizer.apply_chat_template(template, tokenize=False)

        tokenized_input = self.tokenizer.encode(text=prompt)
        tensor = self.tokenizer(prompt, return_tensors="pt")
        return {
            "prompt": prompt,
            "input_tokens": tokenized_input,
            "tensor": tensor,
            "num_input_tokens": len(tokenized_input),
        }

    def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict:
        tensor = inputs["tensor"]
        num_input_tokens = inputs["num_input_tokens"]

        generated, _ = self.model.generate(
            **tensor,
            top_k=40,
            top_p=0.1,
            pad_token_id=self.tokenizer.eos_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            temperature=temperature,
            max_new_tokens=max_tokens,
        )

        output_tokens = generated[0].detach().tolist()[num_input_tokens:]
        return {"output_tokens": output_tokens, "num_output_tokens": len(output_tokens)}

    def postprocess(self, output: dict) -> str:
        output_tokens = output["output_tokens"]
        output_text = self.tokenizer.decode(output_tokens, skip_special_tokens=True)
        return output_text

    def on_exit(self):
        if self.device == "cuda:0":
            del self.model
            torch.cuda.synchronize()
        else:
            del self.model


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Nvidia Optimum Benchmark.")
    parser.add_argument(
        "--prompt",
        type=str,
        help="The prompt for the model.",
    )
    parser.add_argument("--max_tokens", type=int, help="The maximum number of tokens.")
    parser.add_argument(
        "--repetitions",
        type=int,
        help="The number of repetitions for the benchmark.",
    )
    parser.add_argument(
        "--device",
        help="Device to use for the benchmark.",
    )
    parser.add_argument(
        "--log_file",
        type=str,
        help="Path to the log file for writing logs (in append mode).",
    )
    parser.add_argument(
        "--models_dir",
        type=str,
        help="Path to the models directory.",
    )
    parser = launch_cli(description="HF-Optimum Nvidia Benchmark.")
    args = parser.parse_args()
    log_and_print(
        f"Running benchmark with: max_tokens={args.max_tokens} prompt={args.prompt} "
        + f"repetitions={args.repetitions} device={args.device}"
    )
    report = defaultdict(lambda: defaultdict(float))

    for precision in ("fp16", "fp32"):
        log_and_print(f"Running Optimum-Nvidia on Llama with precision: {precision}")
        llama_transformers_pytorch_benchmark = LlamaOptimumNvidiaBenchmark(
            model_path=args.models_dir,
            device=args.device,
            precision=precision,
        ).load_model()
        llama_transformers_pytorch_benchmark.benchmark(
            max_tokens=args.max_tokens, prompt=args.prompt, repetitions=args.repetitions
        )
    model_folder = "/mnt/benchmarks/models"
    model_name = (
        f"{args.model_name}-2-7b-chat-optimum"
        if args.model_name == "llama"
        else f"{args.model_name}-7b-v0.1-instruct-optimum"
    )

        report["llama_optimum_nvidia"][precision] = {
            "mean": np.mean(llama_transformers_pytorch_benchmark.results),
            "std": np.std(llama_transformers_pytorch_benchmark.results),
        }
    log_and_print("Benchmark Report")
    with open(args.log_file, "a") as file:
        for framework, quantizations in report.items():
            for quantization, stats in quantizations.items():
                log_and_print(
                    f"{framework}, {quantization}: {stats['mean']:.2f} ± {stats['std']:.2f}"
                )
                print(
                    f"{framework}, {quantization}: {stats['mean']:.2f} ± {stats['std']:.2f}",
                    file=file,
                )
    runner_dict = {
        "cuda": [
            {
                "precision": "float32",
                "model_path": os.path.join(model_folder, model_name + "-float32"),
            },
            {
                "precision": "float16",
                "model_path": os.path.join(model_folder, model_name + "-float16"),
            },
        ]
    }

    make_report(
        args=args,
        benchmark_class=OptimumBenchmark,
        runner_dict=runner_dict,
        benchmark_name="HF-Optimum Nvidia",
        is_bench_pytorch=False,
    )
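
The refactored class leans on shared harness code (common.base and common.utils) that is not part of this diff. Read from the calls made in this file alone, the base class is assumed to expose roughly the surface sketched below; this is an inference for orientation, not the implementation that actually lives in the repository.

# Assumed shape of common.base.BaseBenchmarkClass, inferred only from how
# OptimumBenchmark uses it in this diff; the real class may differ.
class BaseBenchmarkClass:
    def __init__(self, model_name, model_path, benchmark_name, experiment_name,
                 precision, device, root_folder):
        # The subclass later reads self.precision, self.device, self.root_folder
        # and self.model_path, so the base presumably stores its arguments.
        self.model_name = model_name
        self.model_path = model_path
        self.benchmark_name = benchmark_name
        self.experiment_name = experiment_name
        self.precision = precision
        self.device = device
        self.root_folder = root_folder

    def get_chat_template_with_instruction(self, prompt: str, for_benchmarks: bool):
        # Fed straight into tokenizer.apply_chat_template(..., tokenize=False),
        # so it is assumed to return a list of chat messages.
        return [{"role": "user", "content": prompt}]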
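
Putting the pieces together, make_report from common.utils presumably constructs OptimumBenchmark once per runner_dict entry and drives its lifecycle. Below is a minimal manual walk-through of that assumed sequence; the model path, prompt, and sampling settings are illustrative placeholders, not values taken from this commit.

# Hypothetical manual run of the lifecycle the harness is assumed to drive:
# construct -> load -> preprocess -> run -> postprocess -> clean up.
bench = OptimumBenchmark(
    model_path="/mnt/benchmarks/models/llama-2-7b-chat-optimum-float16",
    model_name="llama",
    benchmark_name="HF-Optimum Nvidia",
    precision="float16",
    device="cuda",
    experiment_name="optimum-nvidia-demo",
).load_model_and_tokenizer()

inputs = bench.preprocess(prompt="What is post-training quantization?", chat_mode=True)
output = bench.run_model(inputs, max_tokens=128, temperature=0.1)
print(bench.postprocess(output))  # decoded completion text
bench.on_exit()                   # frees the model; syncs CUDA when device is "cuda:0"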