Commit
Merge branch 'dev' into onnx-mistral
Anindyadeep authored Apr 24, 2024
2 parents a7359d3 + 454b4c0 commit 63fa1da
Showing 12 changed files with 552 additions and 662 deletions.
51 changes: 39 additions & 12 deletions bench_optimum_nvidia/README.md


224 changes: 106 additions & 118 deletions bench_optimum_nvidia/bench.py
@@ -1,144 +1,132 @@
-import argparse
-import logging
+import os
 import sys
-import time
-from collections import defaultdict
-from typing import Optional
 
-import numpy as np
 import torch
 from optimum.nvidia import AutoModelForCausalLM
 from transformers import AutoTokenizer
 
-logging.getLogger("transformers").setLevel(logging.ERROR)
-logging.basicConfig(
-    stream=sys.stdout,
-    level=logging.INFO,
-    format="%(asctime)s - %(levelname)s - %(message)s",
-)
-
-
-def log_and_print(message: str) -> None:
-    print(message)
-    logging.info(message)
+sys.path.append("/mnt")
+sys.path.append("/mnt/benchmarks/")
+
+from common.base import BaseBenchmarkClass  # noqa
+from common.utils import launch_cli, make_report  # noqa


-class LlamaOptimumNvidiaBenchmark:
+class OptimumBenchmark(BaseBenchmarkClass):
     def __init__(
-        self, model_path: str, precision: str, device: Optional[str] = "cuda"
+        self,
+        model_path: str,
+        model_name: str,
+        benchmark_name: str,
+        precision: str,
+        device: str,
+        experiment_name: str,
     ) -> None:
-        self.model_path = model_path
-        self.precision = precision
-        self.results = []
-        self.precision_to_dtype_map = {
-            "fp16": torch.float16,
-            "fp32": torch.float32,
-        }
-
-        # some of the conditions where things can not be supported
-        assert precision in ["fp16", "fp32"], ValueError(
-            "Supported precisions are: 'fp16', 'fp32'"
-        )
-        assert device in ["cuda"], ValueError("Supported devices are: 'cuda'")
-
-        self.model_args = {
-            "torch_dtype": self.precision_to_dtype_map[self.precision],
-        }
-        self.device = device
+        assert precision in ["float32", "float16"], ValueError(
+            "Supported precision: 'float32' and 'float16'"
+        )
+        super().__init__(
+            model_name=model_name,
+            model_path=model_path,
+            benchmark_name=benchmark_name,
+            experiment_name=experiment_name,
+            precision=precision,
+            device=device,
+            root_folder="/mnt/benchmarks",
+        )
+
+        if model_name == "llama":
+            self.tokenizer_folder = os.path.join(
+                self.root_folder, "models", "llama-2-7b-chat-hf"
+            )
+        else:
+            self.tokenizer_folder = os.path.join(
+                self.root_folder, "models", "mistral-7b-v0.1-instruct-hf"
+            )

-    def load_model(self):
-        """Loads the model into various formats and device"""
-        self.model = AutoModelForCausalLM.from_pretrained(
-            self.model_path, **self.model_args
-        )
-
-        # Hardcoding this for now.
-        self.tokenizer = AutoTokenizer.from_pretrained("/mnt/models/llama-2-7b-hf")
+    def load_model_and_tokenizer(self):
+        dtype_mapper = {"float16": torch.float16, "float32": torch.float32}
+        self.model = AutoModelForCausalLM.from_pretrained(
+            pretrained_model_name_or_path=self.model_path,
+            torch_dtype=dtype_mapper[self.precision],
+        )
+
+        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder)
         return self

-    def run_model(self, prompt: str, max_tokens: int) -> float:
-        tokenized_input = self.tokenizer.encode(prompt, return_tensors="pt").to(
-            self.device
-        )
-        start = time.time()
-        generated = self.model.generate(
-            input_ids=tokenized_input, max_new_tokens=max_tokens
-        )[0]
-        delta = time.time() - start
-
-        output = generated.detach().cpu().numpy()
-        decoded = self.tokenizer.decode(output[0][0], skip_special_tokens=True)
-        return len(self.tokenizer.encode(decoded)) / delta
-
-    def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
-        for i in range(repetitions):
-            log_and_print(
-                f"Running repetition [{str(i+1).zfill(len(str(repetitions)))}/{repetitions}]"
-            )
-            tokens_per_second = self.run_model(prompt, max_tokens)
-            self.results.append(tokens_per_second)
-        del self.model
-        if self.device == "cuda":
-            torch.cuda.synchronize()
+    def preprocess(self, prompt: str, chat_mode: bool = True, for_benchmarks=True):
+        if chat_mode:
+            template = self.get_chat_template_with_instruction(
+                prompt=prompt, for_benchmarks=for_benchmarks
+            )
+            prompt = self.tokenizer.apply_chat_template(template, tokenize=False)
+
+        tokenized_input = self.tokenizer.encode(text=prompt)
+        tensor = self.tokenizer(prompt, return_tensors="pt")
+        return {
+            "prompt": prompt,
+            "input_tokens": tokenized_input,
+            "tensor": tensor,
+            "num_input_tokens": len(tokenized_input),
+        }
+
+    def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict:
+        tensor = inputs["tensor"]
+        num_input_tokens = inputs["num_input_tokens"]
+
+        generated, _ = self.model.generate(
+            **tensor,
+            top_k=40,
+            top_p=0.1,
+            pad_token_id=self.tokenizer.eos_token_id,
+            eos_token_id=self.tokenizer.eos_token_id,
+            temperature=temperature,
+            max_new_tokens=max_tokens,
+        )
+
+        output_tokens = generated[0].detach().tolist()[num_input_tokens:]
+        return {"output_tokens": output_tokens, "num_output_tokens": len(output_tokens)}
+
+    def postprocess(self, output: dict) -> str:
+        output_tokens = output["output_tokens"]
+        output_text = self.tokenizer.decode(output_tokens, skip_special_tokens=True)
+        return output_text
+
+    def on_exit(self):
+        if self.device == "cuda:0":
+            del self.model
+            torch.cuda.synchronize()
+        else:
+            del self.model

 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Nvidia Optimum Benchmark.")
-    parser.add_argument(
-        "--prompt",
-        type=str,
-        help="The prompt for the model.",
-    )
-    parser.add_argument("--max_tokens", type=int, help="The maximum number of tokens.")
-    parser.add_argument(
-        "--repetitions",
-        type=int,
-        help="The number of repetitions for the benchmark.",
-    )
-    parser.add_argument(
-        "--device",
-        help="Device to use for the benchmark.",
-    )
-    parser.add_argument(
-        "--log_file",
-        type=str,
-        help="Path to the log file for writing logs (in append mode).",
-    )
-    parser.add_argument(
-        "--models_dir",
-        type=str,
-        help="Path to the models directory.",
-    )
+    parser = launch_cli(description="HF-Optimum Nvidia Benchmark.")
     args = parser.parse_args()
-    log_and_print(
-        f"Running benchmark with: max_tokens={args.max_tokens} prompt={args.prompt} "
-        + f"repetitions={args.repetitions} device={args.device}"
-    )
-    report = defaultdict(lambda: defaultdict(float))
 
-    for precision in ("fp16", "fp32"):
-        log_and_print(f"Running Optimum-Nvidia on Llama with precision: {precision}")
-        llama_transformers_pytorch_benchmark = LlamaOptimumNvidiaBenchmark(
-            model_path=args.models_dir,
-            device=args.device,
-            precision=precision,
-        ).load_model()
-        llama_transformers_pytorch_benchmark.benchmark(
-            max_tokens=args.max_tokens, prompt=args.prompt, repetitions=args.repetitions
-        )
-
-        report["llama_optimum_nvidia"][precision] = {
-            "mean": np.mean(llama_transformers_pytorch_benchmark.results),
-            "std": np.std(llama_transformers_pytorch_benchmark.results),
-        }
-    log_and_print("Benchmark Report")
-    with open(args.log_file, "a") as file:
-        for framework, quantizations in report.items():
-            for quantization, stats in quantizations.items():
-                log_and_print(
-                    f"{framework}, {quantization}: {stats['mean']:.2f} ± {stats['std']:.2f}"
-                )
-                print(
-                    f"{framework}, {quantization}: {stats['mean']:.2f} ± {stats['std']:.2f}",
-                    file=file,
-                )
+    model_folder = "/mnt/benchmarks/models"
+    model_name = (
+        f"{args.model_name}-2-7b-chat-optimum"
+        if args.model_name == "llama"
+        else f"{args.model_name}-7b-v0.1-instruct-optimum"
+    )
+
+    runner_dict = {
+        "cuda": [
+            {
+                "precision": "float32",
+                "model_path": os.path.join(model_folder, model_name + "-float32"),
+            },
+            {
+                "precision": "float16",
+                "model_path": os.path.join(model_folder, model_name + "-float16"),
+            },
+        ]
+    }
+
+    make_report(
+        args=args,
+        benchmark_class=OptimumBenchmark,
+        runner_dict=runner_dict,
+        benchmark_name="HF-Optimum Nvidia",
+        is_bench_pytorch=False,
+    )
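
For orientation, here is a minimal sketch of how the refactored class would be driven once the CLI arguments are parsed. `make_report` normally orchestrates this loop over `runner_dict`; the model path, experiment name, prompt, and generation settings below are illustrative assumptions, not values from the commit:

    # Hypothetical driver flow for OptimumBenchmark (normally run by make_report).
    bench = OptimumBenchmark(
        model_path="/mnt/benchmarks/models/llama-2-7b-chat-optimum-float16",  # assumed path
        model_name="llama",
        benchmark_name="HF-Optimum Nvidia",
        precision="float16",
        device="cuda",
        experiment_name="optimum-nvidia-float16",  # assumed name
    ).load_model_and_tokenizer()

    # preprocess -> run_model -> postprocess, mirroring the methods in the diff above.
    inputs = bench.preprocess(prompt="What is quantization?", chat_mode=True)
    result = bench.run_model(inputs, max_tokens=128, temperature=0.1)
    print(bench.postprocess(result))

    bench.on_exit()  # frees the model (and synchronizes CUDA when device is "cuda:0")

The old script's hand-rolled timing loop, logging, and report dictionary disappear because `BaseBenchmarkClass` and `make_report` are assumed to own repetition, timing, and aggregation; after this commit the file only defines how to load, tokenize, generate, and decode.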
