Commit: Merge branch 'dev' into onnx-mistral
Showing 12 changed files with 552 additions and 662 deletions.
@@ -1,144 +1,132 @@
import argparse
import logging
import os
import sys
import time
from collections import defaultdict
from typing import Optional

import numpy as np
import torch
from optimum.nvidia import AutoModelForCausalLM
from transformers import AutoTokenizer

logging.getLogger("transformers").setLevel(logging.ERROR)
logging.basicConfig(
    stream=sys.stdout,
    level=print,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
sys.path.append("/mnt")
sys.path.append("/mnt/benchmarks/")

from common.base import BaseBenchmarkClass  # noqa
from common.utils import launch_cli, make_report  # noqa


def log_and_print(message: str) -> None:
    print(message)
    logging.info(message)


class LlamaOptimumNvidiaBenchmark:
class OptimumBenchmark(BaseBenchmarkClass):
    def __init__(
        self, model_path: str, precision: str, device: Optional[str] = "cuda"
        self,
        model_path: str,
        model_name: str,
        benchmark_name: str,
        precision: str,
        device: str,
        experiment_name: str,
    ) -> None:
        self.model_path = model_path
        self.precision = precision
        self.results = []
        self.precision_to_dtype_map = {
            "fp16": torch.float16,
            "fp32": torch.float32,
        }

        # some of the conditions where things can not be supported
        assert precision in ["fp16", "fp32"], ValueError(
            "Supported precisions are: fp16', 'fp32'"
        assert precision in ["float32", "float16"], ValueError(
            "Supported precision: 'float32' and 'float16'"
        )
        super().__init__(
            model_name=model_name,
            model_path=model_path,
            benchmark_name=benchmark_name,
            experiment_name=experiment_name,
            precision=precision,
            device=device,
            root_folder="/mnt/benchmarks",
        )
        assert device in ["cuda"], ValueError("Supported devices are: 'cuda'")

        self.model_args = {
            "torch_dtype": self.precision_to_dtype_map[self.precision],
        }
        self.device = device
        if model_name == "llama":
            self.tokenizer_folder = os.path.join(
                self.root_folder, "models", "llama-2-7b-chat-hf"
            )
        else:
            self.tokenizer_folder = os.path.join(
                self.root_folder, "models", "mistral-7b-v0.1-instruct-hf"
            )

    def load_model(self):
        """Loads the model into various formats and device"""
    def load_model_and_tokenizer(self):
        dtype_mapper = {"float16": torch.float16, "float32": torch.float32}
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_path, **self.model_args
            pretrained_model_name_or_path=self.model_path,
            torch_dtype=dtype_mapper[self.precision],
        )

        # Hardcoding this for now.
        self.tokenizer = AutoTokenizer.from_pretrained("/mnt/models/llama-2-7b-hf")
        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder)
        return self

    def run_model(self, prompt: str, max_tokens: int) -> float:
        tokenized_input = self.tokenizer.encode(prompt, return_tensors="pt").to(
            self.device
        )
        start = time.time()
        generated = self.model.generate(
            input_ids=tokenized_input, max_new_tokens=max_tokens
        )[0]
        delta = time.time() - start

        output = generated.detach().cpu().numpy()
        decoded = self.tokenizer.decode(output[0][0], skip_special_tokens=True)
        return len(self.tokenizer.encode(decoded)) / delta

    def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
        for i in range(repetitions):
            log_and_print(
                f"Running repetition [{str(i+1).zfill(len(str(repetitions)))}/{repetitions}]"
    def preprocess(self, prompt: str, chat_mode: bool = True, for_benchmarks=True):
        if chat_mode:
            template = self.get_chat_template_with_instruction(
                prompt=prompt, for_benchmarks=for_benchmarks
            )
            tokens_per_second = self.run_model(prompt, max_tokens)
            self.results.append(tokens_per_second)
        del self.model
        if self.device == "cuda":
            prompt = self.tokenizer.apply_chat_template(template, tokenize=False)

        tokenized_input = self.tokenizer.encode(text=prompt)
        tensor = self.tokenizer(prompt, return_tensors="pt")
        return {
            "prompt": prompt,
            "input_tokens": tokenized_input,
            "tensor": tensor,
            "num_input_tokens": len(tokenized_input),
        }

    def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict:
        tensor = inputs["tensor"]
        num_input_tokens = inputs["num_input_tokens"]

        generated, _ = self.model.generate(
            **tensor,
            top_k=40,
            top_p=0.1,
            pad_token_id=self.tokenizer.eos_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            temperature=temperature,
            max_new_tokens=max_tokens,
        )

        output_tokens = generated[0].detach().tolist()[num_input_tokens:]
        return {"output_tokens": output_tokens, "num_output_tokens": len(output_tokens)}

    def postprocess(self, output: dict) -> str:
        output_tokens = output["output_tokens"]
        output_text = self.tokenizer.decode(output_tokens, skip_special_tokens=True)
        return output_text

    def on_exit(self):
        if self.device == "cuda:0":
            del self.model
            torch.cuda.synchronize()
        else:
            del self.model


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Nvidia Optimum Benchmark.")
    parser.add_argument(
        "--prompt",
        type=str,
        help="The prompt for the model.",
    )
    parser.add_argument("--max_tokens", type=int, help="The maximum number of tokens.")
    parser.add_argument(
        "--repetitions",
        type=int,
        help="The number of repetitions for the benchmark.",
    )
    parser.add_argument(
        "--device",
        help="Device to use for the benchmark.",
    )
    parser.add_argument(
        "--log_file",
        type=str,
        help="Path to the log file for writing logs (in append mode).",
    )
    parser.add_argument(
        "--models_dir",
        type=str,
        help="Path to the models directory.",
    )
    parser = launch_cli(description="HF-Optimum Nvidia Benchmark.")
    args = parser.parse_args()
    log_and_print(
        f"Running benchmark with: max_tokens={args.max_tokens} prompt={args.prompt} "
        + f"repetitions={args.repetitions} device={args.device}"
    )
    report = defaultdict(lambda: defaultdict(float))

    for precision in ("fp16", "fp32"):
        log_and_print(f"Running Optimum-Nvidia on Llama with precision: {precision}")
        llama_transformers_pytorch_benchmark = LlamaOptimumNvidiaBenchmark(
            model_path=args.models_dir,
            device=args.device,
            precision=precision,
        ).load_model()
        llama_transformers_pytorch_benchmark.benchmark(
            max_tokens=args.max_tokens, prompt=args.prompt, repetitions=args.repetitions
        )
    model_folder = "/mnt/benchmarks/models"
    model_name = (
        f"{args.model_name}-2-7b-chat-optimum"
        if args.model_name == "llama"
        else f"{args.model_name}-7b-v0.1-instruct-optimum"
    )

        report["llama_optimum_nvidia"][precision] = {
            "mean": np.mean(llama_transformers_pytorch_benchmark.results),
            "std": np.std(llama_transformers_pytorch_benchmark.results),
        }
    log_and_print("Benchmark Report")
    with open(args.log_file, "a") as file:
        for framework, quantizations in report.items():
            for quantization, stats in quantizations.items():
                log_and_print(
                    f"{framework}, {quantization}: {stats['mean']:.2f} ± {stats['std']:.2f}"
                )
                print(
                    f"{framework}, {quantization}: {stats['mean']:.2f} ± {stats['std']:.2f}",
                    file=file,
                )
    runner_dict = {
        "cuda": [
            {
                "precision": "float32",
                "model_path": os.path.join(model_folder, model_name + "-float32"),
            },
            {
                "precision": "float16",
                "model_path": os.path.join(model_folder, model_name + "-float16"),
            },
        ]
    }

    make_report(
        args=args,
        benchmark_class=OptimumBenchmark,
        runner_dict=runner_dict,
        benchmark_name="HF-Optimum Nvidia",
        is_bench_pytorch=False,
    )
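
The refactored class leans on shared harness code (common.base and common.utils) that is not part of this diff. Read from the calls made in this file alone, the base class is assumed to expose roughly the surface sketched below; this is an inference for orientation, not the implementation that actually lives in the repository.

# Assumed shape of common.base.BaseBenchmarkClass, inferred only from how
# OptimumBenchmark uses it in this diff; the real class may differ.
class BaseBenchmarkClass:
    def __init__(self, model_name, model_path, benchmark_name, experiment_name,
                 precision, device, root_folder):
        # The subclass later reads self.precision, self.device, self.root_folder
        # and self.model_path, so the base presumably stores its arguments.
        self.model_name = model_name
        self.model_path = model_path
        self.benchmark_name = benchmark_name
        self.experiment_name = experiment_name
        self.precision = precision
        self.device = device
        self.root_folder = root_folder

    def get_chat_template_with_instruction(self, prompt: str, for_benchmarks: bool):
        # Fed straight into tokenizer.apply_chat_template(..., tokenize=False),
        # so it is assumed to return a list of chat messages.
        return [{"role": "user", "content": prompt}]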
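
Putting the pieces together, make_report from common.utils presumably constructs OptimumBenchmark once per runner_dict entry and drives its lifecycle. Below is a minimal manual walk-through of that assumed sequence; the model path, prompt, and sampling settings are illustrative placeholders, not values taken from this commit.

# Hypothetical manual run of the lifecycle the harness is assumed to drive:
# construct -> load -> preprocess -> run -> postprocess -> clean up.
bench = OptimumBenchmark(
    model_path="/mnt/benchmarks/models/llama-2-7b-chat-optimum-float16",
    model_name="llama",
    benchmark_name="HF-Optimum Nvidia",
    precision="float16",
    device="cuda",
    experiment_name="optimum-nvidia-demo",
).load_model_and_tokenizer()

inputs = bench.preprocess(prompt="What is post-training quantization?", chat_mode=True)
output = bench.run_model(inputs, max_tokens=128, temperature=0.1)
print(bench.postprocess(output))  # decoded completion text
bench.on_exit()                   # frees the model; syncs CUDA when device is "cuda:0"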