added onnxruntime
nsosio committed Nov 17, 2023 · 1 parent b32365e · commit 7197a71
Showing 5 changed files with 72 additions and 1 deletion.
3 changes: 2 additions & 1 deletion README.md
@@ -81,8 +81,9 @@ Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device gpu --nvidia`
| llama.cpp  | -            | -             | 67.64 ± 22.57 | 106.21 ± 2.21 |
| ctranslate | -            | 58.54 ± 13.24 | 34.22 ± 6.29  | -             |
| tinygrad   | -            | 20.13 ± 1.35  | -             | -             |
+ | onnx     | -            | 50.50 ± 3.58  | -             | -             |

- *(data updated: 15th November 2023)
+ *(data updated: 17th November 2023)


### M2 MAX 32GB Inference Bench:
15 changes: 15 additions & 0 deletions bench.py
@@ -7,6 +7,7 @@

from python_bench.ctranslate import CTranslateBenchmark, get_compute_types
from python_bench.llama_cpp import LlamaCPPBenchmark
+ from python_bench.onnx_bench import ONNXBenchmark
from python_bench.tinygrad import TinyGradBenchmark

logging.basicConfig(
@@ -57,6 +58,20 @@
+ f"repetitions={args.repetitions} gpu={args.gpu} nvidia={args.gpu}"
)
report = defaultdict(lambda: defaultdict(float))

logging.info("Running onnx benchmark")
onnx_bench = ONNXBenchmark(
"./models/llama-2-7b-onnx",
device="CPU" if not args.gpu else "GPU",
).load_model()
onnx_bench.benchmark(
max_tokens=args.max_tokens, prompt=args.prompt, repetitions=args.repetitions
)
report["onnx"]["float16"] = {
"mean": np.mean(onnx_bench.results),
"std": np.std(onnx_bench.results),
}

for quantize in ("Q8_0", "Q4_0"):
    logging.info(f"Running llama-cpp benchmark with {quantize}")
    llamacpp_bench = LlamaCPPBenchmark(
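The block above drives the new backend through the repo's shared Benchmark interface (python_bench/benchmark.py, untouched by this commit). For readers without the rest of the source, here is a rough sketch of what that base class presumably provides, inferred from how bench.py chains load_model(), calls benchmark(...), and then reads .results — the actual implementation may differ:

import logging


class Benchmark:
    """Hypothetical reconstruction of python_bench/benchmark.py."""

    def __init__(self, model_path):
        self.model_path = model_path
        self.results = []  # tokens/sec per repetition; aggregated by bench.py

    def load_model(self) -> "Benchmark":
        raise NotImplementedError

    def run_model(self, prompt, max_tokens) -> float:
        raise NotImplementedError

    def benchmark(self, max_tokens, prompt, repetitions):
        # Run the model repeatedly, collecting one throughput sample per run.
        for i in range(repetitions):
            logging.info(f"Running repetition [{i + 1}/{repetitions}]")
            self.results.append(self.run_model(prompt, max_tokens))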
35 changes: 35 additions & 0 deletions python_bench/onnx_bench.py
@@ -0,0 +1,35 @@
import time

from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

from python_bench.benchmark import Benchmark


class ONNXBenchmark(Benchmark):
    def __init__(self, model_path, device="CPU"):
        super().__init__(model_path)
        self.device = device
        # Map the benchmark's device flag to an ONNX Runtime execution provider.
        self.provider = (
            "CUDAExecutionProvider" if device == "GPU" else "CPUExecutionProvider"
        )

    def load_model(self) -> Benchmark:
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        self.model = ORTModelForCausalLM.from_pretrained(
            self.model_path,
            use_cache=False,
            use_io_binding=False,
            provider=self.provider,
        )
        return self

    def run_model(self, prompt, max_tokens) -> float:
        device_str = "cuda" if self.device == "GPU" else "cpu"
        inputs = self.tokenizer(prompt, return_tensors="pt").to(device_str)
        start = time.time()
        gen_tokens = self.model.generate(**inputs, max_length=max_tokens)
        # Count only newly generated tokens: max_length includes the prompt,
        # so subtract the input length before dividing by elapsed time.
        tokens_per_second = (gen_tokens.shape[1] - inputs["input_ids"].shape[1]) / (
            time.time() - start
        )
        return tokens_per_second
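For quick checks outside the harness, the class can also be driven directly. A minimal standalone sketch, assuming the ONNX export produced by setup.sh already exists at ./models/llama-2-7b-onnx (the prompt string is purely illustrative):

import numpy as np

from python_bench.onnx_bench import ONNXBenchmark

bench = ONNXBenchmark("./models/llama-2-7b-onnx", device="GPU").load_model()
bench.benchmark(max_tokens=100, prompt="The capital of France is", repetitions=10)
# Matches the mean/std aggregation bench.py writes into its report.
print(f"onnx: {np.mean(bench.results):.2f} ± {np.std(bench.results):.2f} tokens/sec")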
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,6 +4,7 @@ ctranslate2==3.20.0
huggingface-hub==0.17.3
transformers==4.35.0
torch==2.1.0
+ optimum[onnxruntime-gpu]==1.14.1
# Using fixed commit (a72b3700) for tinygrad to ensure stability in benchmarking.
# Helps maintain reproducibility and guards against potential breaking changes.
git+https://github.com/tinygrad/tinygrad.git@a72b370066837af5b4d44eeb5c4fb30aebf5c502
19 changes: 19 additions & 0 deletions setup.sh
@@ -16,6 +16,7 @@ BURN_MODEL_INPUT_DIR=$(pwd)/models/llama-2-7b-raw
BURN_FOLDER=$(pwd)/rust_bench/llama2-burn
BURN_MODEL_FOLDER=$(pwd)/models/llama-2-7b-burn
BURN_MODEL_NAME="llama-2-7b-burn"
+ LLAMA_ONNX_MODEL_DIR="./models/llama-2-7b-onnx"

create_and_activate_venv() {
    if [ ! -d "$VENV_DIR" ]; then
@@ -81,3 +82,21 @@ if [ ! -e "$BURN_MODEL_FOLDER/$BURN_MODEL_NAME.cfg" ]; then
else
    echo "Model llama-2-7b-burn already exists!"
fi

+ get_device() {
+     if command -v nvidia-smi &> /dev/null; then
+         echo "cuda"
+     else
+         echo "cpu"
+     fi
+ }

+ # Check and create the llama-2-7b-onnx model
+ if [ ! -d "$LLAMA_ONNX_MODEL_DIR" ]; then
+     optimum-cli export onnx \
+         --model $LLAMA_HF_MODEL_DIR --task text-generation --framework pt \
+         --opset 17 --sequence_length 1024 --batch_size 1 --device $(get_device) --fp16 \
+         $LLAMA_ONNX_MODEL_DIR > /dev/null
+ else
+     echo "Model llama-2-7b-onnx already exists!"
+ fi
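The same export can also be done from Python rather than optimum-cli — a hedged sketch using optimum's export-on-load path, assuming $LLAMA_HF_MODEL_DIR (defined earlier in setup.sh, outside this hunk) points at a local Hugging Face checkpoint. Note that the CLI's --fp16 flag generally requires a CUDA device, which get_device selects only when nvidia-smi is available:

from optimum.onnxruntime import ORTModelForCausalLM

# export=True converts the PyTorch checkpoint to ONNX on load.
# "models/llama-2-7b-hf" is an assumed stand-in for $LLAMA_HF_MODEL_DIR.
model = ORTModelForCausalLM.from_pretrained(
    "models/llama-2-7b-hf",
    export=True,
    use_cache=False,  # mirrors how ONNXBenchmark later loads the model
)
model.save_pretrained("models/llama-2-7b-onnx")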
