From 7197a71e7d1b6cdbbd6f52c9d5cfb698b3162b3c Mon Sep 17 00:00:00 2001 From: nsosio Date: Fri, 17 Nov 2023 13:24:42 +0000 Subject: [PATCH 1/7] added onnxruntime --- README.md | 3 ++- bench.py | 15 +++++++++++++++ python_bench/onnx_bench.py | 35 +++++++++++++++++++++++++++++++++++ requirements.txt | 1 + setup.sh | 19 +++++++++++++++++++ 5 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 python_bench/onnx_bench.py diff --git a/README.md b/README.md index 6e12d2cf..d0c6108a 100644 --- a/README.md +++ b/README.md @@ -81,8 +81,9 @@ Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device gpu --nvidia | llama.cpp | - | - | 67.64 ± 22.57| 106.21 ± 2.21| | ctranslate | - | 58.54 ± 13.24| 34.22 ± 6.29 | - | | tinygrad | - | 20.13 ± 1.35 | - | - | +| onnx | - | 50.50 ± 3.58 | - | - | -*(data updated: 15th November 2023) +*(data updated: 17th November 2023) ### M2 MAX 32GB Inference Bench: diff --git a/bench.py b/bench.py index ca316af5..d9610ee8 100644 --- a/bench.py +++ b/bench.py @@ -7,6 +7,7 @@ from python_bench.ctranslate import CTranslateBenchmark, get_compute_types from python_bench.llama_cpp import LlamaCPPBenchmark +from python_bench.onnx_bench import ONNXBenchmark from python_bench.tinygrad import TinyGradBenchmark logging.basicConfig( @@ -57,6 +58,20 @@ + f"repetitions={args.repetitions} gpu={args.gpu} nvidia={args.gpu}" ) report = defaultdict(lambda: defaultdict(float)) + + logging.info("Running onnx benchmark") + onnx_bench = ONNXBenchmark( + "./models/llama-2-7b-onnx", + device="CPU" if not args.gpu else "GPU", + ).load_model() + onnx_bench.benchmark( + max_tokens=args.max_tokens, prompt=args.prompt, repetitions=args.repetitions + ) + report["onnx"]["float16"] = { + "mean": np.mean(onnx_bench.results), + "std": np.std(onnx_bench.results), + } + for quantize in ("Q8_0", "Q4_0"): logging.info(f"Running llama-cpp benchmark with {quantize}") llamacpp_bench = LlamaCPPBenchmark( diff --git a/python_bench/onnx_bench.py b/python_bench/onnx_bench.py new file mode 100644 index 00000000..8c5749ef --- /dev/null +++ b/python_bench/onnx_bench.py @@ -0,0 +1,35 @@ +import time + +from optimum.onnxruntime import ORTModelForCausalLM +from transformers import AutoTokenizer + +from python_bench.benchmark import Benchmark + + +class ONNXBenchmark(Benchmark): + def __init__(self, model_path, device="CPU"): + super().__init__(model_path) + self.device = device + self.provider = ( + "CUDAExecutionProvider" if device == "GPU" else "CPUExecutionProvider" + ) + + def load_model(self) -> Benchmark: + self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) + self.model = ORTModelForCausalLM.from_pretrained( + self.model_path, + use_cache=False, + use_io_binding=False, + provider=self.provider, + ) + return self + + def run_model(self, prompt, max_tokens) -> float: + device_str = "cuda" if self.device == "GPU" else "cpu" + inputs = self.tokenizer(prompt, return_tensors="pt").to(device_str) + start = time.time() + gen_tokens = self.model.generate(**inputs, max_length=max_tokens) + tokens_per_second = (gen_tokens.shape[1] - inputs["input_ids"].shape[1]) / ( + time.time() - start + ) + return tokens_per_second diff --git a/requirements.txt b/requirements.txt index b37ebfcc..ad09f117 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ ctranslate2==3.20.0 huggingface-hub==0.17.3 transformers==4.35.0 torch==2.1.0 +optimum[onnxruntime-gpu]==1.14.1 # Using fixed commit (a72b3700) for tinygrad to ensure stability in benchmarking. 
# Helps maintain reproducibility and guards against potential breaking changes.
git+https://github.com/tinygrad/tinygrad.git@a72b370066837af5b4d44eeb5c4fb30aebf5c502
diff --git a/setup.sh b/setup.sh
index d8e7cf6a..e0bcb92f 100755
--- a/setup.sh
+++ b/setup.sh
@@ -16,6 +16,7 @@ BURN_MODEL_INPUT_DIR=$(pwd)/models/llama-2-7b-raw
 BURN_FOLDER=$(pwd)/rust_bench/llama2-burn
 BURN_MODEL_FOLDER=$(pwd)/models/llama-2-7b-burn
 BURN_MODEL_NAME="llama-2-7b-burn"
+LLAMA_ONNX_MODEL_DIR="./models/llama-2-7b-onnx"

 create_and_activate_venv() {
     if [ ! -d "$VENV_DIR" ]; then
@@ -81,3 +82,21 @@ if [ ! -e "$BURN_MODEL_FOLDER/$BURN_MODEL_NAME.cfg" ]; then
 else
     echo "Model llama-2-7b-burn already exists!"
 fi
+
+get_device() {
+    if command -v nvidia-smi &> /dev/null; then
+        echo "cuda"
+    else
+        echo "cpu"
+    fi
+}
+
+# Check and create llama-2-7b-onnx model
+if [ ! -d "$LLAMA_ONNX_MODEL_DIR" ]; then
+    optimum-cli export onnx \
+        --model $LLAMA_HF_MODEL_DIR --task text-generation --framework pt \
+        --opset 17 --sequence_length 1024 --batch_size 1 --device $(get_device) --fp16 \
+        $LLAMA_ONNX_MODEL_DIR > /dev/null
+else
+    echo "Model llama-2-7b-onnx already exists!"
+fi

From 21e588060e51a0cd74763cc4965d7f21f5e38276 Mon Sep 17 00:00:00 2001
From: nsosio
Date: Thu, 23 Nov 2023 09:54:56 +0100
Subject: [PATCH 2/7] updated readme

---
 README.md.template | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md.template b/README.md.template
index fc32074d..774c8d98 100644
--- a/README.md.template
+++ b/README.md.template
@@ -71,7 +71,7 @@ Model: LLAMA-2-7B

 CUDA Version: 11.7

-Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device gpu --nvidia --prompt 'Explain what is a transformer'`
+Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device cuda --prompt 'Explain what is a transformer'`

 | Engine | float32 | float16 | int8 | int4 |
 |-------------|--------------|---------------|---------------|---------------|
@@ -80,6 +80,7 @@ Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device gpu --nvidia
 | llama.cpp | - | - | 84.48 ± 3.76 | 106.76 ± 1.29 |
 | ctranslate | - | 51.38 ± 16.01 | 36.12 ± 11.93 | - |
 | tinygrad | - | 20.32 ± 0.06 | - | - |
+| onnx | - | 50.50 ± 3.58 | - | - |

 *(data updated: )

@@ -104,7 +105,7 @@ Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device cpu --prompt

 #### GPU (Metal)

-Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device gpu --prompt 'Explain what is a transformer'`
+Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device metal --prompt 'Explain what is a transformer'`

 | Engine | float32 | float16 | int8 | int4 |
 |-------------|--------------|--------------|--------------|--------------|

From 8ed2d26b984831fb77a1b404bbf08a5bfa8a7982 Mon Sep 17 00:00:00 2001
From: nsosio
Date: Thu, 23 Nov 2023 12:05:08 +0000
Subject: [PATCH 3/7] bugfixes; still not working

---
 bench_onnxruntime/bench.sh | 2 +-
 bench_onnxruntime/requirements.txt | 4 ++++
 bench_onnxruntime/setup.sh | 36 +++++++++++++++++++++++++++---
 3 files changed, 38 insertions(+), 4 deletions(-)
 mode change 100644 => 100755 bench_onnxruntime/setup.sh

diff --git a/bench_onnxruntime/bench.sh b/bench_onnxruntime/bench.sh
index 866abe41..3fcb07c0 100755
--- a/bench_onnxruntime/bench.sh
+++ b/bench_onnxruntime/bench.sh
@@ -150,5 +150,5 @@ MODELS_DIR="${MODELS_DIR:-"./models"}"

 check_platform
 check_python
-setup "$DEVICE"
+setup "$MODELS_DIR"

 run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$LOG_FILENAME" "$MODELS_DIR"
diff --git a/bench_onnxruntime/requirements.txt b/bench_onnxruntime/requirements.txt
index 9025b77d..2444ea67 100644
--- a/bench_onnxruntime/requirements.txt
+++ b/bench_onnxruntime/requirements.txt
@@ -1 +1,5 @@
+sentencepiece==0.1.99
+huggingface-hub==0.17.3
+transformers==4.35.0
+torch==2.1.0
 optimum[onnxruntime-gpu]==1.14.1
diff --git a/bench_onnxruntime/setup.sh b/bench_onnxruntime/setup.sh
old mode 100644
new mode 100755
index 987709b5..45073d96
--- a/bench_onnxruntime/setup.sh
+++ b/bench_onnxruntime/setup.sh
@@ -1,16 +1,24 @@
 #!/bin/bash

 ################################################################################
-# Script: setup.sh
+# Script: setup.sh <models_folder>
 # Description: Automates the setup of a virtual environment and installs project
-# requirements.
+# requirements and handles model conversion.
 ################################################################################

 set -euo pipefail

+if [ "$#" -ne 1 ]; then
+    echo "Usage: $0 <models_folder>"
+    exit 1
+fi
+
 # Define directory paths
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 VENV_DIR="$SCRIPT_DIR/venv"
+MODELS_FOLDER="$1"
+LLAMA_HF_MODEL_DIR="$MODELS_FOLDER/llama-2-7b-hf"
+LLAMA_ONNX_MODEL_DIR="$MODELS_FOLDER/llama-2-7b-onnx"

 if [ ! -d "$VENV_DIR" ]; then
     python -m venv "$VENV_DIR"
@@ -18,8 +26,30 @@ if [ ! -d "$VENV_DIR" ]; then
     # shellcheck disable=SC1091
     source "$VENV_DIR/bin/activate"
     pip install --upgrade pip > /dev/null
-    pip install -r requirements.txt > /dev/null
+    pip install -r "$SCRIPT_DIR"/requirements.txt
 else
     # shellcheck disable=SC1091
     source "$VENV_DIR/bin/activate"
 fi
+
+get_device() {
+    if command -v nvidia-smi &> /dev/null; then
+        echo "cuda"
+    else
+        echo "cpu"
+    fi
+}
+
+# Check and create llama-2-7b-onnx model
+if [ ! -d "$LLAMA_ONNX_MODEL_DIR" ]; then
+    echo "optimum-cli export onnx \
+        --model $LLAMA_HF_MODEL_DIR --task text-generation --framework pt \
+        --opset 17 --sequence_length 1024 --batch_size 1 --device $(get_device) --fp16 \
+        $LLAMA_ONNX_MODEL_DIR"
+    optimum-cli export onnx \
+        --model "$LLAMA_HF_MODEL_DIR" --task text-generation --framework pt \
+        --opset 17 --sequence_length 1024 --batch_size 1 --device "$(get_device)" --fp16 \
+        "$LLAMA_ONNX_MODEL_DIR"
+else
+    echo "Model llama-2-7b-onnx already exists!"
+fi From df9958838aad5660cb2fa3076ca1eff9327e3817 Mon Sep 17 00:00:00 2001 From: nsosio Date: Thu, 23 Nov 2023 16:03:57 +0000 Subject: [PATCH 4/7] bugfixes --- README.md.template | 2 +- bench_onnxruntime/bench.py | 1 + bench_onnxruntime/bench.sh | 3 +++ bench_onnxruntime/requirements.txt | 7 ++----- bench_onnxruntime/setup.sh | 8 ++------ 5 files changed, 9 insertions(+), 12 deletions(-) diff --git a/README.md.template b/README.md.template index 774c8d98..7f893179 100644 --- a/README.md.template +++ b/README.md.template @@ -80,7 +80,7 @@ Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device cuda --promp | llama.cpp | - | - | 84.48 ± 3.76 | 106.76 ± 1.29 | | ctranslate | - | 51.38 ± 16.01 | 36.12 ± 11.93 | - | | tinygrad | - | 20.32 ± 0.06 | - | - | -| onnx | - | 50.50 ± 3.58 | - | - | +| onnx | - | 54.16 ± 3.15 | - | - | *(data updated: ) diff --git a/bench_onnxruntime/bench.py b/bench_onnxruntime/bench.py index 22d594a5..4264de26 100644 --- a/bench_onnxruntime/bench.py +++ b/bench_onnxruntime/bench.py @@ -22,6 +22,7 @@ def __init__(self, model_path, device="cpu"): self.provider = ( "CUDAExecutionProvider" if device == "cuda" else "CPUExecutionProvider" ) + self.results = [] def load_model(self): self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) diff --git a/bench_onnxruntime/bench.sh b/bench_onnxruntime/bench.sh index 3fcb07c0..bbd77b11 100755 --- a/bench_onnxruntime/bench.sh +++ b/bench_onnxruntime/bench.sh @@ -78,6 +78,9 @@ run_benchmarks() { local DEVICE="$4" local LOG_FILENAME="$5" local MODELS_DIR="$6" + + # shellcheck disable=SC1091 + source "$SCRIPT_DIR/venv/bin/activate" python "$SCRIPT_DIR"/bench.py \ --prompt "$PROMPT" \ diff --git a/bench_onnxruntime/requirements.txt b/bench_onnxruntime/requirements.txt index 2444ea67..a4fc99f4 100644 --- a/bench_onnxruntime/requirements.txt +++ b/bench_onnxruntime/requirements.txt @@ -1,5 +1,2 @@ -sentencepiece==0.1.99 -huggingface-hub==0.17.3 -transformers==4.35.0 -torch==2.1.0 -optimum[onnxruntime-gpu]==1.14.1 +torch --index-url https://download.pytorch.org/whl/cu116 +optimum[onnxruntime-gpu]==1.14 diff --git a/bench_onnxruntime/setup.sh b/bench_onnxruntime/setup.sh index 45073d96..f4d21ceb 100755 --- a/bench_onnxruntime/setup.sh +++ b/bench_onnxruntime/setup.sh @@ -26,7 +26,7 @@ if [ ! -d "$VENV_DIR" ]; then # shellcheck disable=SC1091 source "$VENV_DIR/bin/activate" pip install --upgrade pip > /dev/null - pip install -r "$SCRIPT_DIR"/requirements.txt + pip install -r "$SCRIPT_DIR"/requirements.txt > /dev/null else # shellcheck disable=SC1091 source "$VENV_DIR/bin/activate" @@ -42,14 +42,10 @@ get_device() { # Check and create llama-2-7b-onnx model if [ ! -d "$LLAMA_ONNX_MODEL_DIR" ]; then - echo "optimum-cli export onnx \ - --model $LLAMA_HF_MODEL_DIR --task text-generation --framework pt \ - --opset 17 --sequence_length 1024 --batch_size 1 --device $(get_device) --fp16 \ - $LLAMA_ONNX_MODEL_DIR" optimum-cli export onnx \ --model "$LLAMA_HF_MODEL_DIR" --task text-generation --framework pt \ --opset 17 --sequence_length 1024 --batch_size 1 --device "$(get_device)" --fp16 \ - "$LLAMA_ONNX_MODEL_DIR" + "$LLAMA_ONNX_MODEL_DIR" > /dev/null else echo "Model llama-2-7b-onnx already exists!" 
fi From 015b69f6db82c6ad02cacd08471e6ea1f80b159d Mon Sep 17 00:00:00 2001 From: nsosio Date: Thu, 23 Nov 2023 16:06:28 +0000 Subject: [PATCH 5/7] linter --- bench_onnxruntime/bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench_onnxruntime/bench.sh b/bench_onnxruntime/bench.sh index bbd77b11..0e53abbf 100755 --- a/bench_onnxruntime/bench.sh +++ b/bench_onnxruntime/bench.sh @@ -78,7 +78,7 @@ run_benchmarks() { local DEVICE="$4" local LOG_FILENAME="$5" local MODELS_DIR="$6" - + # shellcheck disable=SC1091 source "$SCRIPT_DIR/venv/bin/activate" From 0b15adf0e163d771c6bc90b61223a605f360009f Mon Sep 17 00:00:00 2001 From: nsosio Date: Thu, 23 Nov 2023 16:36:11 +0000 Subject: [PATCH 6/7] disabled cpu --- bench_onnxruntime/bench.sh | 4 ++++ bench_onnxruntime/setup.sh | 11 +---------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/bench_onnxruntime/bench.sh b/bench_onnxruntime/bench.sh index 0e53abbf..c90ca7a7 100755 --- a/bench_onnxruntime/bench.sh +++ b/bench_onnxruntime/bench.sh @@ -124,6 +124,10 @@ while [ "$#" -gt 0 ]; do echo "Metal not supported!" exit 0 fi + if [ "$DEVICE" == "cpu" ]; then + echo "cpu not supported!" + exit 0 + fi shift 2 ;; -lf|--log_file) diff --git a/bench_onnxruntime/setup.sh b/bench_onnxruntime/setup.sh index f4d21ceb..d33a443b 100755 --- a/bench_onnxruntime/setup.sh +++ b/bench_onnxruntime/setup.sh @@ -31,20 +31,11 @@ else # shellcheck disable=SC1091 source "$VENV_DIR/bin/activate" fi - -get_device() { - if command -v nvidia-smi &> /dev/null; then - echo "cuda" - else - echo "cpu" - fi -} - # Check and create llama-2-7b-onnx model if [ ! -d "$LLAMA_ONNX_MODEL_DIR" ]; then optimum-cli export onnx \ --model "$LLAMA_HF_MODEL_DIR" --task text-generation --framework pt \ - --opset 17 --sequence_length 1024 --batch_size 1 --device "$(get_device)" --fp16 \ + --opset 17 --sequence_length 1024 --batch_size 1 --device cuda --fp16 \ "$LLAMA_ONNX_MODEL_DIR" > /dev/null else echo "Model llama-2-7b-onnx already exists!" From c653ef9042b17830a5ba7de601b1ad8dedf04c9a Mon Sep 17 00:00:00 2001 From: nsosio Date: Thu, 23 Nov 2023 16:38:08 +0000 Subject: [PATCH 7/7] updated README.md --- README.md.template | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md.template b/README.md.template index 7f893179..315f5606 100644 --- a/README.md.template +++ b/README.md.template @@ -95,24 +95,26 @@ CUDA Version: NA Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device cpu --prompt 'Explain what is a transformer'` -| Engine | float32 | float16 | int8 | int4 | +| Engine | float32 | float16 | int8 | int4 | |-------------|--------------|--------------|--------------|--------------| | burn | 0.30 ± 0.09 | - | - | - | | candle | - | 3.43 ± 0.02 | - | - | | llama.cpp | - | - | 14.41 ± 1.59 | 20.96 ± 1.94 | | ctranslate | - | - | 2.11 ± 0.73 | - | | tinygrad | - | 4.21 ± 0.38 | - | - | +| onnx | - | - | - | - | #### GPU (Metal) Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device metal --prompt 'Explain what is a transformer'` -| Engine | float32 | float16 | int8 | int4 | +| Engine | float32 | float16 | int8 | int4 | |-------------|--------------|--------------|--------------|--------------| | burn | - | - | - | - | | candle | - | - | - | - | | llama.cpp | - | - | 31.24 ± 7.82 | 46.75 ± 9.55 | | ctranslate | - | - | - | - | | tinygrad | - | 29.78 ± 1.18 | - | - | +| onnx | - | - | - | - | *(data updated: )
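
---

For anyone reproducing the onnx numbers above, the pieces this series adds fit together roughly as follows. This is a minimal, self-contained sketch of the measurement loop, assuming the export step from `setup.sh` has already produced `./models/llama-2-7b-onnx`; the model path, prompt, and repetition count here mirror the benchmark command in the README and are illustrative, not part of the patches:

```python
# Sketch of the ONNX Runtime benchmark loop introduced in this series.
# Assumes `optimum-cli export onnx ... --fp16` (see setup.sh) already ran.
import time

import numpy as np
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

MODEL_DIR = "./models/llama-2-7b-onnx"  # illustrative path, as used in setup.sh

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = ORTModelForCausalLM.from_pretrained(
    MODEL_DIR,
    use_cache=False,      # the export above does not emit KV-cache outputs
    use_io_binding=False,
    provider="CUDAExecutionProvider",  # use "CPUExecutionProvider" on CPU-only hosts
)

# Tokenize once; only generation is timed, as in bench.py's run_model.
inputs = tokenizer("Explain what is a transformer", return_tensors="pt").to("cuda")

results = []
for _ in range(10):  # --repetitions 10
    start = time.time()
    gen_tokens = model.generate(**inputs, max_length=100)  # --max_tokens 100
    new_tokens = gen_tokens.shape[1] - inputs["input_ids"].shape[1]
    results.append(new_tokens / (time.time() - start))

print(f"onnx float16: {np.mean(results):.2f} ± {np.std(results):.2f} tokens/sec")
```

Note that tokens/sec is computed over newly generated tokens only (prompt tokens are subtracted), which is why the mean ± std values in the README tables are comparable across engines.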