From 7197a71e7d1b6cdbbd6f52c9d5cfb698b3162b3c Mon Sep 17 00:00:00 2001 From: nsosio Date: Fri, 17 Nov 2023 13:24:42 +0000 Subject: [PATCH 1/7] added onnxruntime --- README.md | 3 ++- bench.py | 15 +++++++++++++++ python_bench/onnx_bench.py | 35 +++++++++++++++++++++++++++++++++++ requirements.txt | 1 + setup.sh | 19 +++++++++++++++++++ 5 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 python_bench/onnx_bench.py diff --git a/README.md b/README.md index 6e12d2cf..d0c6108a 100644 --- a/README.md +++ b/README.md @@ -81,8 +81,9 @@ Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device gpu --nvidia | llama.cpp | - | - | 67.64 ± 22.57| 106.21 ± 2.21| | ctranslate | - | 58.54 ± 13.24| 34.22 ± 6.29 | - | | tinygrad | - | 20.13 ± 1.35 | - | - | +| onnx | - | 50.50 ± 3.58 | - | - | -*(data updated: 15th November 2023) +*(data updated: 17th November 2023) ### M2 MAX 32GB Inference Bench: diff --git a/bench.py b/bench.py index ca316af5..d9610ee8 100644 --- a/bench.py +++ b/bench.py @@ -7,6 +7,7 @@ from python_bench.ctranslate import CTranslateBenchmark, get_compute_types from python_bench.llama_cpp import LlamaCPPBenchmark +from python_bench.onnx_bench import ONNXBenchmark from python_bench.tinygrad import TinyGradBenchmark logging.basicConfig( @@ -57,6 +58,20 @@ + f"repetitions={args.repetitions} gpu={args.gpu} nvidia={args.gpu}" ) report = defaultdict(lambda: defaultdict(float)) + + logging.info("Running onnx benchmark") + onnx_bench = ONNXBenchmark( + "./models/llama-2-7b-onnx", + device="CPU" if not args.gpu else "GPU", + ).load_model() + onnx_bench.benchmark( + max_tokens=args.max_tokens, prompt=args.prompt, repetitions=args.repetitions + ) + report["onnx"]["float16"] = { + "mean": np.mean(onnx_bench.results), + "std": np.std(onnx_bench.results), + } + for quantize in ("Q8_0", "Q4_0"): logging.info(f"Running llama-cpp benchmark with {quantize}") llamacpp_bench = LlamaCPPBenchmark( diff --git a/python_bench/onnx_bench.py b/python_bench/onnx_bench.py new file mode 100644 index 00000000..8c5749ef --- /dev/null +++ b/python_bench/onnx_bench.py @@ -0,0 +1,35 @@ +import time + +from optimum.onnxruntime import ORTModelForCausalLM +from transformers import AutoTokenizer + +from python_bench.benchmark import Benchmark + + +class ONNXBenchmark(Benchmark): + def __init__(self, model_path, device="CPU"): + super().__init__(model_path) + self.device = device + self.provider = ( + "CUDAExecutionProvider" if device == "GPU" else "CPUExecutionProvider" + ) + + def load_model(self) -> Benchmark: + self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) + self.model = ORTModelForCausalLM.from_pretrained( + self.model_path, + use_cache=False, + use_io_binding=False, + provider=self.provider, + ) + return self + + def run_model(self, prompt, max_tokens) -> float: + device_str = "cuda" if self.device == "GPU" else "cpu" + inputs = self.tokenizer(prompt, return_tensors="pt").to(device_str) + start = time.time() + gen_tokens = self.model.generate(**inputs, max_length=max_tokens) + tokens_per_second = (gen_tokens.shape[1] - inputs["input_ids"].shape[1]) / ( + time.time() - start + ) + return tokens_per_second diff --git a/requirements.txt b/requirements.txt index b37ebfcc..ad09f117 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ ctranslate2==3.20.0 huggingface-hub==0.17.3 transformers==4.35.0 torch==2.1.0 +optimum[onnxruntime-gpu]==1.14.1 # Using fixed commit (a72b3700) for tinygrad to ensure stability in benchmarking. 
# Helps maintain reproducibility and guards against potential breaking changes.
git+https://github.com/tinygrad/tinygrad.git@a72b370066837af5b4d44eeb5c4fb30aebf5c502
diff --git a/setup.sh b/setup.sh
index d8e7cf6a..e0bcb92f 100755
--- a/setup.sh
+++ b/setup.sh
@@ -16,6 +16,7 @@ BURN_MODEL_INPUT_DIR=$(pwd)/models/llama-2-7b-raw
 BURN_FOLDER=$(pwd)/rust_bench/llama2-burn
 BURN_MODEL_FOLDER=$(pwd)/models/llama-2-7b-burn
 BURN_MODEL_NAME="llama-2-7b-burn"
+LLAMA_ONNX_MODEL_DIR="./models/llama-2-7b-onnx"

 create_and_activate_venv() {
     if [ ! -d "$VENV_DIR" ]; then
@@ -81,3 +82,21 @@ if [ ! -e "$BURN_MODEL_FOLDER/$BURN_MODEL_NAME.cfg" ]; then
 else
     echo "Model llama-2-7b-burn already exists!"
 fi
+
+get_device() {
+    if command -v nvidia-smi &> /dev/null; then
+        echo "cuda"
+    else
+        echo "cpu"
+    fi
+}
+
+# Check and create llama-2-7b-onnx model
+if [ ! -d "$LLAMA_ONNX_MODEL_DIR" ]; then
+    optimum-cli export onnx \
+        --model $LLAMA_HF_MODEL_DIR --task text-generation --framework pt \
+        --opset 17 --sequence_length 1024 --batch_size 1 --device $(get_device) --fp16 \
+        $LLAMA_ONNX_MODEL_DIR > /dev/null
+else
+    echo "Model llama-2-7b-onnx already exists!"
+fi

From 21e588060e51a0cd74763cc4965d7f21f5e38276 Mon Sep 17 00:00:00 2001
From: nsosio
Date: Thu, 23 Nov 2023 09:54:56 +0100
Subject: [PATCH 2/7] updated readme

---
 README.md.template | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md.template b/README.md.template
index fc32074d..774c8d98 100644
--- a/README.md.template
+++ b/README.md.template
@@ -71,7 +71,7 @@ Model: LLAMA-2-7B

 CUDA Version: 11.7

-Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device gpu --nvidia --prompt 'Explain what is a transformer'`
+Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device cuda --prompt 'Explain what is a transformer'`

 | Engine | float32 | float16 | int8 | int4 |
 |-------------|--------------|---------------|---------------|---------------|
@@ -80,6 +80,7 @@ Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device gpu --nvidia
 | llama.cpp | - | - | 84.48 ± 3.76 | 106.76 ± 1.29 |
 | ctranslate | - | 51.38 ± 16.01 | 36.12 ± 11.93 | - |
 | tinygrad | - | 20.32 ± 0.06 | - | - |
+| onnx | - | 50.50 ± 3.58 | - | - |

 *(data updated: )

@@ -104,7 +105,7 @@ Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device cpu --prompt

 #### GPU (Metal)

-Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device gpu --prompt 'Explain what is a transformer'`
+Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device metal --prompt 'Explain what is a transformer'`

 | Engine | float32 | float16 | int8 | int4 |
 |-------------|--------------|--------------|--------------|--------------|

From 8ed2d26b984831fb77a1b404bbf08a5bfa8a7982 Mon Sep 17 00:00:00 2001
From: nsosio
Date: Thu, 23 Nov 2023 12:05:08 +0000
Subject: [PATCH 3/7] bugfixes; still not working

---
 bench_onnxruntime/bench.sh | 2 +-
 bench_onnxruntime/requirements.txt | 4 ++++
 bench_onnxruntime/setup.sh | 36 +++++++++++++++++++++++++++---
 3 files changed, 38 insertions(+), 4 deletions(-)
 mode change 100644 => 100755 bench_onnxruntime/setup.sh

diff --git a/bench_onnxruntime/bench.sh b/bench_onnxruntime/bench.sh
index 866abe41..3fcb07c0 100755
--- a/bench_onnxruntime/bench.sh
+++ b/bench_onnxruntime/bench.sh
@@ -150,5 +150,5 @@ MODELS_DIR="${MODELS_DIR:-"./models"}"

 check_platform
 check_python
-setup "$DEVICE"
+setup "$MODELS_DIR"

 run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$LOG_FILENAME" "$MODELS_DIR"
diff --git a/bench_onnxruntime/requirements.txt b/bench_onnxruntime/requirements.txt
index 9025b77d..2444ea67 100644
--- a/bench_onnxruntime/requirements.txt
+++ b/bench_onnxruntime/requirements.txt
@@ -1 +1,5 @@
+sentencepiece==0.1.99
+huggingface-hub==0.17.3
+transformers==4.35.0
+torch==2.1.0
 optimum[onnxruntime-gpu]==1.14.1
diff --git a/bench_onnxruntime/setup.sh b/bench_onnxruntime/setup.sh
old mode 100644
new mode 100755
index 987709b5..45073d96
--- a/bench_onnxruntime/setup.sh
+++ b/bench_onnxruntime/setup.sh
@@ -1,16 +1,24 @@
 #!/bin/bash

 ################################################################################
-# Script: setup.sh
+# Script: setup.sh <models_folder>
 # Description: Automates the setup of a virtual environment and installs project
-# requirements.
+# requirements and handles model conversion.
 ################################################################################

 set -euo pipefail

+if [ "$#" -ne 1 ]; then
+    echo "Usage: $0 <models_folder>"
+    exit 1
+fi
+
 # Define directory paths
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 VENV_DIR="$SCRIPT_DIR/venv"
+MODELS_FOLDER="$1"
+LLAMA_HF_MODEL_DIR="$MODELS_FOLDER/llama-2-7b-hf"
+LLAMA_ONNX_MODEL_DIR="$MODELS_FOLDER/llama-2-7b-onnx"

 if [ ! -d "$VENV_DIR" ]; then
     python -m venv "$VENV_DIR"
@@ -18,8 +26,30 @@ if [ ! -d "$VENV_DIR" ]; then
     # shellcheck disable=SC1091
     source "$VENV_DIR/bin/activate"
     pip install --upgrade pip > /dev/null
-    pip install -r requirements.txt > /dev/null
+    pip install -r "$SCRIPT_DIR"/requirements.txt
 else
     # shellcheck disable=SC1091
     source "$VENV_DIR/bin/activate"
 fi
+
+get_device() {
+    if command -v nvidia-smi &> /dev/null; then
+        echo "cuda"
+    else
+        echo "cpu"
+    fi
+}
+
+# Check and create llama-2-7b-onnx model
+if [ ! -d "$LLAMA_ONNX_MODEL_DIR" ]; then
+    echo "optimum-cli export onnx \
+        --model $LLAMA_HF_MODEL_DIR --task text-generation --framework pt \
+        --opset 17 --sequence_length 1024 --batch_size 1 --device $(get_device) --fp16 \
+        $LLAMA_ONNX_MODEL_DIR"
+    optimum-cli export onnx \
+        --model "$LLAMA_HF_MODEL_DIR" --task text-generation --framework pt \
+        --opset 17 --sequence_length 1024 --batch_size 1 --device "$(get_device)" --fp16 \
+        "$LLAMA_ONNX_MODEL_DIR"
+else
+    echo "Model llama-2-7b-onnx already exists!"
+fi From df9958838aad5660cb2fa3076ca1eff9327e3817 Mon Sep 17 00:00:00 2001 From: nsosio Date: Thu, 23 Nov 2023 16:03:57 +0000 Subject: [PATCH 4/7] bugfixes --- README.md.template | 2 +- bench_onnxruntime/bench.py | 1 + bench_onnxruntime/bench.sh | 3 +++ bench_onnxruntime/requirements.txt | 7 ++----- bench_onnxruntime/setup.sh | 8 ++------ 5 files changed, 9 insertions(+), 12 deletions(-) diff --git a/README.md.template b/README.md.template index 774c8d98..7f893179 100644 --- a/README.md.template +++ b/README.md.template @@ -80,7 +80,7 @@ Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device cuda --promp | llama.cpp | - | - | 84.48 ± 3.76 | 106.76 ± 1.29 | | ctranslate | - | 51.38 ± 16.01 | 36.12 ± 11.93 | - | | tinygrad | - | 20.32 ± 0.06 | - | - | -| onnx | - | 50.50 ± 3.58 | - | - | +| onnx | - | 54.16 ± 3.15 | - | - | *(data updated: ) diff --git a/bench_onnxruntime/bench.py b/bench_onnxruntime/bench.py index 22d594a5..4264de26 100644 --- a/bench_onnxruntime/bench.py +++ b/bench_onnxruntime/bench.py @@ -22,6 +22,7 @@ def __init__(self, model_path, device="cpu"): self.provider = ( "CUDAExecutionProvider" if device == "cuda" else "CPUExecutionProvider" ) + self.results = [] def load_model(self): self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) diff --git a/bench_onnxruntime/bench.sh b/bench_onnxruntime/bench.sh index 3fcb07c0..bbd77b11 100755 --- a/bench_onnxruntime/bench.sh +++ b/bench_onnxruntime/bench.sh @@ -78,6 +78,9 @@ run_benchmarks() { local DEVICE="$4" local LOG_FILENAME="$5" local MODELS_DIR="$6" + + # shellcheck disable=SC1091 + source "$SCRIPT_DIR/venv/bin/activate" python "$SCRIPT_DIR"/bench.py \ --prompt "$PROMPT" \ diff --git a/bench_onnxruntime/requirements.txt b/bench_onnxruntime/requirements.txt index 2444ea67..a4fc99f4 100644 --- a/bench_onnxruntime/requirements.txt +++ b/bench_onnxruntime/requirements.txt @@ -1,5 +1,2 @@ -sentencepiece==0.1.99 -huggingface-hub==0.17.3 -transformers==4.35.0 -torch==2.1.0 -optimum[onnxruntime-gpu]==1.14.1 +torch --index-url https://download.pytorch.org/whl/cu116 +optimum[onnxruntime-gpu]==1.14 diff --git a/bench_onnxruntime/setup.sh b/bench_onnxruntime/setup.sh index 45073d96..f4d21ceb 100755 --- a/bench_onnxruntime/setup.sh +++ b/bench_onnxruntime/setup.sh @@ -26,7 +26,7 @@ if [ ! -d "$VENV_DIR" ]; then # shellcheck disable=SC1091 source "$VENV_DIR/bin/activate" pip install --upgrade pip > /dev/null - pip install -r "$SCRIPT_DIR"/requirements.txt + pip install -r "$SCRIPT_DIR"/requirements.txt > /dev/null else # shellcheck disable=SC1091 source "$VENV_DIR/bin/activate" @@ -42,14 +42,10 @@ get_device() { # Check and create llama-2-7b-onnx model if [ ! -d "$LLAMA_ONNX_MODEL_DIR" ]; then - echo "optimum-cli export onnx \ - --model $LLAMA_HF_MODEL_DIR --task text-generation --framework pt \ - --opset 17 --sequence_length 1024 --batch_size 1 --device $(get_device) --fp16 \ - $LLAMA_ONNX_MODEL_DIR" optimum-cli export onnx \ --model "$LLAMA_HF_MODEL_DIR" --task text-generation --framework pt \ --opset 17 --sequence_length 1024 --batch_size 1 --device "$(get_device)" --fp16 \ - "$LLAMA_ONNX_MODEL_DIR" + "$LLAMA_ONNX_MODEL_DIR" > /dev/null else echo "Model llama-2-7b-onnx already exists!" 
fi From 015b69f6db82c6ad02cacd08471e6ea1f80b159d Mon Sep 17 00:00:00 2001 From: nsosio Date: Thu, 23 Nov 2023 16:06:28 +0000 Subject: [PATCH 5/7] linter --- bench_onnxruntime/bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench_onnxruntime/bench.sh b/bench_onnxruntime/bench.sh index bbd77b11..0e53abbf 100755 --- a/bench_onnxruntime/bench.sh +++ b/bench_onnxruntime/bench.sh @@ -78,7 +78,7 @@ run_benchmarks() { local DEVICE="$4" local LOG_FILENAME="$5" local MODELS_DIR="$6" - + # shellcheck disable=SC1091 source "$SCRIPT_DIR/venv/bin/activate" From 0b15adf0e163d771c6bc90b61223a605f360009f Mon Sep 17 00:00:00 2001 From: nsosio Date: Thu, 23 Nov 2023 16:36:11 +0000 Subject: [PATCH 6/7] disabled cpu --- bench_onnxruntime/bench.sh | 4 ++++ bench_onnxruntime/setup.sh | 11 +---------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/bench_onnxruntime/bench.sh b/bench_onnxruntime/bench.sh index 0e53abbf..c90ca7a7 100755 --- a/bench_onnxruntime/bench.sh +++ b/bench_onnxruntime/bench.sh @@ -124,6 +124,10 @@ while [ "$#" -gt 0 ]; do echo "Metal not supported!" exit 0 fi + if [ "$DEVICE" == "cpu" ]; then + echo "cpu not supported!" + exit 0 + fi shift 2 ;; -lf|--log_file) diff --git a/bench_onnxruntime/setup.sh b/bench_onnxruntime/setup.sh index f4d21ceb..d33a443b 100755 --- a/bench_onnxruntime/setup.sh +++ b/bench_onnxruntime/setup.sh @@ -31,20 +31,11 @@ else # shellcheck disable=SC1091 source "$VENV_DIR/bin/activate" fi - -get_device() { - if command -v nvidia-smi &> /dev/null; then - echo "cuda" - else - echo "cpu" - fi -} - # Check and create llama-2-7b-onnx model if [ ! -d "$LLAMA_ONNX_MODEL_DIR" ]; then optimum-cli export onnx \ --model "$LLAMA_HF_MODEL_DIR" --task text-generation --framework pt \ - --opset 17 --sequence_length 1024 --batch_size 1 --device "$(get_device)" --fp16 \ + --opset 17 --sequence_length 1024 --batch_size 1 --device cuda --fp16 \ "$LLAMA_ONNX_MODEL_DIR" > /dev/null else echo "Model llama-2-7b-onnx already exists!" From c653ef9042b17830a5ba7de601b1ad8dedf04c9a Mon Sep 17 00:00:00 2001 From: nsosio Date: Thu, 23 Nov 2023 16:38:08 +0000 Subject: [PATCH 7/7] updated README.md --- README.md.template | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md.template b/README.md.template index 7f893179..315f5606 100644 --- a/README.md.template +++ b/README.md.template @@ -95,24 +95,26 @@ CUDA Version: NA Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device cpu --prompt 'Explain what is a transformer'` -| Engine | float32 | float16 | int8 | int4 | +| Engine | float32 | float16 | int8 | int4 | |-------------|--------------|--------------|--------------|--------------| | burn | 0.30 ± 0.09 | - | - | - | | candle | - | 3.43 ± 0.02 | - | - | | llama.cpp | - | - | 14.41 ± 1.59 | 20.96 ± 1.94 | | ctranslate | - | - | 2.11 ± 0.73 | - | | tinygrad | - | 4.21 ± 0.38 | - | - | +| onnx | - | - | - | - | #### GPU (Metal) Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device metal --prompt 'Explain what is a transformer'` -| Engine | float32 | float16 | int8 | int4 | +| Engine | float32 | float16 | int8 | int4 | |-------------|--------------|--------------|--------------|--------------| | burn | - | - | - | - | | candle | - | - | - | - | | llama.cpp | - | - | 31.24 ± 7.82 | 46.75 ± 9.55 | | ctranslate | - | - | - | - | | tinygrad | - | 29.78 ± 1.18 | - | - | +| onnx | - | - | - | - | *(data updated: )
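
---

For anyone reproducing the onnx numbers above, the pieces this series adds fit together roughly as follows. This is a minimal, self-contained sketch of the measurement loop, assuming the export step from `setup.sh` has already produced `./models/llama-2-7b-onnx`; the model path, prompt, and repetition count here mirror the benchmark command in the README and are illustrative, not part of the patches:

```python
# Sketch of the ONNX Runtime benchmark loop introduced in this series.
# Assumes `optimum-cli export onnx ... --fp16` (see setup.sh) already ran.
import time

import numpy as np
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

MODEL_DIR = "./models/llama-2-7b-onnx"  # illustrative path, as used in setup.sh

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = ORTModelForCausalLM.from_pretrained(
    MODEL_DIR,
    use_cache=False,      # the export above does not emit KV-cache outputs
    use_io_binding=False,
    provider="CUDAExecutionProvider",  # use "CPUExecutionProvider" on CPU-only hosts
)

# Tokenize once; only generation is timed, as in bench.py's run_model.
inputs = tokenizer("Explain what is a transformer", return_tensors="pt").to("cuda")

results = []
for _ in range(10):  # --repetitions 10
    start = time.time()
    gen_tokens = model.generate(**inputs, max_length=100)  # --max_tokens 100
    new_tokens = gen_tokens.shape[1] - inputs["input_ids"].shape[1]
    results.append(new_tokens / (time.time() - start))

print(f"onnx float16: {np.mean(results):.2f} ± {np.std(results):.2f} tokens/sec")
```

Note that tokens/sec is computed over newly generated tokens only (prompt tokens are subtracted), which is why the mean ± std values in the README tables are comparable across engines.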