From 7197a71e7d1b6cdbbd6f52c9d5cfb698b3162b3c Mon Sep 17 00:00:00 2001
From: nsosio
Date: Fri, 17 Nov 2023 13:24:42 +0000
Subject: [PATCH] added onnxruntime

---
 README.md                  |  3 ++-
 bench.py                   | 15 +++++++++++++++
 python_bench/onnx_bench.py | 35 +++++++++++++++++++++++++++++++++++
 requirements.txt           |  1 +
 setup.sh                   | 19 +++++++++++++++++++
 5 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 python_bench/onnx_bench.py

diff --git a/README.md b/README.md
index 6e12d2cf..d0c6108a 100644
--- a/README.md
+++ b/README.md
@@ -81,8 +81,9 @@ Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device gpu --nvidia
 | llama.cpp | - | - | 67.64 ± 22.57| 106.21 ± 2.21|
 | ctranslate | - | 58.54 ± 13.24| 34.22 ± 6.29 | - |
 | tinygrad | - | 20.13 ± 1.35 | - | - |
+| onnx | - | 50.50 ± 3.58 | - | - |
 
-*(data updated: 15th November 2023)
+*(data updated: 17th November 2023)
 
 ### M2 MAX 32GB Inference Bench:
diff --git a/bench.py b/bench.py
index ca316af5..d9610ee8 100644
--- a/bench.py
+++ b/bench.py
@@ -7,6 +7,7 @@
 from python_bench.ctranslate import CTranslateBenchmark, get_compute_types
 from python_bench.llama_cpp import LlamaCPPBenchmark
+from python_bench.onnx_bench import ONNXBenchmark
 from python_bench.tinygrad import TinyGradBenchmark
 
 logging.basicConfig(
@@ -57,6 +58,20 @@
         + f"repetitions={args.repetitions} gpu={args.gpu} nvidia={args.gpu}"
     )
     report = defaultdict(lambda: defaultdict(float))
+
+    logging.info("Running onnx benchmark")
+    onnx_bench = ONNXBenchmark(
+        "./models/llama-2-7b-onnx",
+        device="CPU" if not args.gpu else "GPU",
+    ).load_model()
+    onnx_bench.benchmark(
+        max_tokens=args.max_tokens, prompt=args.prompt, repetitions=args.repetitions
+    )
+    report["onnx"]["float16"] = {
+        "mean": np.mean(onnx_bench.results),
+        "std": np.std(onnx_bench.results),
+    }
+
     for quantize in ("Q8_0", "Q4_0"):
         logging.info(f"Running llama-cpp benchmark with {quantize}")
         llamacpp_bench = LlamaCPPBenchmark(
diff --git a/python_bench/onnx_bench.py b/python_bench/onnx_bench.py
new file mode 100644
index 00000000..8c5749ef
--- /dev/null
+++ b/python_bench/onnx_bench.py
@@ -0,0 +1,35 @@
+import time
+
+from optimum.onnxruntime import ORTModelForCausalLM
+from transformers import AutoTokenizer
+
+from python_bench.benchmark import Benchmark
+
+
+class ONNXBenchmark(Benchmark):
+    def __init__(self, model_path, device="CPU"):
+        super().__init__(model_path)
+        self.device = device
+        self.provider = (
+            "CUDAExecutionProvider" if device == "GPU" else "CPUExecutionProvider"
+        )
+
+    def load_model(self) -> Benchmark:
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
+        self.model = ORTModelForCausalLM.from_pretrained(
+            self.model_path,
+            use_cache=False,
+            use_io_binding=False,
+            provider=self.provider,
+        )
+        return self
+
+    def run_model(self, prompt, max_tokens) -> float:
+        device_str = "cuda" if self.device == "GPU" else "cpu"
+        inputs = self.tokenizer(prompt, return_tensors="pt").to(device_str)
+        start = time.time()
+        gen_tokens = self.model.generate(**inputs, max_length=max_tokens)
+        tokens_per_second = (gen_tokens.shape[1] - inputs["input_ids"].shape[1]) / (
+            time.time() - start
+        )
+        return tokens_per_second
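For reference, a minimal usage sketch of the new `ONNXBenchmark` class, mirroring how `bench.py` drives it above. It assumes, as the patch does, that the shared `Benchmark` base class stores one tokens/sec figure per repetition in `self.results`; the prompt string is illustrative only.

```python
# Usage sketch (not part of the patch); mirrors the bench.py integration above.
import numpy as np

from python_bench.onnx_bench import ONNXBenchmark

# device="GPU" selects CUDAExecutionProvider, "CPU" selects CPUExecutionProvider.
bench = ONNXBenchmark("./models/llama-2-7b-onnx", device="GPU").load_model()
bench.benchmark(max_tokens=100, prompt="Explain ONNX Runtime.", repetitions=10)

# Benchmark.results is assumed to hold the per-repetition tokens/sec measurements.
print(f"onnx: {np.mean(bench.results):.2f} ± {np.std(bench.results):.2f} tokens/sec")
```

Note that `run_model` passes `max_length=max_tokens` to `generate`, which bounds prompt plus generated tokens; that is why the tokens/sec figure subtracts the prompt length from the output length.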
diff --git a/requirements.txt b/requirements.txt
index b37ebfcc..ad09f117 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,6 +4,7 @@ ctranslate2==3.20.0
 huggingface-hub==0.17.3
 transformers==4.35.0
 torch==2.1.0
+optimum[onnxruntime-gpu]==1.14.1
 # Using fixed commit (a72b3700) for tinygrad to ensure stability in benchmarking.
 # Helps maintain reproducibility and guards against potential breaking changes.
 git+https://github.com/tinygrad/tinygrad.git@a72b370066837af5b4d44eeb5c4fb30aebf5c502
diff --git a/setup.sh b/setup.sh
index d8e7cf6a..e0bcb92f 100755
--- a/setup.sh
+++ b/setup.sh
@@ -16,6 +16,7 @@ BURN_MODEL_INPUT_DIR=$(pwd)/models/llama-2-7b-raw
 BURN_FOLDER=$(pwd)/rust_bench/llama2-burn
 BURN_MODEL_FOLDER=$(pwd)/models/llama-2-7b-burn
 BURN_MODEL_NAME="llama-2-7b-burn"
+LLAMA_ONNX_MODEL_DIR="./models/llama-2-7b-onnx"
 
 create_and_activate_venv() {
     if [ ! -d "$VENV_DIR" ]; then
@@ -81,3 +82,21 @@ if [ ! -e "$BURN_MODEL_FOLDER/$BURN_MODEL_NAME.cfg" ]; then
 else
     echo "Model llama-2-7b-burn already exists!"
 fi
+
+get_device() {
+    if command -v nvidia-smi &> /dev/null; then
+        echo "cuda"
+    else
+        echo "cpu"
+    fi
+}
+
+# Check and create llama-2-7b-onnx model
+if [ ! -d "$LLAMA_ONNX_MODEL_DIR" ]; then
+    optimum-cli export onnx \
+        --model $LLAMA_HF_MODEL_DIR --task text-generation --framework pt \
+        --opset 17 --sequence_length 1024 --batch_size 1 --device $(get_device) --fp16 \
+        $LLAMA_ONNX_MODEL_DIR > /dev/null
+else
+    echo "Model llama-2-7b-onnx already exists!"
+fi
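Once `setup.sh` has exported the model, the ONNX export can be sanity-checked before running the full benchmark. The snippet below is a sketch under the same assumptions as the patch (fp16 export into `./models/llama-2-7b-onnx`, `optimum[onnxruntime-gpu]==1.14.1`); substitute `CPUExecutionProvider` and a CPU tensor device if no CUDA GPU is available.

```python
# Sanity-check sketch (not part of the patch): load the exported model and
# generate a few tokens to confirm the export and execution provider work.
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

model_dir = "./models/llama-2-7b-onnx"  # LLAMA_ONNX_MODEL_DIR in setup.sh
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = ORTModelForCausalLM.from_pretrained(
    model_dir,
    use_cache=False,       # matches the patch; the export has no KV-cache outputs
    use_io_binding=False,
    provider="CUDAExecutionProvider",  # the fp16 export targets CUDA
)

inputs = tokenizer("The capital of France is", return_tensors="pt").to("cuda")
tokens = model.generate(**inputs, max_length=16)
print(tokenizer.decode(tokens[0], skip_special_tokens=True))
```

Keeping `use_cache=False` and `use_io_binding=False` here matches the load path used by `ONNXBenchmark`, so a model that passes this check is loaded under the same configuration that the benchmark times.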