added onnxruntime
nsosio committed Nov 17, 2023 · 1 parent b32365e · commit 7197a71
Showing 5 changed files with 72 additions and 1 deletion.
3 changes: 2 additions & 1 deletion README.md
@@ -81,8 +81,9 @@ Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device gpu --nvidia`
| llama.cpp  | -            | -             | 67.64 ± 22.57 | 106.21 ± 2.21 |
| ctranslate | -            | 58.54 ± 13.24 | 34.22 ± 6.29  | -             |
| tinygrad   | -            | 20.13 ± 1.35  | -             | -             |
+ | onnx     | -            | 50.50 ± 3.58  | -             | -             |

- *(data updated: 15th November 2023)
+ *(data updated: 17th November 2023)


### M2 MAX 32GB Inference Bench:
15 changes: 15 additions & 0 deletions bench.py
@@ -7,6 +7,7 @@

from python_bench.ctranslate import CTranslateBenchmark, get_compute_types
from python_bench.llama_cpp import LlamaCPPBenchmark
+ from python_bench.onnx_bench import ONNXBenchmark
from python_bench.tinygrad import TinyGradBenchmark

logging.basicConfig(
@@ -57,6 +58,20 @@
+ f"repetitions={args.repetitions} gpu={args.gpu} nvidia={args.gpu}"
)
report = defaultdict(lambda: defaultdict(float))

logging.info("Running onnx benchmark")
onnx_bench = ONNXBenchmark(
"./models/llama-2-7b-onnx",
device="CPU" if not args.gpu else "GPU",
).load_model()
onnx_bench.benchmark(
max_tokens=args.max_tokens, prompt=args.prompt, repetitions=args.repetitions
)
report["onnx"]["float16"] = {
"mean": np.mean(onnx_bench.results),
"std": np.std(onnx_bench.results),
}

for quantize in ("Q8_0", "Q4_0"):
    logging.info(f"Running llama-cpp benchmark with {quantize}")
    llamacpp_bench = LlamaCPPBenchmark(
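The block above drives the new backend through the repo's shared Benchmark interface (python_bench/benchmark.py, untouched by this commit). For readers without the rest of the source, here is a rough sketch of what that base class presumably provides, inferred from how bench.py chains load_model(), calls benchmark(...), and then reads .results — the actual implementation may differ:

import logging


class Benchmark:
    """Hypothetical reconstruction of python_bench/benchmark.py."""

    def __init__(self, model_path):
        self.model_path = model_path
        self.results = []  # tokens/sec per repetition; aggregated by bench.py

    def load_model(self) -> "Benchmark":
        raise NotImplementedError

    def run_model(self, prompt, max_tokens) -> float:
        raise NotImplementedError

    def benchmark(self, max_tokens, prompt, repetitions):
        # Run the model repeatedly, collecting one throughput sample per run.
        for i in range(repetitions):
            logging.info(f"Running repetition [{i + 1}/{repetitions}]")
            self.results.append(self.run_model(prompt, max_tokens))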
35 changes: 35 additions & 0 deletions python_bench/onnx_bench.py
@@ -0,0 +1,35 @@
import time

from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

from python_bench.benchmark import Benchmark


class ONNXBenchmark(Benchmark):
    def __init__(self, model_path, device="CPU"):
        super().__init__(model_path)
        self.device = device
        # Map the benchmark's device flag to an ONNX Runtime execution provider.
        self.provider = (
            "CUDAExecutionProvider" if device == "GPU" else "CPUExecutionProvider"
        )

    def load_model(self) -> Benchmark:
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        self.model = ORTModelForCausalLM.from_pretrained(
            self.model_path,
            use_cache=False,
            use_io_binding=False,
            provider=self.provider,
        )
        return self

    def run_model(self, prompt, max_tokens) -> float:
        device_str = "cuda" if self.device == "GPU" else "cpu"
        inputs = self.tokenizer(prompt, return_tensors="pt").to(device_str)
        start = time.time()
        gen_tokens = self.model.generate(**inputs, max_length=max_tokens)
        # Count only newly generated tokens: max_length includes the prompt,
        # so subtract the input length before dividing by elapsed time.
        tokens_per_second = (gen_tokens.shape[1] - inputs["input_ids"].shape[1]) / (
            time.time() - start
        )
        return tokens_per_second
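For quick checks outside the harness, the class can also be driven directly. A minimal standalone sketch, assuming the ONNX export produced by setup.sh already exists at ./models/llama-2-7b-onnx (the prompt string is purely illustrative):

import numpy as np

from python_bench.onnx_bench import ONNXBenchmark

bench = ONNXBenchmark("./models/llama-2-7b-onnx", device="GPU").load_model()
bench.benchmark(max_tokens=100, prompt="The capital of France is", repetitions=10)
# Matches the mean/std aggregation bench.py writes into its report.
print(f"onnx: {np.mean(bench.results):.2f} ± {np.std(bench.results):.2f} tokens/sec")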
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,6 +4,7 @@ ctranslate2==3.20.0
huggingface-hub==0.17.3
transformers==4.35.0
torch==2.1.0
+ optimum[onnxruntime-gpu]==1.14.1
# Using fixed commit (a72b3700) for tinygrad to ensure stability in benchmarking.
# Helps maintain reproducibility and guards against potential breaking changes.
git+https://github.com/tinygrad/tinygrad.git@a72b370066837af5b4d44eeb5c4fb30aebf5c502
19 changes: 19 additions & 0 deletions setup.sh
@@ -16,6 +16,7 @@ BURN_MODEL_INPUT_DIR=$(pwd)/models/llama-2-7b-raw
BURN_FOLDER=$(pwd)/rust_bench/llama2-burn
BURN_MODEL_FOLDER=$(pwd)/models/llama-2-7b-burn
BURN_MODEL_NAME="llama-2-7b-burn"
+ LLAMA_ONNX_MODEL_DIR="./models/llama-2-7b-onnx"

create_and_activate_venv() {
    if [ ! -d "$VENV_DIR" ]; then
@@ -81,3 +82,21 @@ if [ ! -e "$BURN_MODEL_FOLDER/$BURN_MODEL_NAME.cfg" ]; then
else
    echo "Model llama-2-7b-burn already exists!"
fi

+ get_device() {
+     if command -v nvidia-smi &> /dev/null; then
+         echo "cuda"
+     else
+         echo "cpu"
+     fi
+ }

+ # Check and create the llama-2-7b-onnx model
+ if [ ! -d "$LLAMA_ONNX_MODEL_DIR" ]; then
+     optimum-cli export onnx \
+         --model $LLAMA_HF_MODEL_DIR --task text-generation --framework pt \
+         --opset 17 --sequence_length 1024 --batch_size 1 --device $(get_device) --fp16 \
+         $LLAMA_ONNX_MODEL_DIR > /dev/null
+ else
+     echo "Model llama-2-7b-onnx already exists!"
+ fi
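The same export can also be done from Python rather than optimum-cli — a hedged sketch using optimum's export-on-load path, assuming $LLAMA_HF_MODEL_DIR (defined earlier in setup.sh, outside this hunk) points at a local Hugging Face checkpoint. Note that the CLI's --fp16 flag generally requires a CUDA device, which get_device selects only when nvidia-smi is available:

from optimum.onnxruntime import ORTModelForCausalLM

# export=True converts the PyTorch checkpoint to ONNX on load.
# "models/llama-2-7b-hf" is an assumed stand-in for $LLAMA_HF_MODEL_DIR.
model = ORTModelForCausalLM.from_pretrained(
    "models/llama-2-7b-hf",
    export=True,
    use_cache=False,  # mirrors how ONNXBenchmark later loads the model
)
model.save_pretrained("models/llama-2-7b-onnx")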
