Support for ExLLama v2 #90

Merged · 26 commits · Dec 8, 2023

Commits
cabb322
add exllamav2 inside .gitignore
Anindyadeep Dec 6, 2023
da2b9e6
added exllamav2 zip link inside models.json
Anindyadeep Dec 6, 2023
26d11d8
added initial inference logic
Anindyadeep Dec 6, 2023
bede094
added the script for conversion from .bin to .safetensors
Anindyadeep Dec 6, 2023
a9afdb4
added exllamav2 inside requirements
Anindyadeep Dec 6, 2023
0423283
initial setup script for exllamav2
Anindyadeep Dec 6, 2023
f2b293a
initial commit for benchmark sh file
Anindyadeep Dec 6, 2023
2c2114d
modified conversion logic for benchmarks script usage
Anindyadeep Dec 7, 2023
cf4831a
removed unnecessary code
Anindyadeep Dec 7, 2023
0479ead
added the logic to not convert redundantly
Anindyadeep Dec 7, 2023
41b02a8
fix: bug in .safetensors file checks
Anindyadeep Dec 7, 2023
0e9ae10
added script for doing both q4 and q8 quantization
Anindyadeep Dec 7, 2023
96aa995
refactor: benchmark script for exllamav2
Anindyadeep Dec 7, 2023
3fe6d1f
minor bug fix on quantization conversion
Anindyadeep Dec 7, 2023
4656fb2
adding .parquet files to ignore
Anindyadeep Dec 7, 2023
f7d7383
small bug fix in benchmarking logic
Anindyadeep Dec 7, 2023
52bad8f
removed sanity checks
Anindyadeep Dec 7, 2023
5f8dd03
added the script to benchmark exllama2
Anindyadeep Dec 7, 2023
5552dfc
added the benchmarking results on cuda
Anindyadeep Dec 7, 2023
0fb0246
added device as argument in bench.sh
Anindyadeep Dec 7, 2023
da50f9f
fix: spacing
Anindyadeep Dec 7, 2023
b5eab5f
removed exllamav2 download zip url
Anindyadeep Dec 8, 2023
6a02327
remove merge conflicts
Dec 8, 2023
07d41c2
added info of cpu and apple gpu for exllamav2
Dec 8, 2023
17f0bac
Merge pull request #5 from premAI-io/main
Anindyadeep Dec 8, 2023
fdfae86
resolve merge conflicts
Dec 8, 2023
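
Commit bede094 above adds a .bin → .safetensors conversion script that is not part of this diff excerpt; the wikitext-test.parquet ignored below is presumably the calibration dataset for the q4/q8 quantization commits. A minimal sketch of the conversion idea, assuming a standard PyTorch checkpoint and the safetensors library (the function name and paths here are illustrative, not the PR's actual code):

import torch
from safetensors.torch import save_file

def convert_bin_to_safetensors(bin_path: str, out_path: str) -> None:
    # Load the PyTorch checkpoint as a dict of CPU tensors.
    state_dict = torch.load(bin_path, map_location="cpu")
    # safetensors requires contiguous tensors.
    state_dict = {key: value.contiguous() for key, value in state_dict.items()}
    save_file(state_dict, out_path)

convert_bin_to_safetensors("pytorch_model.bin", "model.safetensors")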
2 changes: 2 additions & 0 deletions .gitignore
@@ -166,3 +166,5 @@ models/*
# Repositories
bench_tinygrad/tinygrad
bench_burn/llama2-burn
bench_exllamav2/exllamav2
bench_exllamav2/wikitext-test.parquet
129 changes: 129 additions & 0 deletions bench_exllamav2/bench.py
@@ -0,0 +1,129 @@
import argparse
import logging
import sys
import time
from collections import defaultdict
from dataclasses import dataclass
from typing import Optional

import numpy as np
import torch
from exllamav2 import ExLlamaV2Cache, model_init
from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler

logging.basicConfig(
stream=sys.stdout,
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)


@dataclass
class ExtraConfig:
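    # Mirrors the argparse namespace consumed by exllamav2.model_init.init()
    # in load_model() below; only the options this benchmark uses are exposed.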
model_dir: str
length: int = 2048
rope_scale: float = 1.0
rope_alpha: float = 1.0
no_flash_attn: bool = False
low_mem: bool = False
    gpu_split: Optional[str] = None


class ExllamaV2Benchmark:
def __init__(self, model_path: str) -> None:
self.model_path = model_path
self.cache = None
self.results = []

def load_model(self):
self.model, self.tokenizer = model_init.init(
ExtraConfig(model_dir=self.model_path), allow_auto_split=True
)
self.settings = ExLlamaV2Sampler.Settings()
self.settings.temperature = 0.85
self.settings.top_k = 50
self.settings.top_p = 0.8
self.settings.token_repetition_penalty = 1.15

        # The KV cache must exist before load_autosplit() so layers can be
        # spread across GPUs as memory fills; otherwise create it normally.
        if not self.model.loaded:
            self.cache = ExLlamaV2Cache(self.model)
            self.model.load_autosplit(self.cache)
        else:
            self.cache = ExLlamaV2Cache(self.model)
self.generator = ExLlamaV2BaseGenerator(self.model, self.cache, self.tokenizer)
self.settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id])
self.generator.warmup()
return self

@torch.inference_mode()
def run_model(self, prompt: str, max_tokens: int) -> float:
start = time.time()
_ = self.generator.generate_simple(
prompt, self.settings, max_tokens, token_healing=True
)
delta = time.time() - start
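        # Note: generator.sequence_ids holds the prompt plus generated tokens,
        # so this reports whole-sequence tokens per second.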
return len(self.generator.sequence_ids[0]) / delta

def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
for i in range(repetitions):
logging.info(
f"Running repetition [{str(i+1).zfill(len(str(repetitions)))}/{repetitions}]"
)
tokens_per_second = self.run_model(prompt, max_tokens)
self.results.append(tokens_per_second)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Benchmark ExLlamaV2 Llama models.")
parser.add_argument(
"--prompt",
type=str,
help="The prompt for the model.",
)
parser.add_argument("--max_tokens", type=int, help="The maximum number of tokens.")
parser.add_argument(
"--repetitions",
type=int,
help="The number of repetitions for the benchmark.",
)
parser.add_argument(
"--log_file",
type=str,
help="Path to the log file for writing logs (in append mode).",
)
parser.add_argument(
"--models_dir",
type=str,
help="Path to the models directory.",
)
args = parser.parse_args()
logging.info(
f"Running benchmark with: max_tokens={args.max_tokens} prompt={args.prompt} "
+ f"repetitions={args.repetitions} device=cuda"
)
report = defaultdict(lambda: defaultdict(float))
for quantize in ("q4", "q8"):
logging.info(f"Running ExllamaV2 benchmark with {quantize}")
        exllamav2_bench = ExllamaV2Benchmark(
            f"{args.models_dir}/llama-2-7b-exllamav2-{quantize}"
        ).load_model()
        exllamav2_bench.benchmark(
            max_tokens=args.max_tokens, prompt=args.prompt, repetitions=args.repetitions
        )
        q = "int8" if quantize == "q8" else "int4"
        report["exllamav2"][q] = {
            "mean": np.mean(exllamav2_bench.results),
            "std": np.std(exllamav2_bench.results),
        }

logging.info("Benchmark report")
with open(args.log_file, "a") as file:
for framework, quantizations in report.items():
for quantization, stats in quantizations.items():
logging.info(
f"{framework}, {quantization}: {stats['mean']:.2f} ± {stats['std']:.2f}"
)
print(
f"{framework}, {quantization}: {stats['mean']:.2f} ± {stats['std']:.2f}",
file=file,
)
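
For reference, the ExllamaV2Benchmark class above can also be driven directly from Python; a minimal sketch, assuming a q4 model directory that follows the naming convention used in the loop above:

import numpy as np

bench = ExllamaV2Benchmark("./models/llama-2-7b-exllamav2-q4").load_model()
bench.benchmark(prompt="Explain what is a transformer", max_tokens=100, repetitions=10)
print(f"exllamav2, int4: {np.mean(bench.results):.2f} ± {np.std(bench.results):.2f} tokens/sec")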
159 changes: 159 additions & 0 deletions bench_exllamav2/bench.sh
@@ -0,0 +1,159 @@
#!/bin/bash

########################################################################################################
# Script: bench.sh
# Description: This script runs the ExLlamaV2 Llama benchmark.
#
# Usage: ./bench.sh [OPTIONS]
# OPTIONS:
# -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')
# -r, --repetitions Number of repetitions for benchmarks (default: 10)
# -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)
# -d, --device Device for benchmarks (possible values: 'cuda', 'metal', and 'cpu'; default: 'cuda'; currently only 'cuda' is supported)
# -lf, --log_file Logging file name.
# -md, --models_dir Models directory.
# -h, --help Show this help message
########################################################################################################

set -euo pipefail

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

print_usage() {
echo "Usage: $0 [OPTIONS]"
echo "OPTIONS:"
echo " -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')"
echo " -r, --repetitions Number of repetitions for benchmarks (default: 2)"
echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)"
echo " -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')"
echo " -lf, --log_file Logging file name."
echo " -md, --models_dir Models directory."
echo " -h, --help Show this help message"
exit 1
}

check_cuda() {
if command -v nvcc &> /dev/null
then
echo -e "\nUsing CUDA"
nvcc --version
else
echo -e "\nCUDA is not available."
exit 1
fi
}

check_platform() {
local platform
platform=$(uname -s)
if [[ "$platform" == "Linux" ]]; then
echo "Running on Linux."
elif [[ "$platform" == "Darwin" ]]; then
echo "Running on Mac OS."
else
echo "Unknown platform."
exit 1
fi
}

check_python() {
if command -v python &> /dev/null
then
echo -e "\nUsing $(python --version)."
else
echo -e "\nPython does not exist."
exit 1
fi
}

setup() {
echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..."
bash "$SCRIPT_DIR"/setup.sh
}

run_benchmarks() {
local PROMPT="$1"
local REPETITIONS="$2"
local MAX_TOKENS="$3"
local DEVICE="$4"
local LOG_FILENAME="$5"
local MODELS_DIR="$6"

# shellcheck disable=SC1091
source "$SCRIPT_DIR/venv/bin/activate"
python "$SCRIPT_DIR"/bench.py \
--prompt "$PROMPT" \
--repetitions "$REPETITIONS" \
--max_tokens "$MAX_TOKENS" \
--log_file "$LOG_FILENAME" \
--models_dir "$MODELS_DIR"
}


# Parse command-line arguments
while [ "$#" -gt 0 ]; do
case "$1" in
-p|--prompt)
PROMPT="$2"
shift 2
;;
-r|--repetitions)
REPETITIONS="$2"
shift 2
;;
-m|--max_tokens)
MAX_TOKENS="$2"
shift 2
;;
-d|--device)
DEVICE="$2"
case "$DEVICE" in
"cuda" | "metal" | "cpu")
;;
*)
echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'."
print_usage
;;
esac
if [ "$DEVICE" == "cuda" ]; then
check_cuda
else
echo "Not supported for $DEVICE"
exit 1
fi
shift 2
;;
-lf|--log_file)
LOG_FILENAME="$2"
shift 2
;;
-md|--models_dir)
MODELS_DIR="$2"
shift 2
;;
-h|--help)
print_usage
;;
*)
echo "Unknown option: $1"
print_usage
;;
esac
done


# Set default values if not provided
PROMPT="${PROMPT:-"Explain what is a transformer"}"
REPETITIONS="${REPETITIONS:-10}"
MAX_TOKENS="${MAX_TOKENS:-100}"
DEVICE="${DEVICE:-'cuda'}"
LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}"
MODELS_DIR="${MODELS_DIR:-"./models"}"

check_platform
check_cuda
check_python
setup
run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$LOG_FILENAME" "$MODELS_DIR"