Support for ExLLama v2 #90

Merged · 26 commits · Dec 8, 2023

Commits
cabb322
add exllamav2 inside .gitignore
Anindyadeep Dec 6, 2023
da2b9e6
added exllamav2 zip link inside models.json
Anindyadeep Dec 6, 2023
26d11d8
added initial inference logic
Anindyadeep Dec 6, 2023
bede094
added the script for conversion from .bin to .safetensors
Anindyadeep Dec 6, 2023
a9afdb4
added exllamav2 inside requirements
Anindyadeep Dec 6, 2023
0423283
initial setup script for exllamav2
Anindyadeep Dec 6, 2023
f2b293a
initial commit for benchmark sh file
Anindyadeep Dec 6, 2023
2c2114d
modified conversion logic for benchmarks script usage
Anindyadeep Dec 7, 2023
cf4831a
removed unnecessary code
Anindyadeep Dec 7, 2023
0479ead
added the logic to not convert redundantly
Anindyadeep Dec 7, 2023
41b02a8
fix: bug in .safetensors file checks
Anindyadeep Dec 7, 2023
0e9ae10
added script for doing both q4 and q8 quantization
Anindyadeep Dec 7, 2023
96aa995
refactor: benchmark script for exllamav2
Anindyadeep Dec 7, 2023
3fe6d1f
minor bug fix on quantization conversion
Anindyadeep Dec 7, 2023
4656fb2
adding .parquet files to ignore
Anindyadeep Dec 7, 2023
f7d7383
small bug fix in benchmarking logic
Anindyadeep Dec 7, 2023
52bad8f
removed sanity checks
Anindyadeep Dec 7, 2023
5f8dd03
added the script to benchmark exllama2
Anindyadeep Dec 7, 2023
5552dfc
added the benchmarking results on cuda
Anindyadeep Dec 7, 2023
0fb0246
added device as argument in bench.sh
Anindyadeep Dec 7, 2023
da50f9f
fix: spacing
Anindyadeep Dec 7, 2023
b5eab5f
removed exllamav2 download zip url
Anindyadeep Dec 8, 2023
6a02327
remove merge conflicts
Dec 8, 2023
07d41c2
added info of cpu and apple gpu for exllamav2
Dec 8, 2023
17f0bac
Merge pull request #5 from premAI-io/main
Anindyadeep Dec 8, 2023
fdfae86
resolve merge conflicts
Dec 8, 2023
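
Commit bede094 above adds a .bin → .safetensors conversion script that is not part of this diff excerpt; the wikitext-test.parquet ignored below is presumably the calibration dataset for the q4/q8 quantization commits. A minimal sketch of the conversion idea, assuming a standard PyTorch checkpoint and the safetensors library (the function name and paths here are illustrative, not the PR's actual code):

import torch
from safetensors.torch import save_file

def convert_bin_to_safetensors(bin_path: str, out_path: str) -> None:
    # Load the PyTorch checkpoint as a dict of CPU tensors.
    state_dict = torch.load(bin_path, map_location="cpu")
    # safetensors requires contiguous tensors.
    state_dict = {key: value.contiguous() for key, value in state_dict.items()}
    save_file(state_dict, out_path)

convert_bin_to_safetensors("pytorch_model.bin", "model.safetensors")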
2 changes: 2 additions & 0 deletions .gitignore
@@ -166,3 +166,5 @@ models/*
# Repositories
bench_tinygrad/tinygrad
bench_burn/llama2-burn
bench_exllamav2/exllamav2
bench_exllamav2/wikitext-test.parquet
129 changes: 129 additions & 0 deletions bench_exllamav2/bench.py
@@ -0,0 +1,129 @@
import argparse
import logging
import sys
import time
from collections import defaultdict
from dataclasses import dataclass
from typing import Optional

import numpy as np
import torch
from exllamav2 import ExLlamaV2Cache, model_init
from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler

logging.basicConfig(
stream=sys.stdout,
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)


@dataclass
class ExtraConfig:
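    # Mirrors the argparse namespace consumed by exllamav2.model_init.init()
    # in load_model() below; only the options this benchmark uses are exposed.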
model_dir: str
length: int = 2048
rope_scale: float = 1.0
rope_alpha: float = 1.0
no_flash_attn: bool = False
low_mem: bool = False
    gpu_split: Optional[str] = None


class ExllamaV2Benchmark:
def __init__(self, model_path: str) -> None:
self.model_path = model_path
self.cache = None
self.results = []

def load_model(self):
self.model, self.tokenizer = model_init.init(
ExtraConfig(model_dir=self.model_path), allow_auto_split=True
)
self.settings = ExLlamaV2Sampler.Settings()
self.settings.temperature = 0.85
self.settings.top_k = 50
self.settings.top_p = 0.8
self.settings.token_repetition_penalty = 1.15

        # The KV cache must exist before load_autosplit() so layers can be
        # spread across GPUs as memory fills; otherwise create it normally.
        if not self.model.loaded:
            self.cache = ExLlamaV2Cache(self.model)
            self.model.load_autosplit(self.cache)
        else:
            self.cache = ExLlamaV2Cache(self.model)
self.generator = ExLlamaV2BaseGenerator(self.model, self.cache, self.tokenizer)
self.settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id])
self.generator.warmup()
return self

@torch.inference_mode()
def run_model(self, prompt: str, max_tokens: int) -> float:
start = time.time()
_ = self.generator.generate_simple(
prompt, self.settings, max_tokens, token_healing=True
)
delta = time.time() - start
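        # Note: generator.sequence_ids holds the prompt plus generated tokens,
        # so this reports whole-sequence tokens per second.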
return len(self.generator.sequence_ids[0]) / delta

def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
for i in range(repetitions):
logging.info(
f"Running repetition [{str(i+1).zfill(len(str(repetitions)))}/{repetitions}]"
)
tokens_per_second = self.run_model(prompt, max_tokens)
self.results.append(tokens_per_second)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Benchmark ExLlamaV2 Llama models.")
parser.add_argument(
"--prompt",
type=str,
help="The prompt for the model.",
)
parser.add_argument("--max_tokens", type=int, help="The maximum number of tokens.")
parser.add_argument(
"--repetitions",
type=int,
help="The number of repetitions for the benchmark.",
)
parser.add_argument(
"--log_file",
type=str,
help="Path to the log file for writing logs (in append mode).",
)
parser.add_argument(
"--models_dir",
type=str,
help="Path to the models directory.",
)
args = parser.parse_args()
logging.info(
f"Running benchmark with: max_tokens={args.max_tokens} prompt={args.prompt} "
+ f"repetitions={args.repetitions} device=cuda"
)
report = defaultdict(lambda: defaultdict(float))
for quantize in ("q4", "q8"):
logging.info(f"Running ExllamaV2 benchmark with {quantize}")
        exllamav2_bench = ExllamaV2Benchmark(
            f"{args.models_dir}/llama-2-7b-exllamav2-{quantize}"
        ).load_model()
        exllamav2_bench.benchmark(
            max_tokens=args.max_tokens, prompt=args.prompt, repetitions=args.repetitions
        )
        q = "int8" if quantize == "q8" else "int4"
        report["exllamav2"][q] = {
            "mean": np.mean(exllamav2_bench.results),
            "std": np.std(exllamav2_bench.results),
        }

logging.info("Benchmark report")
with open(args.log_file, "a") as file:
for framework, quantizations in report.items():
for quantization, stats in quantizations.items():
logging.info(
f"{framework}, {quantization}: {stats['mean']:.2f} ± {stats['std']:.2f}"
)
print(
f"{framework}, {quantization}: {stats['mean']:.2f} ± {stats['std']:.2f}",
file=file,
)
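
For reference, the ExllamaV2Benchmark class above can also be driven directly from Python; a minimal sketch, assuming a q4 model directory that follows the naming convention used in the loop above:

import numpy as np

bench = ExllamaV2Benchmark("./models/llama-2-7b-exllamav2-q4").load_model()
bench.benchmark(prompt="Explain what is a transformer", max_tokens=100, repetitions=10)
print(f"exllamav2, int4: {np.mean(bench.results):.2f} ± {np.std(bench.results):.2f} tokens/sec")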
159 changes: 159 additions & 0 deletions bench_exllamav2/bench.sh
@@ -0,0 +1,159 @@
#!/bin/bash

########################################################################################################
# Script: bench.sh
# Description: This script runs the ExLlamaV2 Llama benchmark.
#
# Usage: ./bench.sh [OPTIONS]
# OPTIONS:
# -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')
# -r, --repetitions Number of repetitions for benchmarks (default: 10)
# -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)
# -d, --device Device for benchmarks (possible values: 'cuda', 'metal', and 'cpu'; default: 'cuda'; currently only 'cuda' is supported)
# -lf, --log_file Logging file name.
# -md, --models_dir Models directory.
# -h, --help Show this help message
########################################################################################################

set -euo pipefail

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

print_usage() {
echo "Usage: $0 [OPTIONS]"
echo "OPTIONS:"
echo " -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')"
echo " -r, --repetitions Number of repetitions for benchmarks (default: 2)"
echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)"
echo " -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')"
echo " -lf, --log_file Logging file name."
echo " -md, --models_dir Models directory."
echo " -h, --help Show this help message"
exit 1
}

check_cuda() {
if command -v nvcc &> /dev/null
then
echo -e "\nUsing CUDA"
nvcc --version
else
echo -e "\nCUDA is not available."
exit 1
fi
}

check_platform() {
local platform
platform=$(uname -s)
if [[ "$platform" == "Linux" ]]; then
echo "Running on Linux."
elif [[ "$platform" == "Darwin" ]]; then
echo "Running on Mac OS."
else
echo "Unknown platform."
exit 1
fi
}

check_python() {
if command -v python &> /dev/null
then
echo -e "\nUsing $(python --version)."
else
echo -e "\nPython does not exist."
exit 1
fi
}

setup() {
echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..."
bash "$SCRIPT_DIR"/setup.sh
}

run_benchmarks() {
local PROMPT="$1"
local REPETITIONS="$2"
local MAX_TOKENS="$3"
local DEVICE="$4"
local LOG_FILENAME="$5"
local MODELS_DIR="$6"

# shellcheck disable=SC1091
source "$SCRIPT_DIR/venv/bin/activate"
python "$SCRIPT_DIR"/bench.py \
--prompt "$PROMPT" \
--repetitions "$REPETITIONS" \
--max_tokens "$MAX_TOKENS" \
--log_file "$LOG_FILENAME" \
--models_dir "$MODELS_DIR"
}


# Parse command-line arguments
while [ "$#" -gt 0 ]; do
case "$1" in
-p|--prompt)
PROMPT="$2"
shift 2
;;
-r|--repetitions)
REPETITIONS="$2"
shift 2
;;
-m|--max_tokens)
MAX_TOKENS="$2"
shift 2
;;
-d|--device)
DEVICE="$2"
case "$DEVICE" in
"cuda" | "metal" | "cpu")
;;
*)
echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'."
print_usage
;;
esac
if [ "$DEVICE" == "cuda" ]; then
check_cuda
else
echo "Not supported for $DEVICE"
exit 1
fi
shift 2
;;
-lf|--log_file)
LOG_FILENAME="$2"
shift 2
;;
-md|--models_dir)
MODELS_DIR="$2"
shift 2
;;
-h|--help)
print_usage
;;
*)
echo "Unknown option: $1"
print_usage
;;
esac
done


# Set default values if not provided
PROMPT="${PROMPT:-"Explain what is a transformer"}"
REPETITIONS="${REPETITIONS:-10}"
MAX_TOKENS="${MAX_TOKENS:-100}"
DEVICE="${DEVICE:-'cuda'}"
LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}"
MODELS_DIR="${MODELS_DIR:-"./models"}"

check_platform
check_cuda
check_python
setup
run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$LOG_FILENAME" "$MODELS_DIR"