From e9659c1680a004c97a6b0d3477f1fe554bab70d3 Mon Sep 17 00:00:00 2001
From: Anindyadeep <anindya@pop-os.localdomain>
Date: Tue, 5 Dec 2023 18:14:31 +0530
Subject: [PATCH 01/29] Feat: Benchmark for transformers pytorch.

This commit adds the initial script for benchmarking the PyTorch implementation
of Llama from Hugging Face transformers. Benchmarks run in fp16, fp32 and bf16 precision.
---
 bench_pytorch/bench.py | 143 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)
 create mode 100644 bench_pytorch/bench.py

diff --git a/bench_pytorch/bench.py b/bench_pytorch/bench.py
new file mode 100644
index 00000000..29f4cc00
--- /dev/null
+++ b/bench_pytorch/bench.py
@@ -0,0 +1,143 @@
+import argparse
+import logging
+import sys
+import time
+from collections import defaultdict
+from typing import Optional
+
+import numpy as np
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+logging.getLogger("transformers").setLevel(logging.ERROR)
+logging.basicConfig(
+    stream=sys.stdout,
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+)
+
+
+class LlamaPyTorchBenchmark:
+    def __init__(
+        self, model_path: str, precision: str, device: Optional[str] = "cuda"
+    ) -> None:
+        self.model_path = model_path
+        self.precision = precision
+        self.results = []
+        self.precision_to_dtype_map = {
+            "fp16": torch.float16,
+            "fp32": torch.float32,
+            "bf16": torch.bfloat16,
+        }
+
+        # some of the conditions where things can not be supported
+        assert precision in ["bf16", "fp16", "fp32"], ValueError(
+            "Supported precisions are: p16', 'fp32', 'int8', 'int4'"
+        )
+        assert device in ["cpu", "cuda", "mps"], ValueError(
+            "Supported devices are: 'cpu', 'cuda', 'mps'"
+        )
+
+        if device == "cpu" and precision != "fp32":
+            raise ValueError(
+                "When device is set to CPU, fp32 is the only supported precision."
+            )
+
+        self.device = "cuda:0" if device == "cuda" else device
+        # build the params
+        self.model_args = {
+            "device_map": self.device,
+            "torch_dtype": self.precision_to_dtype_map[self.precision],
+        }
+
+    def load_model(self):
+        """Loads the model into various formats and device."""
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.model_path, **self.model_args
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
+        return self
+
+    def run_model(self, prompt: str, max_tokens: int) -> float:
+        start = time.time()
+        tokenized_input = self.tokenizer.encode(prompt, return_tensors="pt").to(
+            self.device
+        )
+        output = (
+            self.model.generate(**tokenized_input, max_new_tokens=max_tokens)
+            .detach()
+            .cpu()
+            .numpy()
+        )
+        delta = time.time() - start
+        return len(output) / delta
+
+    def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
+        for i in range(repetitions):
+            logging.info(
+                f"Running repetition [{str(i+1).zfill(len(str(repetitions)))}/{repetitions}]"
+            )
+            tokens_per_second = self.run_model(prompt, max_tokens)
+            self.results.append(tokens_per_second)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="CTransformers Benchmark.")
+    parser.add_argument(
+        "--prompt",
+        type=str,
+        help="The prompt for the model.",
+    )
+    parser.add_argument("--max_tokens", type=int, help="The maximum number of tokens.")
+    parser.add_argument(
+        "--repetitions",
+        type=int,
+        help="The number of repetitions for the benchmark.",
+    )
+    parser.add_argument(
+        "--device",
+        help="Device to use for the benchmark.",
+    )
+    parser.add_argument(
+        "--log_file",
+        type=str,
+        help="Path to the log file for writing logs (in append mode).",
+    )
+    parser.add_argument(
+        "--models_dir",
+        type=str,
+        help="Path to the models directory.",
+    )
+    args = parser.parse_args()
+    logging.info(
+        f"Running benchmark with: max_tokens={args.max_tokens} prompt={args.prompt} "
+        + f"repetitions={args.repetitions} device={args.device}"
+    )
+    report = defaultdict(lambda: defaultdict(float))
+
+    for precision in ("bf16", "fp16", "fp32") if args.device != "cpu" else ("fp32"):
+        logging.info(
+            f"Running Transformer benchmark (pytorch backend) on Llama with precision: {precision}"
+        )
+        llama_transformers_pytorch_benchmark = LlamaPyTorchBenchmark(
+            model_path=args.model_dir, device=args.device, precision=precision
+        ).load_model()
+        llama_transformers_pytorch_benchmark.benchmark(
+            max_tokens=args.max_tokens, prompt=args.prompt, repetitions=args.repetitions
+        )
+
+        report["llama_transformers_pytorch"][precision] = {
+            "mean": np.mean(llama_transformers_pytorch_benchmark.results),
+            "std": np.std(llama_transformers_pytorch_benchmark.results),  # spread across repetitions, not a second mean
+        }
+    logging.info("Benchmark Report")
+    with open(args.log_file, "a") as file:
+        for framework, quantizations in report.items():
+            for quantization, stats in quantizations.items():
+                logging.info(
+                    f"{framework}, {quantization}: {stats['mean']:.2f} ± {stats['std']:.2f}"
+                )
+                print(
+                    f"{framework}, {quantization}: {stats['mean']:.2f} ± {stats['std']:.2f}",
+                    file=file,
+                )

From d86788ce19a8b9b74432cf3bf4d085c40165bd75 Mon Sep 17 00:00:00 2001
From: Anindyadeep <anindya@pop-os.localdomain>
Date: Tue, 5 Dec 2023 18:16:09 +0530
Subject: [PATCH 02/29] initial script for doing benchmarking transformers
 pytorch

---
 bench_pytorch/bench.sh | 151 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 151 insertions(+)
 create mode 100644 bench_pytorch/bench.sh

diff --git a/bench_pytorch/bench.sh b/bench_pytorch/bench.sh
new file mode 100644
index 00000000..37d36003
--- /dev/null
+++ b/bench_pytorch/bench.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+
+########################################################################################################
+# Script: bench.sh
+# Description: This script runs the transformers (pytorch) Llama benchmark.
+#
+# Usage: ./bench.sh [OPTIONS]
+# OPTIONS:
+#   -p, --prompt      Prompt for benchmarks (default: 'Explain what is a transformer')
+#   -r, --repetitions Number of repetitions for benchmarks (default: 10)
+#   -m, --max_tokens  Maximum number of tokens for benchmarks (default: 100)
+#   -d, --device      Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cpu')
+#   -lf, --log_file   Logging file name.
+#   -md, --models_dir Models directory.
+#   -h, --help        Show this help message
+########################################################################################################
+
+set -euo pipefail
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+print_usage() {  # Print the CLI help text, then exit with status 1.
+    echo "Usage: $0 [OPTIONS]"
+    echo "OPTIONS:"
+    echo "  -p, --prompt        Prompt for benchmarks (default: 'Explain what is a transformer')"
+    echo "  -r, --repetitions   Number of repetitions for benchmarks (default: 10)"
+    echo "  -m, --max_tokens    Maximum number of tokens for benchmarks (default: 100)"
+    echo "  -d, --device        Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cpu')"
+    echo "  -lf, --log_file     Logging file name."
+    echo "  -md, --models_dir   Models directory."
+    echo "  -h, --help          Show this help message"
+    exit 1  # always terminates the script, including after an invalid option
+}
+
+check_cuda() {
+    if command -v nvcc &> /dev/null
+    then
+        echo -e "\nUsing CUDA"
+        nvcc --version
+    else
+        echo -e "\nCUDA is not available."
+        exit 1
+    fi
+}
+
+check_platform() {
+    local platform
+    platform=$(uname -s)
+    if [[ "$platform" == "Linux" ]]; then
+        echo "Running on Linux."
+    elif [[ "$platform" == "Darwin" ]]; then
+        echo "Running on Mac OS."
+    else
+        echo "Unknown platform."
+        exit 1
+    fi
+}
+
+check_python() {
+    if command -v python &> /dev/null
+    then
+        echo -e "\nUsing $(python --version)."
+    else
+        echo -e "\nPython does not exist."
+        exit 1
+    fi
+}
+
+setup() {
+    echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..."
+    bash "$SCRIPT_DIR"/setup.sh "$1"
+}
+
+run_benchmarks() {
+    local PROMPT="$1"
+    local REPETITIONS="$2"
+    local MAX_TOKENS="$3"
+    local DEVICE="$4"
+    local LOG_FILENAME="$5"
+    local MODELS_DIR="$6"
+
+    # shellcheck disable=SC1091
+    source "$SCRIPT_DIR/venv/bin/activate"
+    python "$SCRIPT_DIR"/bench.py \
+        --prompt "$PROMPT" \
+        --repetitions "$REPETITIONS" \
+        --max_tokens "$MAX_TOKENS" \
+        --log_file "$LOG_FILENAME" \
+        --models_dir "$MODELS_DIR" \
+        --device "$DEVICE"
+}
+
+# Parse command-line arguments
+while [ "$#" -gt 0 ]; do
+    case "$1" in
+        -p|--prompt)
+            PROMPT="$2"
+            shift 2
+            ;;
+        -r|--repetitions)
+            REPETITIONS="$2"
+            shift 2
+            ;;
+        -m|--max_tokens)
+            MAX_TOKENS="$2"
+            shift 2
+            ;;
+        -d|--device)
+            DEVICE="$2"
+            case "$DEVICE" in  # reject anything outside the supported device list
+                "cuda" | "metal" | "cpu")
+                    ;;
+                *)
+                    echo "Invalid value for --device. Please use 'cuda', 'metal' or 'cpu'."
+                    print_usage
+                    ;;
+            esac
+            if [ "$DEVICE" == "cuda" ]; then
+                check_cuda
+            fi
+            shift 2
+            ;;
+        -lf|--log_file)
+            LOG_FILENAME="$2"
+            shift 2
+            ;;
+        -md|--models_dir)
+            MODELS_DIR="$2"
+            shift 2
+            ;;
+        -h|--help)
+            print_usage
+            ;;
+        *)
+            echo "Unknown option: $1"
+            print_usage
+            ;;
+    esac
+done
+# Set default values if not provided
+PROMPT="${PROMPT:-"Explain what is a transformer"}"
+REPETITIONS="${REPETITIONS:-10}"
+MAX_TOKENS="${MAX_TOKENS:-100}"
+DEVICE="${DEVICE:-'cpu'}"
+LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}"
+MODELS_DIR="${MODELS_DIR:-"./models"}"
+
+check_platform
+check_python
+setup "$DEVICE"
+run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$LOG_FILENAME" "$MODELS_DIR"

From f625ef72685e33edb9a094d86a98ff50ca4e7a7e Mon Sep 17 00:00:00 2001
From: Anindyadeep <anindya@pop-os.localdomain>
Date: Tue, 5 Dec 2023 18:16:33 +0530
Subject: [PATCH 03/29] added the requirements to install for benchmarking
 transformers pytorch

---
 bench_pytorch/requirements.txt | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 bench_pytorch/requirements.txt

diff --git a/bench_pytorch/requirements.txt b/bench_pytorch/requirements.txt
new file mode 100644
index 00000000..3f76c56b
--- /dev/null
+++ b/bench_pytorch/requirements.txt
@@ -0,0 +1,4 @@
+transformers==4.34.1
+torch==2.0.0
+dataset==2.14.6
+accelerate==0.24.1

From 5e0f43690c182ae25d568782c1cd3af573069d69 Mon Sep 17 00:00:00 2001
From: Anindyadeep <anindya@pop-os.localdomain>
Date: Tue, 5 Dec 2023 18:17:12 +0530
Subject: [PATCH 04/29] added the installation setup sh files

---
 bench_pytorch/setup.sh | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 bench_pytorch/setup.sh

diff --git a/bench_pytorch/setup.sh b/bench_pytorch/setup.sh
new file mode 100644
index 00000000..563e0815
--- /dev/null
+++ b/bench_pytorch/setup.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+################################################################################
+# Script: setup.sh <DEVICE>
+# Description: Automates the setup of a virtual environment and installs project
+# requirements.
+################################################################################
+
+set -euo pipefail
+
+# Main script starts here.
+
+if [ "$#" -ne 1 ]; then
+    echo "Usage: $0 <DEVICE>"
+    exit 1
+fi
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+VENV_DIR="$SCRIPT_DIR/venv"
+
+if [ ! -d "$VENV_DIR" ]; then
+    python3 -m venv "$VENV_DIR"
+    echo "Virtual environment '$VENV_DIR' created."
+    # shellcheck disable=SC1091
+    source "$VENV_DIR/bin/activate"
+    pip install --upgrade pip > /dev/null
+    pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null
+else
+    # shellcheck disable=SC1091
+    source "$VENV_DIR/bin/activate"
+fi

From 19ced2da3d4c1a2ba1210058ceb62a08d43af62c Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Tue, 5 Dec 2023 14:18:38 +0000
Subject: [PATCH 05/29] refactor: Fix minor bugs in benchmark python script.

---
 bench_pytorch/bench.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/bench_pytorch/bench.py b/bench_pytorch/bench.py
index 29f4cc00..e41c814b 100644
--- a/bench_pytorch/bench.py
+++ b/bench_pytorch/bench.py
@@ -32,7 +32,7 @@ def __init__(
 
         # some of the conditions where things can not be supported
         assert precision in ["bf16", "fp16", "fp32"], ValueError(
-            "Supported precisions are: p16', 'fp32', 'int8', 'int4'"
+            "Supported precisions are: 'bf16', fp16', 'fp32'"
         )
         assert device in ["cpu", "cuda", "mps"], ValueError(
             "Supported devices are: 'cpu', 'cuda', 'mps'"
@@ -64,7 +64,7 @@ def run_model(self, prompt: str, max_tokens: int) -> float:
             self.device
         )
         output = (
-            self.model.generate(**tokenized_input, max_new_tokens=max_tokens)
+            self.model.generate(input_ids=tokenized_input, max_new_tokens=max_tokens)
             .detach()
             .cpu()
             .numpy()
@@ -79,6 +79,8 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
             )
             tokens_per_second = self.run_model(prompt, max_tokens)
             self.results.append(tokens_per_second)
+        del self.model
+        torch.cuda.synchronize()
 
 
 if __name__ == "__main__":
@@ -115,12 +117,12 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
     )
     report = defaultdict(lambda: defaultdict(float))
 
-    for precision in ("bf16", "fp16", "fp32") if args.device != "cpu" else ("fp32"):
+    for precision in ("bf16", "fp16", "fp32") if args.device != "cpu" else ("fp32",):
         logging.info(
             f"Running Transformer benchmark (pytorch backend) on Llama with precision: {precision}"
         )
         llama_transformers_pytorch_benchmark = LlamaPyTorchBenchmark(
-            model_path=args.model_dir, device=args.device, precision=precision
+            model_path=args.models_dir, device=args.device, precision=precision
         ).load_model()
         llama_transformers_pytorch_benchmark.benchmark(
             max_tokens=args.max_tokens, prompt=args.prompt, repetitions=args.repetitions

From 545e1a676a2fe16827963cf6f732dbfbf7f11196 Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Tue, 5 Dec 2023 14:19:47 +0000
Subject: [PATCH 06/29] fix: model_dir path

---
 bench_pytorch/bench.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 mode change 100644 => 100755 bench_pytorch/bench.sh

diff --git a/bench_pytorch/bench.sh b/bench_pytorch/bench.sh
old mode 100644
new mode 100755
index 37d36003..0518c69a
--- a/bench_pytorch/bench.sh
+++ b/bench_pytorch/bench.sh
@@ -143,7 +143,7 @@ REPETITIONS="${REPETITIONS:-10}"
 MAX_TOKENS="${MAX_TOKENS:-100}"
 DEVICE="${DEVICE:-'cpu'}"
 LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}"
-MODELS_DIR="${MODELS_DIR:-"./models"}"
+MODELS_DIR="${MODELS_DIR:-"./models/llama-2-7b-hf"}"
 
 check_platform
 check_python

From d38f50e8ea9eff4fb3392cf1744ac77e7e926e23 Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Tue, 5 Dec 2023 14:20:28 +0000
Subject: [PATCH 07/29] fix: requirements versioning

---
 bench_pytorch/requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bench_pytorch/requirements.txt b/bench_pytorch/requirements.txt
index 3f76c56b..8068fb56 100644
--- a/bench_pytorch/requirements.txt
+++ b/bench_pytorch/requirements.txt
@@ -1,4 +1,4 @@
 transformers==4.34.1
 torch==2.0.0
-dataset==2.14.6
-accelerate==0.24.1
+dataset
+accelerate

From 769399a5b2a9da63c3d290f11009eed1cc5c706b Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Tue, 5 Dec 2023 14:21:08 +0000
Subject: [PATCH 08/29] minor fixes in setup script

---
 bench_pytorch/setup.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 bench_pytorch/setup.sh

diff --git a/bench_pytorch/setup.sh b/bench_pytorch/setup.sh
old mode 100644
new mode 100755

From 08735ec31cf5ef5486c9ed26759cd1b69a99a2be Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Tue, 5 Dec 2023 14:21:29 +0000
Subject: [PATCH 09/29] added transformers for cuda in fp16/32

---
 docs/llama2.md.template | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/llama2.md.template b/docs/llama2.md.template
index 0a0560b6..4ed84d77 100644
--- a/docs/llama2.md.template
+++ b/docs/llama2.md.template
@@ -8,15 +8,15 @@
 - Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device cuda --prompt 'Explain what is a transformer'`
 
 **Performance Metrics:**
-| Engine               | float32      | float16       | int8          | int4          |
-|----------------------|--------------|---------------|---------------|---------------|
-| burn                 | 13.12 ± 0.85 |      -        |      -        |      -        |
-| candle               |      -       | 36.78 ± 2.17  |      -        |      -        |
-| llama.cpp            |      -       |      -        | 84.48 ± 3.76  | 106.76 ± 1.29 |
-| ctranslate           |      -       | 51.38 ± 16.01 | 36.12 ± 11.93 |      -        |
-| tinygrad             |      -       | 20.32 ± 0.06  |      -        |      -        |
-| onnx                 |      -       | 54.16 ± 3.15  |      -        |      -        |
-| ctransformers        |      -       |      -        | 81.61 ± 3.66  | 84.51 ± 7.93  |
+| Engine                       | float32      | float16       | int8          | int4          |
+|------------------------------|--------------|---------------|---------------|---------------|
+| burn                         | 13.12 ± 0.85 |      -        |      -        |      -        |
+| candle                       |      -       | 36.78 ± 2.17  |      -        |      -        |
+| llama.cpp                    |      -       |      -        | 84.48 ± 3.76  | 106.76 ± 1.29 |
+| ctranslate                   |      -       | 51.38 ± 16.01 | 36.12 ± 11.93 |      -        |
+| tinygrad                     |      -       | 20.32 ± 0.06  |      -        |      -        |
+| onnx                         |      -       | 54.16 ± 3.15  |      -        |      -        |
+| transformers (pytorch)       | 0.44 ± 0.44  | 0.44 ± 0.44   |      -        |      -        |
 
 *(Data updated: `<LAST_UPDATE>`)
 

From 027530af09afaa338aa1a2873bf2a901a8b2ec4d Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Wed, 6 Dec 2023 10:37:59 +0000
Subject: [PATCH 10/29] added llama2 results as none for mac devices

---
 docs/llama2.md.template | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/docs/llama2.md.template b/docs/llama2.md.template
index 4ed84d77..fe9eb667 100644
--- a/docs/llama2.md.template
+++ b/docs/llama2.md.template
@@ -31,29 +31,31 @@
 - Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device cpu --prompt 'Explain what is a transformer'`
 
 **Performance Metrics:**
-| Engine               | float32      | float16      | int8         | int4         |
-|----------------------|--------------|--------------|--------------|--------------|
-| burn                 | 0.30 ± 0.09  |      -       |      -       |      -       |
-| candle               |      -       | 3.43 ± 0.02  |      -       |      -       |
-| llama.cpp            |      -       |      -       | 14.41 ± 1.59 | 20.96 ± 1.94 |
-| ctranslate           |      -       |      -       | 2.11 ± 0.73  |      -       |
-| tinygrad             |      -       | 4.21 ± 0.38  |      -       |      -       |
-| onnx                 |      -       |      -       |      -       |      -       |
-| ctransformers        |      -       |      -       | 13.79 ± 0.50 | 22.93 ± 0.86 |
+| Engine                | float32      | float16      | int8         | int4         |
+|-----------------------|--------------|--------------|--------------|--------------|
+| burn                  | 0.30 ± 0.09  |      -       |      -       |      -       |
+| candle                |      -       | 3.43 ± 0.02  |      -       |      -       |
+| llama.cpp             |      -       |      -       | 14.41 ± 1.59 | 20.96 ± 1.94 |
+| ctranslate            |      -       |      -       | 2.11 ± 0.73  |      -       |
+| tinygrad              |      -       | 4.21 ± 0.38  |      -       |      -       |
+| onnx                  |      -       |      -       |      -       |      -       |
+| ctransformers         |      -       |      -       | 13.79 ± 0.50 | 22.93 ± 0.86 |
+| transformers (pytorch)|      -       |      -       |      -       |      -       |
 
 ### GPU (Metal)
 
 **Command:** `./benchmark.sh --repetitions 10 --max_tokens 100 --device metal --prompt 'Explain what is a transformer'`
 
 **Performance Metrics:**
-| Engine               | float32      | float16       | int8         | int4         |
-|----------------------|--------------|---------------|--------------|--------------|
-| burn                 |      -       |      -        |      -       |      -       |
-| candle               |      -       |      -        |      -       |      -       |
-| llama.cpp            |      -       |      -        | 31.24 ± 7.82 | 46.75 ± 9.55 |
-| ctranslate           |      -       |      -        |      -       |      -       |
-| tinygrad             |      -       | 29.78 ± 1.18  |      -       |      -       |
-| onnx                 |      -       |      -        |      -       |      -       |
-| ctransformers        |      -       |      -        | 21.24 ± 0.81 | 34.08 ± 4.78 |
+| Engine                | float32      | float16       | int8         | int4         |
+|-----------------------|--------------|---------------|--------------|--------------|
+| burn                  |      -       |      -        |      -       |      -       |
+| candle                |      -       |      -        |      -       |      -       |
+| llama.cpp             |      -       |      -        | 31.24 ± 7.82 | 46.75 ± 9.55 |
+| ctranslate            |      -       |      -        |      -       |      -       |
+| tinygrad              |      -       | 29.78 ± 1.18  |      -       |      -       |
+| onnx                  |      -       |      -        |      -       |      -       |
+| ctransformers         |      -       |      -        | 21.24 ± 0.81 | 34.08 ± 4.78 |
+| transformers (pytorch)|      -       |      -       |      -       |      -       |
 
 *(Data updated: `<LAST_UPDATE>`)

From b61612227f6c6e10014b440afa0e9e4accf00277 Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Wed, 6 Dec 2023 10:38:48 +0000
Subject: [PATCH 11/29] added a note on benchmarking condition on mac using
 transformers (pytorch)

---
 docs/llama2.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/llama2.md b/docs/llama2.md
index 1128add4..3963bc8c 100644
--- a/docs/llama2.md
+++ b/docs/llama2.md
@@ -57,3 +57,5 @@
 | ctransformers        |      -       |      -        | 21.24 ± 0.81 | 34.08 ± 4.78 |
 
 *(Data updated: `02th December 2023`)
+
+*Note: Benchmarking PyTorch transformers on Mac is possible, but we are not doing it because it is very time-consuming, which makes the results much less significant.

From 96c87c516cb2bca98aee2dea060370b7a7d3fa02 Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Wed, 6 Dec 2023 22:50:15 +0530
Subject: [PATCH 12/29] fix: change device from mps to metal.

Co-authored-by: Nicola Sosio <sosio.nicola94@tiscali.it>
---
 bench_pytorch/bench.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bench_pytorch/bench.py b/bench_pytorch/bench.py
index e41c814b..9c78682a 100644
--- a/bench_pytorch/bench.py
+++ b/bench_pytorch/bench.py
@@ -34,7 +34,7 @@ def __init__(
         assert precision in ["bf16", "fp16", "fp32"], ValueError(
             "Supported precisions are: 'bf16', fp16', 'fp32'"
         )
-        assert device in ["cpu", "cuda", "mps"], ValueError(
+        assert device in ["cpu", "cuda", "metal"], ValueError(
             "Supported devices are: 'cpu', 'cuda', 'mps'"
         )
 

From 6794c1ad93d18d842bf95a1c1a0a6dad875a56b9 Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Wed, 6 Dec 2023 22:51:13 +0530
Subject: [PATCH 13/29] change default value from cuda to cpu

Co-authored-by: Nicola Sosio <sosio.nicola94@tiscali.it>
---
 bench_pytorch/bench.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bench_pytorch/bench.py b/bench_pytorch/bench.py
index 9c78682a..e0c89d76 100644
--- a/bench_pytorch/bench.py
+++ b/bench_pytorch/bench.py
@@ -19,7 +19,7 @@
 
 class LlamaPyTorchBenchmark:
     def __init__(
-        self, model_path: str, precision: str, device: Optional[str] = "cuda"
+        self, model_path: str, precision: str, device: Optional[str] = "cpu"
     ) -> None:
         self.model_path = model_path
         self.precision = precision

From ea405adaf7f95524c5f0e7a73ac1f153a0af6160 Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Wed, 6 Dec 2023 22:52:03 +0530
Subject: [PATCH 14/29] fix: starting timer after tokenizer.

Co-authored-by: Nicola Sosio <sosio.nicola94@tiscali.it>
---
 bench_pytorch/bench.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bench_pytorch/bench.py b/bench_pytorch/bench.py
index e0c89d76..13414ead 100644
--- a/bench_pytorch/bench.py
+++ b/bench_pytorch/bench.py
@@ -59,10 +59,10 @@ def load_model(self):
         return self
 
     def run_model(self, prompt: str, max_tokens: int) -> float:
-        start = time.time()
         tokenized_input = self.tokenizer.encode(prompt, return_tensors="pt").to(
             self.device
         )
+        start = time.time()
         output = (
             self.model.generate(input_ids=tokenized_input, max_new_tokens=max_tokens)
             .detach()

From ec8663f9de2f753eb2cf9adcdb54be83cac5f805 Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Thu, 7 Dec 2023 12:04:48 +0000
Subject: [PATCH 15/29] removed device argument, not required

---
 bench_pytorch/setup.sh | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/bench_pytorch/setup.sh b/bench_pytorch/setup.sh
index 563e0815..6927184c 100755
--- a/bench_pytorch/setup.sh
+++ b/bench_pytorch/setup.sh
@@ -9,12 +9,6 @@
 set -euo pipefail
 
 # Main script starts here.
-
-if [ "$#" -ne 1 ]; then
-    echo "Usage: $0 <DEVICE>"
-    exit 1
-fi
-
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 VENV_DIR="$SCRIPT_DIR/venv"
 

From 9f7b6ac4529354a21f7aac3b73e7c10b29f8de59 Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Thu, 7 Dec 2023 12:06:11 +0000
Subject: [PATCH 16/29] removed using device as argument for installation

---
 bench_pytorch/bench.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bench_pytorch/bench.sh b/bench_pytorch/bench.sh
index 0518c69a..2fa0c702 100755
--- a/bench_pytorch/bench.sh
+++ b/bench_pytorch/bench.sh
@@ -68,7 +68,7 @@ check_python() {
 
 setup() {
     echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..."
-    bash "$SCRIPT_DIR"/setup.sh "$1"
+    bash "$SCRIPT_DIR"/setup.sh
 }
 
 run_benchmarks() {
@@ -147,5 +147,5 @@ MODELS_DIR="${MODELS_DIR:-"./models/llama-2-7b-hf"}"
 
 check_platform
 check_python
-setup "$DEVICE"
+setup
 run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$LOG_FILENAME" "$MODELS_DIR"

From b449e86ad2c0e989af542040fd6c945a0db32a2c Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Thu, 7 Dec 2023 12:08:01 +0000
Subject: [PATCH 17/29] changed python3 to python

---
 bench_pytorch/setup.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bench_pytorch/setup.sh b/bench_pytorch/setup.sh
index 6927184c..2ed48ba0 100755
--- a/bench_pytorch/setup.sh
+++ b/bench_pytorch/setup.sh
@@ -13,7 +13,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 VENV_DIR="$SCRIPT_DIR/venv"
 
 if [ ! -d "$VENV_DIR" ]; then
-    python3 -m venv "$VENV_DIR"
+    python -m venv "$VENV_DIR"
     echo "Virtual environment '$VENV_DIR' created."
     # shellcheck disable=SC1091
     source "$VENV_DIR/bin/activate"

From df607ca52c85d7e6f10e3ed19740150240db9ab1 Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Thu, 7 Dec 2023 12:10:50 +0000
Subject: [PATCH 18/29] torch synchronize only if device set to cuda

---
 bench_pytorch/bench.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bench_pytorch/bench.py b/bench_pytorch/bench.py
index 13414ead..0f022905 100644
--- a/bench_pytorch/bench.py
+++ b/bench_pytorch/bench.py
@@ -80,7 +80,8 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
             tokens_per_second = self.run_model(prompt, max_tokens)
             self.results.append(tokens_per_second)
         del self.model
-        torch.cuda.synchronize()
+        if self.device == "cuda":
+            torch.cuda.synchronize()
 
 
 if __name__ == "__main__":

From 27bb50b3b409372195bdc93228ffb89415b61660 Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Thu, 7 Dec 2023 12:20:49 +0000
Subject: [PATCH 19/29] removed bf16 benchmarking

---
 bench_pytorch/bench.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bench_pytorch/bench.py b/bench_pytorch/bench.py
index 0f022905..08145800 100644
--- a/bench_pytorch/bench.py
+++ b/bench_pytorch/bench.py
@@ -118,7 +118,7 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
     )
     report = defaultdict(lambda: defaultdict(float))
 
-    for precision in ("bf16", "fp16", "fp32") if args.device != "cpu" else ("fp32",):
+    for precision in ("fp16", "fp32") if args.device != "cpu" else ("fp32",):
         logging.info(
             f"Running Transformer benchmark (pytorch backend) on Llama with precision: {precision}"
         )

From 91946d4c2d97fd448c04e8cfe84fad7b579f05e8 Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Thu, 7 Dec 2023 12:21:11 +0000
Subject: [PATCH 20/29] removed datasets in requirements

---
 bench_pytorch/requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bench_pytorch/requirements.txt b/bench_pytorch/requirements.txt
index 8068fb56..613d23dd 100644
--- a/bench_pytorch/requirements.txt
+++ b/bench_pytorch/requirements.txt
@@ -1,4 +1,3 @@
 transformers==4.34.1
 torch==2.0.0
-dataset
 accelerate

From eded8920afdc295c11142c18f1a4dca5e9141a30 Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Thu, 7 Dec 2023 12:21:35 +0000
Subject: [PATCH 21/29] updated the benchmarking results for fp16

---
 docs/llama2.md.template | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/llama2.md.template b/docs/llama2.md.template
index fe9eb667..1e007744 100644
--- a/docs/llama2.md.template
+++ b/docs/llama2.md.template
@@ -16,7 +16,7 @@
 | ctranslate                   |      -       | 51.38 ± 16.01 | 36.12 ± 11.93 |      -        |
 | tinygrad                     |      -       | 20.32 ± 0.06  |      -        |      -        |
 | onnx                         |      -       | 54.16 ± 3.15  |      -        |      -        |
-| transformers (pytorch)       | 0.44 ± 0.44  | 0.44 ± 0.44   |      -        |      -        |
+| transformers (pytorch)       | 0.40 ± 0.40  | 0.37 ± 0.37   |      -        |      -        |
 
 *(Data updated: `<LAST_UPDATE>`)
 

From aa488f9957e77530edb8dbe9c3318d348cc56d48 Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Thu, 7 Dec 2023 22:24:54 +0530
Subject: [PATCH 22/29] fix: output to tuple

Co-authored-by: Nicola Sosio <sosio.nicola94@tiscali.it>
---
 bench_pytorch/bench.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bench_pytorch/bench.py b/bench_pytorch/bench.py
index 08145800..50700558 100644
--- a/bench_pytorch/bench.py
+++ b/bench_pytorch/bench.py
@@ -70,7 +70,7 @@ def run_model(self, prompt: str, max_tokens: int) -> float:
             .numpy()
         )
         delta = time.time() - start
-        return len(output) / delta
+        return len(output[0]) / delta
 
     def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
         for i in range(repetitions):

From c5d84548601e5c5ffdfd8ae0d443d43a9c029ef8 Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Thu, 7 Dec 2023 22:25:36 +0530
Subject: [PATCH 23/29] remove <DEVICE>

Co-authored-by: Nicola Sosio <sosio.nicola94@tiscali.it>
---
 bench_pytorch/setup.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bench_pytorch/setup.sh b/bench_pytorch/setup.sh
index 2ed48ba0..6c231a5f 100755
--- a/bench_pytorch/setup.sh
+++ b/bench_pytorch/setup.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 ################################################################################
-# Script: setup.sh <DEVICE>
+# Script: setup.sh
 # Description: Automates the setup of a virtual environment and installs project
 # requirements.
 ################################################################################

From f2cde7c72801edb8d0f713d1ff82be0ba66f567e Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Thu, 7 Dec 2023 22:25:49 +0530
Subject: [PATCH 24/29] fix: change device from mps to metal.

Co-authored-by: Nicola Sosio <sosio.nicola94@tiscali.it>
---
 bench_pytorch/bench.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bench_pytorch/bench.py b/bench_pytorch/bench.py
index 50700558..31a37d28 100644
--- a/bench_pytorch/bench.py
+++ b/bench_pytorch/bench.py
@@ -35,7 +35,7 @@ def __init__(
             "Supported precisions are: 'bf16', fp16', 'fp32'"
         )
         assert device in ["cpu", "cuda", "metal"], ValueError(
-            "Supported devices are: 'cpu', 'cuda', 'mps'"
+            "Supported devices are: 'cpu', 'cuda', 'metal'"
         )
 
         if device == "cpu" and precision != "fp32":

From 9c19b860620f0e49bbe3d6cb8f2b7d8c21c9973d Mon Sep 17 00:00:00 2001
From: Anindyadeep <anindya@pop-os.localdomain>
Date: Fri, 8 Dec 2023 00:23:45 +0530
Subject: [PATCH 25/29] Refactor: Change in models dir path.

- in bench.sh: changed from models/llama2-7b-hf to models.
- in bench.py: take the models_dir and add llama2-7b-hf by default.
---
 bench_pytorch/bench.py | 4 +++-
 bench_pytorch/bench.sh | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/bench_pytorch/bench.py b/bench_pytorch/bench.py
index 31a37d28..9aeb306e 100644
--- a/bench_pytorch/bench.py
+++ b/bench_pytorch/bench.py
@@ -123,7 +123,9 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
             f"Running Transformer benchmark (pytorch backend) on Llama with precision: {precision}"
         )
         llama_transformers_pytorch_benchmark = LlamaPyTorchBenchmark(
-            model_path=args.models_dir, device=args.device, precision=precision
+            model_path=f"{args.models_dir}/llama2-7b-hf",
+            device=args.device,
+            precision=precision,
         ).load_model()
         llama_transformers_pytorch_benchmark.benchmark(
             max_tokens=args.max_tokens, prompt=args.prompt, repetitions=args.repetitions
diff --git a/bench_pytorch/bench.sh b/bench_pytorch/bench.sh
index 2fa0c702..cfd10d13 100755
--- a/bench_pytorch/bench.sh
+++ b/bench_pytorch/bench.sh
@@ -143,7 +143,7 @@ REPETITIONS="${REPETITIONS:-10}"
 MAX_TOKENS="${MAX_TOKENS:-100}"
 DEVICE="${DEVICE:-'cpu'}"
 LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}"
-MODELS_DIR="${MODELS_DIR:-"./models/llama-2-7b-hf"}"
+MODELS_DIR="${MODELS_DIR:-"./models/"}"
 
 check_platform
 check_python

From d1d18278903055512f51e43eecb037babf037d4b Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Fri, 8 Dec 2023 07:10:13 +0000
Subject: [PATCH 26/29] fix typo in path name

---
 bench_pytorch/bench.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bench_pytorch/bench.py b/bench_pytorch/bench.py
index 9aeb306e..9ae119ba 100644
--- a/bench_pytorch/bench.py
+++ b/bench_pytorch/bench.py
@@ -123,7 +123,7 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
             f"Running Transformer benchmark (pytorch backend) on Llama with precision: {precision}"
         )
         llama_transformers_pytorch_benchmark = LlamaPyTorchBenchmark(
-            model_path=f"{args.models_dir}/llama2-7b-hf",
+            model_path=f"{args.models_dir}/llama-2-7b-hf",
             device=args.device,
             precision=precision,
         ).load_model()

From ac503db8c0d4d20d92c3f8fb1077fcace7334be2 Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Fri, 8 Dec 2023 07:11:00 +0000
Subject: [PATCH 27/29] fix: models dir path

---
 bench_pytorch/bench.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bench_pytorch/bench.sh b/bench_pytorch/bench.sh
index cfd10d13..876793af 100755
--- a/bench_pytorch/bench.sh
+++ b/bench_pytorch/bench.sh
@@ -143,7 +143,7 @@ REPETITIONS="${REPETITIONS:-10}"
 MAX_TOKENS="${MAX_TOKENS:-100}"
 DEVICE="${DEVICE:-'cpu'}"
 LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}"
-MODELS_DIR="${MODELS_DIR:-"./models/"}"
+MODELS_DIR="${MODELS_DIR:-"./models"}"
 
 check_platform
 check_python

From cbbb3f25a9e6236ed7949d16323a6b45e8174923 Mon Sep 17 00:00:00 2001
From: Anindyadeep <proanindyadeep@gmail.com>
Date: Fri, 8 Dec 2023 07:14:12 +0000
Subject: [PATCH 28/29] fixed the benchmark scores for pytorch transformers gpu

---
 docs/llama2.md.template | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/llama2.md.template b/docs/llama2.md.template
index 1e007744..ef8358ac 100644
--- a/docs/llama2.md.template
+++ b/docs/llama2.md.template
@@ -16,7 +16,7 @@
 | ctranslate                   |      -       | 51.38 ± 16.01 | 36.12 ± 11.93 |      -        |
 | tinygrad                     |      -       | 20.32 ± 0.06  |      -        |      -        |
 | onnx                         |      -       | 54.16 ± 3.15  |      -        |      -        |
-| transformers (pytorch)       | 0.40 ± 0.40  | 0.37 ± 0.37   |      -        |      -        |
+| transformers (pytorch)       | 46.44 ± 46.44| 42.56 ± 42.56 |      -        |      -        |
 
 *(Data updated: `<LAST_UPDATE>`)
 

From db18c556b9c4599f7d0a56b6771862ebee6a5215 Mon Sep 17 00:00:00 2001
From: Anindyadeep <anindya@pop-os.localdomain>
Date: Fri, 8 Dec 2023 14:27:20 +0530
Subject: [PATCH 29/29] moved pytorch mac benchmarking note from llama2.md to
 llama2.md.template

---
 docs/llama2.md          | 2 --
 docs/llama2.md.template | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/llama2.md b/docs/llama2.md
index 3963bc8c..1128add4 100644
--- a/docs/llama2.md
+++ b/docs/llama2.md
@@ -57,5 +57,3 @@
 | ctransformers        |      -       |      -        | 21.24 ± 0.81 | 34.08 ± 4.78 |
 
 *(Data updated: `02th December 2023`)
-
-*Note: Although benchmarking for pytorch transformers on mac is possible. But, we are not doing it, since it is very much time taking, and so makes it very less significant.
diff --git a/docs/llama2.md.template b/docs/llama2.md.template
index 1e007744..c72c4b39 100644
--- a/docs/llama2.md.template
+++ b/docs/llama2.md.template
@@ -59,3 +59,5 @@
 | transformers (pytorch)|      -       |      -       |      -       |      -       |
 
 *(Data updated: `<LAST_UPDATE>`)
+
+*Note: Although benchmarking pytorch transformers on Mac is possible, we are not doing it because it is very time-consuming, which makes the results much less significant.