From 8e0aedf3fff690fad74e4c0d194ba792a27b981b Mon Sep 17 00:00:00 2001 From: nsosio Date: Thu, 16 Nov 2023 12:15:18 +0100 Subject: [PATCH 1/5] added linter --- .pre-commit-config.yaml | 43 +++++++++++ NOTES.md | 2 +- README.md | 2 +- bench.py | 11 +-- benchmark.sh | 10 +-- convert_to_safetensors.py | 7 +- download.sh | 6 +- models.json | 2 +- python_bench/benchmark.py | 3 +- python_bench/ctranslate.py | 2 +- python_bench/llama_cpp.py | 6 +- python_bench/tinygrad.py | 51 ++++++------- requirements.txt | 4 +- rust_bench/llama2-burn/README.md | 8 +- rust_bench/llama2-burn/llama-py/dump.py | 72 ++++++++++-------- rust_bench/llama2-burn/llama-py/dump_model.py | 72 ++++++++++++------ rust_bench/llama2-burn/llama-py/dump_test.py | 26 ++++--- rust_bench/llama2-burn/llama-py/model.py | 49 +++++-------- .../llama2-burn/llama-py/requirements.txt | 2 +- rust_bench/llama2-burn/llama-py/test.py | 73 +++++++++++-------- .../llama2-burn/llama-py/test_tokenizer.py | 4 +- rust_bench/llama2-burn/llama-py/tokenizer.py | 14 ++-- rust_bench/llama2-candle/Cargo.toml | 2 +- setup.cfg | 3 + setup.sh | 4 +- 25 files changed, 286 insertions(+), 192 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 setup.cfg diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..fc065c44 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,43 @@ +default_stages: [commit] + +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: end-of-file-fixer + - id: check-toml + - id: check-xml + - id: debug-statements + - id: check-builtin-literals + - id: check-case-conflict + + - repo: https://github.com/asottile/pyupgrade + rev: v3.15.0 + hooks: + - id: pyupgrade + args: [--py311-plus] + + - repo: https://github.com/psf/black + rev: 23.11.0 + hooks: + - id: black + + - repo: https://github.com/PyCQA/isort + rev: 5.12.0 + hooks: + - id: isort + + - repo: https://github.com/PyCQA/flake8 + rev: 6.1.0 + hooks: + - id: flake8 + args: ["--config=setup.cfg"] + additional_dependencies: [flake8-isort] + +ci: + autoupdate_schedule: weekly + skip: [] + submodules: false diff --git a/NOTES.md b/NOTES.md index 92a1ac6f..c04443ee 100644 --- a/NOTES.md +++ b/NOTES.md @@ -14,7 +14,7 @@ Currently working on requirement and understanding different project constraints ## Early Investigation The overall investigation assumes PyTorch as the performance base and all relevant understand should be built in context of that, -the specific benchmark might not be directly comparable to each other but it should provide a rough picture of the state of +the specific benchmark might not be directly comparable to each other but it should provide a rough picture of the state of open source ML framework performance. This generally is to port, add and support new things into burn and other platforms. 
diff --git a/README.md b/README.md index 77294d97..6e12d2cf 100644 --- a/README.md +++ b/README.md @@ -115,4 +115,4 @@ Command: `./benchmark.sh --repetitions 10 --max_tokens 100 --device gpu --prompt | ctranslate | - | - | - | - | | tinygrad | - | 29.78 ± 1.18 | - | - | -*(data updated: 15th November 2023) \ No newline at end of file +*(data updated: 15th November 2023) diff --git a/bench.py b/bench.py index ee9ceb3b..ca316af5 100644 --- a/bench.py +++ b/bench.py @@ -1,12 +1,12 @@ import argparse -from collections import defaultdict import logging import sys +from collections import defaultdict import numpy as np -from python_bench.llama_cpp import LlamaCPPBenchmark from python_bench.ctranslate import CTranslateBenchmark, get_compute_types +from python_bench.llama_cpp import LlamaCPPBenchmark from python_bench.tinygrad import TinyGradBenchmark logging.basicConfig( @@ -53,7 +53,8 @@ args = parser.parse_args() logging.info( - f"Running benchmark with: max_tokens={args.max_tokens} prompt={args.prompt} repetitions={args.repetitions} gpu={args.gpu} nvidia={args.gpu}" + f"Running benchmark with: max_tokens={args.max_tokens} prompt={args.prompt} " + + f"repetitions={args.repetitions} gpu={args.gpu} nvidia={args.gpu}" ) report = defaultdict(lambda: defaultdict(float)) for quantize in ("Q8_0", "Q4_0"): @@ -74,7 +75,7 @@ for compute_type in compute_types.intersection({"float16", "int8"}): logging.info(f"Running ctranslate benchmark with {compute_type}") ctranslate_bench = CTranslateBenchmark( - f"./models/llama-2-7b-hf-float16", + "./models/llama-2-7b-hf-float16", gpu=args.gpu, compute_type=compute_type, ).load_model() @@ -86,7 +87,7 @@ "std": np.std(ctranslate_bench.results), } - logging.info(f"Running tinygrad benchmark") + logging.info("Running tinygrad benchmark") tinygrad_bench = TinyGradBenchmark( "./models/llama-2-7b-hf", quantize=False, diff --git a/benchmark.sh b/benchmark.sh index 2e5edb18..d2ca1d3c 100755 --- a/benchmark.sh +++ b/benchmark.sh @@ -2,8 +2,8 @@ ############################################################################################## # Script: run_benchmarks.sh -# Description: This script runs benchmarks for a transformer model using both -# Rust and Python implementations. It provides options to customize the +# Description: This script runs benchmarks for a transformer model using both +# Rust and Python implementations. It provides options to customize the # benchmarks, such as the prompt, repetitions, maximum tokens, device, and NVIDIA flag. 
# # Usage: ./run_benchmarks.sh [OPTIONS] @@ -150,8 +150,8 @@ run_benchmarks() { --prompt "$PROMPT" \ --sample-len $MAX_TOKENS \ --log-file $LOG_FILENAME - fi - + fi + # Set options based on $DEVICE and $USE_NVIDIA [ "$DEVICE" == "gpu" ] && PYTHON_DEVICE="--gpu" [ "$USE_NVIDIA" == true ] && PYTHON_NVIDIA="--nvidia" @@ -235,4 +235,4 @@ check_rust check_jq download_models setup -run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" $USE_NVIDIA "$log_filename" \ No newline at end of file +run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" $USE_NVIDIA "$log_filename" diff --git a/convert_to_safetensors.py b/convert_to_safetensors.py index a4ff3e28..6cd2fddf 100644 --- a/convert_to_safetensors.py +++ b/convert_to_safetensors.py @@ -1,9 +1,8 @@ import argparse -import os import logging -from collections import defaultdict -from typing import List +import os import shutil +from collections import defaultdict import torch from safetensors.torch import load_file, save_file @@ -80,7 +79,7 @@ def convert_file(pt_filename: str, sf_filename: str): raise RuntimeError(f"The output tensors do not match for key {k}") -def convert_multi(input_dir: str, output_dir: str) -> List[str]: +def convert_multi(input_dir: str, output_dir: str) -> list[str]: if os.path.exists(output_dir): logging.warning(f"{output_dir} already exists!") return [] diff --git a/download.sh b/download.sh index b12b5018..85d31847 100755 --- a/download.sh +++ b/download.sh @@ -2,9 +2,9 @@ ################################################################################ # Script: download.sh -# Description: Downloads files from a list of URLs specified in a JSON file. -# The JSON file should contain an array of objects, each with a 'url', 'file', -# and 'folder' property. The script checks if the file already exists before +# Description: Downloads files from a list of URLs specified in a JSON file. +# The JSON file should contain an array of objects, each with a 'url', 'file', +# and 'folder' property. The script checks if the file already exists before # downloading it. 
# # Usage: ./download.sh --models --cache --force-download diff --git a/models.json b/models.json index da726b39..8058fc09 100644 --- a/models.json +++ b/models.json @@ -19,4 +19,4 @@ "file": "llama-2-7b-raw.zip", "folder": "./models/llama-2-7b-raw" } -] \ No newline at end of file +] diff --git a/python_bench/benchmark.py b/python_bench/benchmark.py index ff808bbe..efbbeba2 100644 --- a/python_bench/benchmark.py +++ b/python_bench/benchmark.py @@ -1,6 +1,7 @@ from __future__ import annotations -from abc import ABC, abstractmethod + import logging +from abc import ABC, abstractmethod logger = logging.getLogger(__name__) diff --git a/python_bench/ctranslate.py b/python_bench/ctranslate.py index 5dce718a..37cc10b3 100644 --- a/python_bench/ctranslate.py +++ b/python_bench/ctranslate.py @@ -1,5 +1,5 @@ -import os import logging +import os import time import ctranslate2 diff --git a/python_bench/llama_cpp.py b/python_bench/llama_cpp.py index e5d72e3e..f7c1365c 100644 --- a/python_bench/llama_cpp.py +++ b/python_bench/llama_cpp.py @@ -1,8 +1,10 @@ -import time import logging -from python_bench.benchmark import Benchmark +import time + from llama_cpp import Llama +from python_bench.benchmark import Benchmark + logging.getLogger("llama_cpp").setLevel(logging.ERROR) diff --git a/python_bench/tinygrad.py b/python_bench/tinygrad.py index b8c9e666..b238ffdd 100644 --- a/python_bench/tinygrad.py +++ b/python_bench/tinygrad.py @@ -1,21 +1,16 @@ +import json +import logging import os +import time from pathlib import Path -import json + import numpy as np -from typing import Optional, Tuple, Union -from tinygrad.shape.symbolic import Variable -from tinygrad.jit import TinyJit, JIT_SUPPORTED_DEVICE -from tinygrad.nn.state import safe_load, torch_load, load_state_dict +from tinygrad.helpers import CI, dtypes, getenv +from tinygrad.jit import JIT_SUPPORTED_DEVICE, TinyJit from tinygrad.nn import Embedding, Linear +from tinygrad.nn.state import load_state_dict, safe_load, torch_load +from tinygrad.shape.symbolic import Variable from tinygrad.tensor import Tensor -from tinygrad.helpers import getenv, dtypes, CI -from typing import Optional, Tuple -from pathlib import Path -import json -import time -import numpy as np -from pathlib import Path -import logging from python_bench.benchmark import Benchmark @@ -43,7 +38,7 @@ def complex_mult(A, c, d): return ro.cat(co, dim=-1) -def apply_rotary_emb(xq, xk, freqs_cis) -> Tuple[Tensor, Tensor]: +def apply_rotary_emb(xq, xk, freqs_cis) -> tuple[Tensor, Tensor]: assert ( freqs_cis.shape[1] == xq.shape[1] and freqs_cis.shape[1] == xk.shape[1] ), f"freqs_cis shape mismatch {freqs_cis.shape} xq:{xq.shape} xk:{xk.shape}" @@ -95,9 +90,9 @@ def __init__(self, dim, n_heads, n_kv_heads, linear=Linear): def __call__( self, x: Tensor, - start_pos: Union[Variable, int], + start_pos: Variable | int, freqs_cis: Tensor, - mask: Optional[Tensor], + mask: Tensor | None, ) -> Tensor: xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) xq = xq.reshape(xq.shape[0], xq.shape[1], self.n_heads, self.head_dim) @@ -181,9 +176,9 @@ def __init__( def __call__( self, x: Tensor, - start_pos: Union[Variable, int], + start_pos: Variable | int, freqs_cis: Tensor, - mask: Optional[Tensor], + mask: Tensor | None, ): h = x + self.attention(self.attention_norm(x), start_pos, freqs_cis, mask) return (h + self.feed_forward(self.ffn_norm(h))).realize() @@ -228,7 +223,7 @@ def __init__( self.forward_jit = TinyJit(self.forward) def forward( - self, tokens: Tensor, start_pos: Union[Variable, int], temperature: 
float = 0.0 + self, tokens: Tensor, start_pos: Variable | int, temperature: float = 0.0 ): _bsz, seqlen = tokens.shape freqs_cis = self.freqs_cis.shrink( @@ -513,22 +508,22 @@ def convert_from_huggingface(weights, model): keymap = { "model.embed_tokens.weight": "tok_embeddings.weight", **{ - f"model.layers.{l}.input_layernorm.weight": f"layers.{l}.attention_norm.weight" - for l in range(len(model.layers)) + f"model.layers.{layer}.input_layernorm.weight": f"layers.{layer}.attention_norm.weight" + for layer in range(len(model.layers)) }, **{ - f"model.layers.{l}.self_attn.{x}_proj.weight": f"layers.{l}.attention.w{x}.weight" + f"model.layers.{layer}.self_attn.{x}_proj.weight": f"layers.{layer}.attention.w{x}.weight" for x in ["q", "k", "v", "o"] - for l in range(len(model.layers)) + for layer in range(len(model.layers)) }, **{ - f"model.layers.{l}.post_attention_layernorm.weight": f"layers.{l}.ffn_norm.weight" - for l in range(len(model.layers)) + f"model.layers.{layer}.post_attention_layernorm.weight": f"layers.{layer}.ffn_norm.weight" + for layer in range(len(model.layers)) }, **{ - f"model.layers.{l}.mlp.{x}_proj.weight": f"layers.{l}.feed_forward.w{y}.weight" + f"model.layers.{layer}.mlp.{x}_proj.weight": f"layers.{layer}.feed_forward.w{y}.weight" for x, y in {"gate": "1", "down": "2", "up": "3"}.items() - for l in range(len(model.layers)) + for layer in range(len(model.layers)) }, "model.norm.weight": "norm.weight", "lm_head.weight": "output.weight", @@ -538,7 +533,7 @@ def convert_from_huggingface(weights, model): class AbsmaxQuantizedLinear: def __init__(self, in_features, out_features, bias=False): - assert bias == False + assert not bias self.weight = Tensor.ones(out_features, in_features, dtype=dtypes.int8) self.scale = Tensor.ones(out_features, dtype=dtypes.half) diff --git a/requirements.txt b/requirements.txt index 1526cdbd..b37ebfcc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ llama_cpp_python==0.2.15 sentencepiece==0.1.99 ctranslate2==3.20.0 -huggingface-hub==0.17.3 +huggingface-hub==0.17.3 transformers==4.35.0 torch==2.1.0 # Using fixed commit (a72b3700) for tinygrad to ensure stability in benchmarking. # Helps maintain reproducibility and guards against potential breaking changes. -git+https://github.com/tinygrad/tinygrad.git@a72b370066837af5b4d44eeb5c4fb30aebf5c502 \ No newline at end of file +git+https://github.com/tinygrad/tinygrad.git@a72b370066837af5b4d44eeb5c4fb30aebf5c502 diff --git a/rust_bench/llama2-burn/README.md b/rust_bench/llama2-burn/README.md index 77a3b5ef..09448b06 100644 --- a/rust_bench/llama2-burn/README.md +++ b/rust_bench/llama2-burn/README.md @@ -61,7 +61,7 @@ python3 dump_model.py ``` Example: `python3 dump_model.py llama2-7b-chat tokenizer.model` -3. **Test the Tokenizer**: Finally, run the `test_tokenizer.py` script to load the tokenizer.model file and verify an example encoding and decoding. This script should be run in the same directory as the tokenizer file. Execute this script using the command: +3. **Test the Tokenizer**: Finally, run the `test_tokenizer.py` script to load the tokenizer.model file and verify an example encoding and decoding. This script should be run in the same directory as the tokenizer file. Execute this script using the command: ``` python3 test_tokenizer.py ``` @@ -70,7 +70,7 @@ python3 test_tokenizer.py Inside the 'src/bin' folder, you will find Rust binaries: `convert`, `sample`, and `test`. -1. **Converting Dumped Weights**: The `convert` binary converts dumped weights into burn's model format. 
It saves them for further use. Execute this using the following command: +1. **Converting Dumped Weights**: The `convert` binary converts dumped weights into burn's model format. It saves them for further use. Execute this using the following command: ``` cargo run --bin convert ``` @@ -82,11 +82,11 @@ cargo run --bin test ``` Example: `cargo run --release --bin test tokenizer.model params` -3. **Sampling Text**: The `sample` binary loads the converted burn model file and generates a sample output based on an input prompt. The model can run on either the cpu or gpu. Execute this using the following command: +3. **Sampling Text**: The `sample` binary loads the converted burn model file and generates a sample output based on an input prompt. The model can run on either the cpu or gpu. Execute this using the following command: ``` cargo run --bin sample ``` -Example: +Example: ``` #export TORCH_CUDA_VERSION=cu113 # if running on gpu cargo run --release --bin sample llama2-7b-chat tokenizer.model "Hello, I am " 10 cpu diff --git a/rust_bench/llama2-burn/llama-py/dump.py b/rust_bench/llama2-burn/llama-py/dump.py index 072cd1b2..f758084e 100644 --- a/rust_bench/llama2-burn/llama-py/dump.py +++ b/rust_bench/llama2-burn/llama-py/dump.py @@ -1,73 +1,83 @@ import pathlib -import torch + import numpy as np +import torch -import model def save_scalar(s, name, path): s = np.array([1.0, float(s)]).astype(np.float32) - np.save(pathlib.Path(path, f'{name}.npy'), s) + np.save(pathlib.Path(path, f"{name}.npy"), s) + def save_tensor(tensor, name, path): tensor_numpy = tensor.numpy() tensor_dims = np.array(tensor_numpy.shape) tensor_values = tensor_numpy.flatten() tensor_to_save = np.concatenate((tensor_dims, tensor_values)).astype(np.float32) - np.save(pathlib.Path(path, f'{name}.npy'), tensor_to_save) + np.save(pathlib.Path(path, f"{name}.npy"), tensor_to_save) + def save_linear(linear, path): pathlib.Path(path).mkdir(parents=True, exist_ok=True) - save_tensor(linear.weight.t(), 'weight', path) # PyTorch and Tinygrad strangely transpose linear weights so reverse that + save_tensor( + linear.weight.t(), "weight", path + ) # PyTorch and Tinygrad strangely transpose linear weights so reverse that if linear.bias is not None: - save_tensor(linear.bias, 'bias', path) - + save_tensor(linear.bias, "bias", path) def save_rmsnorm(norm, path): pathlib.Path(path).mkdir(parents=True, exist_ok=True) - save_tensor(norm.weight, 'weight', path) - save_scalar(norm.eps, 'eps', path) + save_tensor(norm.weight, "weight", path) + save_scalar(norm.eps, "eps", path) + def save_attention(attention, path): pathlib.Path(path).mkdir(parents=True, exist_ok=True) - save_linear(attention.wq, pathlib.Path(path, 'wq')) - save_linear(attention.wk, pathlib.Path(path, 'wk')) - save_linear(attention.wv, pathlib.Path(path, 'wv')) - save_linear(attention.wo, pathlib.Path(path, 'wo')) + save_linear(attention.wq, pathlib.Path(path, "wq")) + save_linear(attention.wk, pathlib.Path(path, "wk")) + save_linear(attention.wv, pathlib.Path(path, "wv")) + save_linear(attention.wo, pathlib.Path(path, "wo")) n_kv_head = attention.n_kv_heads n_head = n_kv_head * attention.n_rep save_scalar(n_head, "n_head", path) save_scalar(n_kv_head, "n_kv_head", path) + def save_feedforward(feed_forward, path): pathlib.Path(path).mkdir(parents=True, exist_ok=True) - save_linear(feed_forward.w1, pathlib.Path(path, 'w1')) - save_linear(feed_forward.w2, pathlib.Path(path, 'w2')) - save_linear(feed_forward.w3, pathlib.Path(path, 'w3')) + save_linear(feed_forward.w1, 
pathlib.Path(path, "w1")) + save_linear(feed_forward.w2, pathlib.Path(path, "w2")) + save_linear(feed_forward.w3, pathlib.Path(path, "w3")) + def save_embedding(embedding, path): pathlib.Path(path).mkdir(parents=True, exist_ok=True) - save_tensor(embedding.weight, 'weight', path) + save_tensor(embedding.weight, "weight", path) + def save_transformer_block(transformer_block, path): pathlib.Path(path).mkdir(parents=True, exist_ok=True) - save_attention(transformer_block.attention, pathlib.Path(path, 'attention')) - save_feedforward(transformer_block.feed_forward, pathlib.Path(path, 'feedforward')) - save_rmsnorm(transformer_block.attention_norm, pathlib.Path(path, 'attention_norm')) - save_rmsnorm(transformer_block.ffn_norm, pathlib.Path(path, 'ffn_norm')) + save_attention(transformer_block.attention, pathlib.Path(path, "attention")) + save_feedforward(transformer_block.feed_forward, pathlib.Path(path, "feedforward")) + save_rmsnorm(transformer_block.attention_norm, pathlib.Path(path, "attention_norm")) + save_rmsnorm(transformer_block.ffn_norm, pathlib.Path(path, "ffn_norm")) + def save_transformer(transformer, path): with torch.no_grad(): pathlib.Path(path).mkdir(parents=True, exist_ok=True) - save_scalar(len(transformer.layers), 'n_layer', path) + save_scalar(len(transformer.layers), "n_layer", path) for idx, layer in enumerate(transformer.layers): - save_transformer_block(layer, pathlib.Path(path, f'layer{idx}')) - save_rmsnorm(transformer.norm, pathlib.Path(path, 'norm')) - save_embedding(transformer.tok_embeddings, pathlib.Path(path, 'tok_embeddings')) - save_linear(transformer.output, pathlib.Path(path, 'output')) - save_scalar(10000.0, 'theta', path) - save_scalar(transformer.params.max_seq_len, 'n_ctx', path) - save_scalar(transformer.params.multiple_of, 'multiple_of', path) + save_transformer_block(layer, pathlib.Path(path, f"layer{idx}")) + save_rmsnorm(transformer.norm, pathlib.Path(path, "norm")) + save_embedding(transformer.tok_embeddings, pathlib.Path(path, "tok_embeddings")) + save_linear(transformer.output, pathlib.Path(path, "output")) + save_scalar(10000.0, "theta", path) + save_scalar(transformer.params.max_seq_len, "n_ctx", path) + save_scalar(transformer.params.multiple_of, "multiple_of", path) if transformer.params.ffn_dim_multiplier is not None: - save_scalar(transformer.params.ffn_dim_multiplier, 'ffn_dim_multiplier', path) - #save_tensor(transformer.freqs_cis, 'freqs_cis', path) + save_scalar( + transformer.params.ffn_dim_multiplier, "ffn_dim_multiplier", path + ) + # save_tensor(transformer.freqs_cis, 'freqs_cis', path) diff --git a/rust_bench/llama2-burn/llama-py/dump_model.py b/rust_bench/llama2-burn/llama-py/dump_model.py index 816dcaa6..3cfcad6a 100644 --- a/rust_bench/llama2-burn/llama-py/dump_model.py +++ b/rust_bench/llama2-burn/llama-py/dump_model.py @@ -1,28 +1,31 @@ -import torch -from pathlib import Path -import json import argparse +import json import logging import sys +from pathlib import Path import dump -from model import Transformer, ModelArgs import tokenizer +import torch +from model import ModelArgs, Transformer # Configure logging -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) logger = logging.getLogger(__name__) + def load_model(model_dir): tok = tokenizer.Tokenizer(model_path=str(model_dir / "tokenizer.model")) checkpoints = sorted((model_dir).glob("*.pth")) if len(checkpoints) == 0: raise 
ValueError(f"No checkpoint files found in {model_dir}") - + weights = [torch.load(filename, map_location="cpu") for filename in checkpoints] - with open(model_dir / "params.json", "r") as f: + with open(model_dir / "params.json") as f: params = json.loads(f.read()) - + model_args: ModelArgs = ModelArgs( max_batch_size=1, **params, @@ -31,7 +34,7 @@ def load_model(model_dir): model = Transformer(model_args) model.load_state_dict(concat_weights(weights), strict=False) model.max_seq_len = model.tok_embeddings.weight.shape[0] - logger.info('Loaded model') + logger.info("Loaded model") return model @@ -41,15 +44,35 @@ def convert(name) -> torch.Tensor: disk_tensors = [model[name] for model in models] if len(disk_tensors) == 1 or len(disk_tensors[0].shape) == 1: return disk_tensors[0] - axis = 1 if name.startswith('tok_embeddings.') or name.endswith('.attention.wo.weight') or name.endswith('.feed_forward.w2.weight') else 0 + axis = ( + 1 + if name.startswith("tok_embeddings.") + or name.endswith(".attention.wo.weight") + or name.endswith(".feed_forward.w2.weight") + else 0 + ) return disk_tensors[0].cat(*disk_tensors[1:], dim=axis) - return {name: convert(name) for name in {name: None for model in models for name in model}} + + return { + name: convert(name) + for name in {name: None for model in models for name in model} + } if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Load and dump transformer model.') - parser.add_argument('--model-dir', type=Path, required=True, help='Path to the directory containing the model checkpoints') - parser.add_argument('--output-dir', type=Path, required=True, help='Path to the directory where to dump the model.') + parser = argparse.ArgumentParser(description="Load and dump transformer model.") + parser.add_argument( + "--model-dir", + type=Path, + required=True, + help="Path to the directory containing the model checkpoints", + ) + parser.add_argument( + "--output-dir", + type=Path, + required=True, + help="Path to the directory where to dump the model.", + ) args = parser.parse_args() @@ -59,21 +82,28 @@ def convert(name) -> torch.Tensor: # Check if the model-dir/params directory already exists params_dir = output_dir / "params" if params_dir.is_dir(): - logger.info(f"The {params_dir} directory already exists. Model dump will not be performed.") + logger.info( + f"The {params_dir} directory already exists. Model dump will not be performed." 
+ ) sys.exit(0) # Check that the model dir contains the required files - if not (model_dir / "params.json").is_file() or not (model_dir / "tokenizer.model").is_file() or not any(model_dir.glob("*.pth")): - logger.error("The model directory must contain params.json, tokenizer.model, and at least one .pth file") + if ( + not (model_dir / "params.json").is_file() + or not (model_dir / "tokenizer.model").is_file() + or not any(model_dir.glob("*.pth")) + ): + logger.error( + "The model directory must contain params.json, tokenizer.model, and at least one .pth file" + ) sys.exit(1) - try: logger.info(f"Loading model from {model_dir}") llama = load_model(model_dir) - logger.info('Dumping model...') + logger.info("Dumping model...") dump.save_transformer(llama, params_dir) - logger.info(f'Dump saved in {params_dir} folder.') + logger.info(f"Dump saved in {params_dir} folder.") except Exception as e: - logger.error(f"An error occurred: {e}") \ No newline at end of file + logger.error(f"An error occurred: {e}") diff --git a/rust_bench/llama2-burn/llama-py/dump_test.py b/rust_bench/llama2-burn/llama-py/dump_test.py index a8b4b3a4..d8a6f73f 100644 --- a/rust_bench/llama2-burn/llama-py/dump_test.py +++ b/rust_bench/llama2-burn/llama-py/dump_test.py @@ -1,7 +1,6 @@ -import torch import dump -import model -from model import Transformer, ModelArgs +import torch +from model import ModelArgs, Transformer if __name__ == "__main__": n_vocab = 10 @@ -14,18 +13,25 @@ norm_eps = 1e-6 max_batch_size = 1 - model_args = ModelArgs(dim=n_state, n_layers=n_layer, n_heads=n_head, n_kv_heads=n_kv_head, - vocab_size=n_vocab, multiple_of=multiple_of, norm_eps=norm_eps, - max_batch_size=max_batch_size) + model_args = ModelArgs( + dim=n_state, + n_layers=n_layer, + n_heads=n_head, + n_kv_heads=n_kv_head, + vocab_size=n_vocab, + multiple_of=multiple_of, + norm_eps=norm_eps, + max_batch_size=max_batch_size, + ) llama = Transformer(model_args) with torch.no_grad(): tokens = torch.tensor([0, 2, 1], dtype=torch.int32).unsqueeze(0) output = llama(tokens, 0) - print(f'Test input {tokens.numpy()}') - print(f'Test output {output.numpy()}') + print(f"Test input {tokens.numpy()}") + print(f"Test output {output.numpy()}") - print('Dumping test model...') + print("Dumping test model...") dump.save_transformer(llama, "params") - print('Dump saved in params folder.') \ No newline at end of file + print("Dump saved in params folder.") diff --git a/rust_bench/llama2-burn/llama-py/model.py b/rust_bench/llama2-burn/llama-py/model.py index d63ff075..562ef0b9 100644 --- a/rust_bench/llama2-burn/llama-py/model.py +++ b/rust_bench/llama2-burn/llama-py/model.py @@ -1,18 +1,16 @@ # This file is adapted from the LLama project: # https://github.com/facebookresearch/llama/blob/main/llama/model.py -# Original LLama code by Facebook AI Research +# Original LLama code by Facebook AI Research # Adapted by Gadersd import math from dataclasses import dataclass -from typing import Any, Optional, Tuple import torch import torch.nn.functional as F from torch import nn from torch.nn import Embedding, Linear -from torch import Tensor @dataclass @@ -20,10 +18,10 @@ class ModelArgs: dim: int = 4096 n_layers: int = 32 n_heads: int = 32 - n_kv_heads: Optional[int] = None + n_kv_heads: int | None = None vocab_size: int = -1 # defined later by tokenizer multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 - ffn_dim_multiplier: Optional[float] = None + ffn_dim_multiplier: float | None = None norm_eps: float = 1e-5 max_batch_size: int = 32 
@@ -64,7 +62,7 @@ def apply_rotary_emb( xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) freqs_cis = reshape_for_broadcast(freqs_cis, xq_) @@ -89,7 +87,7 @@ class Attention(nn.Module): def __init__(self, args: ModelArgs): super().__init__() self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads - model_parallel_size = 1#fs_init.get_model_parallel_world_size() + model_parallel_size = 1 # fs_init.get_model_parallel_world_size() self.n_local_heads = args.n_heads // model_parallel_size self.n_local_kv_heads = self.n_kv_heads // model_parallel_size self.n_rep = self.n_local_heads // self.n_local_kv_heads @@ -138,7 +136,7 @@ def forward( x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, - mask: Optional[torch.Tensor], + mask: torch.Tensor | None, ): bsz, seqlen, _ = x.shape xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) @@ -152,8 +150,8 @@ def forward( self.cache_k = self.cache_k.to(xq) self.cache_v = self.cache_v.to(xq) - self.cache_k[:bsz, start_pos : start_pos + seqlen] = xk - self.cache_v[:bsz, start_pos : start_pos + seqlen] = xv + self.cache_k[:bsz, start_pos : start_pos + seqlen] = xk # noqa + self.cache_v[:bsz, start_pos : start_pos + seqlen] = xv # noqa keys = self.cache_k[:bsz, : start_pos + seqlen] values = self.cache_v[:bsz, : start_pos + seqlen] @@ -180,7 +178,7 @@ def __init__( dim: int, hidden_dim: int, multiple_of: int, - ffn_dim_multiplier: Optional[float], + ffn_dim_multiplier: float | None, ): super().__init__() hidden_dim = int(2 * hidden_dim / 3) @@ -189,15 +187,9 @@ def __init__( hidden_dim = int(ffn_dim_multiplier * hidden_dim) hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - self.w1 = Linear( - dim, hidden_dim, bias=False - ) - self.w2 = Linear( - hidden_dim, dim, bias=False - ) - self.w3 = Linear( - dim, hidden_dim, bias=False - ) + self.w1 = Linear(dim, hidden_dim, bias=False) + self.w2 = Linear(hidden_dim, dim, bias=False) + self.w3 = Linear(dim, hidden_dim, bias=False) def forward(self, x): return self.w2(F.silu(self.w1(x)) * self.w3(x)) @@ -225,7 +217,7 @@ def forward( x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, - mask: Optional[torch.Tensor], + mask: torch.Tensor | None, ): h = x + self.attention.forward( self.attention_norm(x), start_pos, freqs_cis, mask @@ -241,18 +233,14 @@ def __init__(self, params: ModelArgs): self.vocab_size = params.vocab_size self.n_layers = params.n_layers - self.tok_embeddings = Embedding( - params.vocab_size, params.dim - ) + self.tok_embeddings = Embedding(params.vocab_size, params.dim) self.layers = torch.nn.ModuleList() for layer_id in range(params.n_layers): self.layers.append(TransformerBlock(layer_id, params)) self.norm = RMSNorm(params.dim, eps=params.norm_eps) - self.output = Linear( - params.dim, params.vocab_size, bias=False - ) + self.output = Linear(params.dim, params.vocab_size, bias=False) self.freqs_cis = precompute_freqs_cis( self.params.dim // self.params.n_heads, self.params.max_seq_len * 2 @@ -262,10 +250,10 @@ def __init__(self, params: ModelArgs): def forward(self, tokens: torch.Tensor, start_pos: int): _bsz, seqlen = tokens.shape h = self.tok_embeddings(tokens) - #print(h.numpy()) - #print(h.shape) + # print(h.numpy()) + # print(h.shape) self.freqs_cis = self.freqs_cis.to(h.device) - freqs_cis = 
self.freqs_cis[start_pos : start_pos + seqlen] + freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen] # noqa mask = None if seqlen > 1: @@ -279,4 +267,3 @@ def forward(self, tokens: torch.Tensor, start_pos: int): h = self.norm(h) output = self.output(h).float() return output - \ No newline at end of file diff --git a/rust_bench/llama2-burn/llama-py/requirements.txt b/rust_bench/llama2-burn/llama-py/requirements.txt index 3c977dc1..4ba53e28 100644 --- a/rust_bench/llama2-burn/llama-py/requirements.txt +++ b/rust_bench/llama2-burn/llama-py/requirements.txt @@ -1,3 +1,3 @@ numpy torch -sentencepiece \ No newline at end of file +sentencepiece diff --git a/rust_bench/llama2-burn/llama-py/test.py b/rust_bench/llama2-burn/llama-py/test.py index 91b6e27e..e8a3facc 100644 --- a/rust_bench/llama2-burn/llama-py/test.py +++ b/rust_bench/llama2-burn/llama-py/test.py @@ -1,23 +1,24 @@ -import torch -from pathlib import Path import json import sys -#from safetensors.torch import load_file +from pathlib import Path -import dump -from model import Transformer, ModelArgs import tokenizer +import torch +from model import ModelArgs, Transformer + +# from safetensors.torch import load_file + def load_model(model_dir, tokenizer_path): tok = tokenizer.Tokenizer(model_path=tokenizer_path) checkpoints = sorted(Path(model_dir).glob("*.pth")) if len(checkpoints) == 0: raise ValueError(f"No checkpoint files found in {model_dir}") - + weights = [torch.load(filename, map_location="cpu") for filename in checkpoints] - with open(Path(model_dir) / "params.json", "r") as f: + with open(Path(model_dir) / "params.json") as f: params = json.loads(f.read()) - + model_args: ModelArgs = ModelArgs( max_batch_size=1, **params, @@ -26,50 +27,62 @@ def load_model(model_dir, tokenizer_path): model = Transformer(model_args) model.load_state_dict(concat_weights(weights), strict=False) model.max_seq_len = model.tok_embeddings.weight.shape[0] - print('Loaded model') + print("Loaded model") return model -# The concat_weights function is adapted from the tinygrad library: +# The concat_weights function is adapted from the tinygrad library: # https://github.com/tinygrad/tinygrad/blob/master/tinygrad/examples/llama.py # Original code by TinyGrad authors # Adapted by [Your Name] def concat_weights(models): - def convert(name) -> torch.Tensor: - disk_tensors = [model[name] for model in models] - if len(disk_tensors) == 1 or len(disk_tensors[0].shape) == 1: - return disk_tensors[0] - axis = 1 if name.startswith('tok_embeddings.') or name.endswith('.attention.wo.weight') or name.endswith('.feed_forward.w2.weight') else 0 - return disk_tensors[0].cat(*disk_tensors[1:], dim=axis) - return {name: convert(name) for name in {name: None for model in models for name in model}} + def convert(name) -> torch.Tensor: + disk_tensors = [model[name] for model in models] + if len(disk_tensors) == 1 or len(disk_tensors[0].shape) == 1: + return disk_tensors[0] + axis = ( + 1 + if name.startswith("tok_embeddings.") + or name.endswith(".attention.wo.weight") + or name.endswith(".feed_forward.w2.weight") + else 0 + ) + return disk_tensors[0].cat(*disk_tensors[1:], dim=axis) + + return { + name: convert(name) + for name in {name: None for model in models for name in model} + } if __name__ == "__main__": if len(sys.argv) != 3: - raise ValueError("You must provide the model_dir and tok_path as command line parameters") + raise ValueError( + "You must provide the model_dir and tok_path as command line parameters" + ) model_dir = sys.argv[1] tokenizer_path = 
sys.argv[2] try: with torch.no_grad(): - tok = tokenizer.Tokenizer(model_path=tokenizer_path) - llama = load_model(model_dir, tokenizer_path) + tok = tokenizer.Tokenizer(model_path=tokenizer_path) + llama = load_model(model_dir, tokenizer_path) - '''tokens = torch.tensor([0, 2, 1]) + """tokens = torch.tensor([0, 2, 1]) out = llama(tokens.unsqueeze(0), 0) - print(out[0, :3, :10].numpy())''' + print(out[0, :3, :10].numpy())""" - tokens = tok.encode("Hello, I am ", True, False) - for i in range(0, 10): - token_tensor = torch.tensor(tokens) - logits = llama(token_tensor.unsqueeze(0), 0) - sample = logits[:, -1, :].argmax(dim=-1).item() - print(f'Sample is {sample} {tok.decode(sample)}') - tokens = tokens + [sample] + tokens = tok.encode("Hello, I am ", True, False) + for i in range(0, 10): + token_tensor = torch.tensor(tokens) + logits = llama(token_tensor.unsqueeze(0), 0) + sample = logits[:, -1, :].argmax(dim=-1).item() + print(f"Sample is {sample} {tok.decode(sample)}") + tokens = tokens + [sample] decoded = tok.decode(tokens) print(f"Sampled output: {decoded}") except Exception as e: - print(f"An error occurred: {e}") \ No newline at end of file + print(f"An error occurred: {e}") diff --git a/rust_bench/llama2-burn/llama-py/test_tokenizer.py b/rust_bench/llama2-burn/llama-py/test_tokenizer.py index 1e9a3a96..1304b823 100644 --- a/rust_bench/llama2-burn/llama-py/test_tokenizer.py +++ b/rust_bench/llama2-burn/llama-py/test_tokenizer.py @@ -1,5 +1,5 @@ import tokenizer - + if __name__ == "__main__": tok = tokenizer.Tokenizer("tokenizer.model") @@ -9,4 +9,4 @@ print(f"Test string: {test_str}") print(f"Encoded tokens: {encoded}") - print(f"Decoded string: {decoded}") \ No newline at end of file + print(f"Decoded string: {decoded}") diff --git a/rust_bench/llama2-burn/llama-py/tokenizer.py b/rust_bench/llama2-burn/llama-py/tokenizer.py index e3c04db5..40b701a8 100644 --- a/rust_bench/llama2-burn/llama-py/tokenizer.py +++ b/rust_bench/llama2-burn/llama-py/tokenizer.py @@ -4,11 +4,13 @@ # Original LLama code by Facebook AI Research # Adapted by Gadersd -from sentencepiece import SentencePieceProcessor -from typing import List import logging + +from sentencepiece import SentencePieceProcessor + logger = logging.getLogger(__name__) + class Tokenizer: def __init__(self, model_path: str): self.sp_model = SentencePieceProcessor(model_file=model_path) @@ -19,9 +21,11 @@ def __init__(self, model_path: str): self.eos_id: int = self.sp_model.eos_id() self.pad_id: int = self.sp_model.pad_id() - logger.info(f'#words: {self.n_words} BOS ID: {self.bos_id} EOS ID: {self.eos_id} PAD ID: {self.pad_id}') + logger.info( + f"#words: {self.n_words} BOS ID: {self.bos_id} EOS ID: {self.eos_id} PAD ID: {self.pad_id}" + ) - def encode(self, s: str, bos: bool, eos: bool) -> List[int]: + def encode(self, s: str, bos: bool, eos: bool) -> list[int]: assert type(s) is str t = self.sp_model.encode(s) if bos: @@ -30,5 +34,5 @@ def encode(self, s: str, bos: bool, eos: bool) -> List[int]: t = t + [self.eos_id] return t - def decode(self, t: List[int]) -> str: + def decode(self, t: list[int]) -> str: return self.sp_model.decode(t) diff --git a/rust_bench/llama2-candle/Cargo.toml b/rust_bench/llama2-candle/Cargo.toml index b6f7fbdb..67da7e93 100644 --- a/rust_bench/llama2-candle/Cargo.toml +++ b/rust_bench/llama2-candle/Cargo.toml @@ -25,4 +25,4 @@ tracing-subscriber = "0.3.7" [features] accelerate = ["dep:accelerate-src", "candle/accelerate", "candle-nn/accelerate", "candle-transformers/accelerate"] -cuda = ["candle/cuda", 
"candle-nn/cuda", "candle-transformers/cuda"] \ No newline at end of file +cuda = ["candle/cuda", "candle-nn/cuda", "candle-transformers/cuda"] diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..6aee7c7b --- /dev/null +++ b/setup.cfg @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 120 +exclude = .tox,.git,*/migrations/*,*/static/CACHE/*,docs,node_modules,venv diff --git a/setup.sh b/setup.sh index 5af5f326..d8e7cf6a 100755 --- a/setup.sh +++ b/setup.sh @@ -2,7 +2,7 @@ ################################################################################ # Script: setup_and_convert.sh -# Description: This script automates the setup of a virtual environment, +# Description: This script automates the setup of a virtual environment, # installs project requirements, converts and stores models. ################################################################################ @@ -61,7 +61,7 @@ fi # Check and create llama-2-7b-burn model if [ ! -e "$BURN_MODEL_FOLDER/$BURN_MODEL_NAME.cfg" ]; then check_and_create_directory "$BURN_MODEL_FOLDER" - + if [ ! -d "$BURN_MODEL_FOLDER/params" ]; then create_and_activate_venv echo "Installing requirements for dumping" From 7d43a634ecbc4a261707ac014493f1672c1268b5 Mon Sep 17 00:00:00 2001 From: nsosio Date: Thu, 16 Nov 2023 11:58:01 +0000 Subject: [PATCH 2/5] tinygrad fixes --- .pre-commit-config.yaml | 6 ------ python_bench/tinygrad.py | 11 ++++++----- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fc065c44..af896c6a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,12 +14,6 @@ repos: - id: check-builtin-literals - id: check-case-conflict - - repo: https://github.com/asottile/pyupgrade - rev: v3.15.0 - hooks: - - id: pyupgrade - args: [--py311-plus] - - repo: https://github.com/psf/black rev: 23.11.0 hooks: diff --git a/python_bench/tinygrad.py b/python_bench/tinygrad.py index b238ffdd..15d976e7 100644 --- a/python_bench/tinygrad.py +++ b/python_bench/tinygrad.py @@ -3,6 +3,7 @@ import os import time from pathlib import Path +from typing import Optional, Union import numpy as np from tinygrad.helpers import CI, dtypes, getenv @@ -90,9 +91,9 @@ def __init__(self, dim, n_heads, n_kv_heads, linear=Linear): def __call__( self, x: Tensor, - start_pos: Variable | int, + start_pos: Union[Variable, int], freqs_cis: Tensor, - mask: Tensor | None, + mask: Optional[Tensor], ) -> Tensor: xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) xq = xq.reshape(xq.shape[0], xq.shape[1], self.n_heads, self.head_dim) @@ -176,9 +177,9 @@ def __init__( def __call__( self, x: Tensor, - start_pos: Variable | int, + start_pos: Union[Variable, int], freqs_cis: Tensor, - mask: Tensor | None, + mask: Union[Tensor, None], ): h = x + self.attention(self.attention_norm(x), start_pos, freqs_cis, mask) return (h + self.feed_forward(self.ffn_norm(h))).realize() @@ -223,7 +224,7 @@ def __init__( self.forward_jit = TinyJit(self.forward) def forward( - self, tokens: Tensor, start_pos: Variable | int, temperature: float = 0.0 + self, tokens: Tensor, start_pos: Union[Variable, int], temperature: float = 0.0 ): _bsz, seqlen = tokens.shape freqs_cis = self.freqs_cis.shrink( From 37150babd490db7019b8a90bbea46aa0579dcdcb Mon Sep 17 00:00:00 2001 From: nsosio Date: Thu, 16 Nov 2023 13:04:02 +0100 Subject: [PATCH 3/5] added gha --- .github/workflows/precommit.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .github/workflows/precommit.yaml diff --git 
a/.github/workflows/precommit.yaml b/.github/workflows/precommit.yaml new file mode 100644 index 00000000..c2f7e71f --- /dev/null +++ b/.github/workflows/precommit.yaml @@ -0,0 +1,14 @@ +name: pre-commit + +on: + pull_request: + push: + branches: [main] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + - uses: pre-commit/action@v3.0.0 From 7cab5e9247f1beb18068e632511708f96bfc8fa9 Mon Sep 17 00:00:00 2001 From: nsosio Date: Thu, 16 Nov 2023 13:06:10 +0100 Subject: [PATCH 4/5] fixes --- .github/workflows/precommit.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/precommit.yaml b/.github/workflows/precommit.yaml index c2f7e71f..f1478189 100644 --- a/.github/workflows/precommit.yaml +++ b/.github/workflows/precommit.yaml @@ -2,6 +2,7 @@ name: pre-commit on: pull_request: + branches: [main] push: branches: [main] From d3281efbbc9e7764afc21297609d1d2f14ac2ded Mon Sep 17 00:00:00 2001 From: nsosio Date: Thu, 16 Nov 2023 13:07:39 +0100 Subject: [PATCH 5/5] fixes --- .github/workflows/precommit.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/precommit.yaml b/.github/workflows/precommit.yaml index f1478189..076db3ba 100644 --- a/.github/workflows/precommit.yaml +++ b/.github/workflows/precommit.yaml @@ -3,8 +3,6 @@ name: pre-commit on: pull_request: branches: [main] - push: - branches: [main] jobs: pre-commit:
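
Note on the tooling introduced in this series: the checks that the pre-commit GitHub Action enforces on pull requests can also be exercised locally before pushing. A minimal sketch, assuming the pre-commit package is available in the project's virtual environment (these patches do not add it to requirements.txt):

    # install the git hooks defined in .pre-commit-config.yaml
    pre-commit install
    # run every configured hook (black, isort, flake8, whitespace fixers, etc.)
    # against the whole repository, mirroring what the CI workflow runs
    pre-commit run --all-files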