Skip to content

Commit

Permalink
Merge from main
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrei-Aksionov committed Dec 30, 2024
1 parent ff6baae commit a5a15dd
Show file tree
Hide file tree
Showing 29 changed files with 1,432 additions and 668 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/check-links.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,10 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install "mistune<3.1" # a newer version is incompatible with nbconvert
pip install pytest pytest-check-links
- name: Check links
run: |
pytest --check-links README.md --check-links-ignore "http*"
pytest --check-links tutorials --check-links-ignore "http*"
pytest --check-links tutorials --check-links-ignore "http*"
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,15 @@ Every model is written from scratch to maximize performance and remove layers of
| CodeGemma | 7B | Google | [Google Team, Google Deepmind](https://ai.google.dev/gemma/docs/codegemma) |
| Code Llama | 7B, 13B, 34B, 70B | Meta AI | [Rozière et al. 2023](https://arxiv.org/abs/2308.12950) |
| Falcon | 7B, 40B, 180B | TII UAE | [TII 2023](https://falconllm.tii.ae) |
| Falcon 3 | 1B, 3B, 7B, 10B | TII UAE | [TII 2024](https://huggingface.co/blog/falcon3) |
| FreeWilly2 (Stable Beluga 2) | 70B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stable-beluga-large-instruction-fine-tuned-models) |
| Function Calling Llama 2 | 7B | Trelis | [Trelis et al. 2023](https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2) |
| Gemma | 2B, 7B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-report.pdf) |
| Gemma 2 | 9B, 27B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf) |
| Llama 2 | 7B, 13B, 70B | Meta AI | [Touvron et al. 2023](https://arxiv.org/abs/2307.09288) |
| Llama 3.1 | 8B, 70B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
| Llama 3.2 | 1B, 3B | Meta AI | [Meta AI 2024](https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/) |
| Llama 3.3 | 70B | Meta AI | [Meta AI 2024](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) |
| Mathstral | 7B | Mistral AI | [Mistral AI 2024](https://mistral.ai/news/mathstral/) |
| MicroLlama | 300M | Ken Wang | [MicroLlama repo](https://github.com/keeeeenw/MicroLlama) |
| Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
Expand All @@ -137,7 +139,10 @@ Every model is written from scratch to maximize performance and remove layers of
| Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
| Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
| Qwen2.5 Coder | 0.5B, 1.5B, 3B, 7B, 14B, 32B | Alibaba Group | [Hui, Binyuan et al. 2024](https://arxiv.org/abs/2409.12186) |
| Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [An, Yang et al. 2024](https://arxiv.org/abs/2409.12122) |
| QwQ | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
| SmolLM2 | 135M, 360M, 1.7B | Hugging Face | [Hugging Face 2024](https://github.com/huggingface/smollm) |
| Salamandra | 2B, 7B | Barcelona Supercomputing Centre | [BSC-LTC 2024](https://github.com/BSC-LTC/salamandra) |
| StableCode | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) |
| StableLM | 3B, 7B | Stability AI | [Stability AI 2023](https://github.com/Stability-AI/StableLM) |
| StableLM Zephyr | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) |
Expand Down
2 changes: 1 addition & 1 deletion extensions/thunder/unsloth/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ def unsloth_apply_rope_meta(
Q: TensorProxy, cos: TensorProxy, sin: TensorProxy
) -> Tuple[TensorProxy, TensorProxy, TensorProxy, int, int, int]:
batch, n_heads, seq_len, head_dim = Q.shape
assert seq_len <= cos.shape[0]
assert seq_len <= cos.shape[-2]
BLOCK_SIZE, num_warps = kernels.calculate_settings(head_dim // 2)
div, mod = divmod(n_heads, kernels.rope_embedding.ROPE_GROUP_SIZE)
n_groups = div + (mod != 0)
Expand Down
6 changes: 3 additions & 3 deletions litgpt/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,8 @@ def __init__(self, config: Config, block_idx: int) -> None:
self.adapter_kv_cache: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
self.block_idx = block_idx
self.apply_sliding_window_attention = (
config.sliding_window_size is not None and
block_idx % config.sliding_window_layer_placing == 0
config.sliding_window_size is not None and
block_idx % config.sliding_window_layer_stride == 0
)
self.config = config

Expand All @@ -151,7 +151,7 @@ def scaled_dot_product_attention(
ak, av = self.adapter_kv_cache
else:
prefix = self.adapter_wte.weight.reshape(1, aT, self.config.n_embd)
aqkv = self.attn(prefix)
aqkv = self.qkv(prefix)
q_per_kv = self.config.n_head // self.config.n_query_groups
aqkv = aqkv.view(1, aT, self.config.n_query_groups, q_per_kv + 2, self.config.head_size)
aqkv = aqkv.permute(0, 2, 3, 1, 4)
Expand Down
20 changes: 14 additions & 6 deletions litgpt/adapter_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from litgpt.adapter import CausalSelfAttention as BaseCausalSelfAttention
from litgpt.adapter import Config as BaseConfig
from litgpt.model import KVCache
from litgpt.scripts.convert_hf_checkpoint import qkv_reassemble
from litgpt.utils import map_old_state_dict_weights


Expand Down Expand Up @@ -163,7 +164,7 @@ def __init__(self, config: Config, block_idx: int) -> None:
nn.Module.__init__(self)
shape = (config.n_head + 2 * config.n_query_groups) * config.head_size
# key, query, value projections for all heads, but in a batch
self.attn = AdapterV2Linear(in_features=config.n_embd, out_features=shape, bias=config.bias or config.attn_bias)
self.qkv = AdapterV2Linear(in_features=config.n_embd, out_features=shape, bias=config.bias or config.attn_bias)
# output projection
# if `head_size` is explicitly specified in the config, `n_emd` might not be equal to `head_size * n_head`
self.proj = AdapterV2Linear(config.head_size * config.n_head, config.n_embd, bias=config.bias)
Expand All @@ -179,24 +180,31 @@ def __init__(self, config: Config, block_idx: int) -> None:
self.adapter_kv_cache: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
self.block_idx = block_idx
self.apply_sliding_window_attention = (
config.sliding_window_size is not None and
block_idx % config.sliding_window_layer_placing == 0
config.sliding_window_size is not None and
block_idx % config.sliding_window_layer_stride == 0
)

self.config = config

def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
"""For compatibility with base checkpoints."""
"""For compatibility with base and/or legacy checkpoints."""
mapping = {
"attn.weight": "attn.linear.weight",
"attn.bias": "attn.linear.bias",
"qkv.weight": "qkv.linear.weight",
"qkv.bias": "qkv.linear.bias",
"proj.weight": "proj.linear.weight",
"proj.bias": "proj.linear.bias",
}
state_dict = map_old_state_dict_weights(state_dict, mapping, prefix)
# For compatibility with older checkpoints
if (key := prefix + "gating_factor") in state_dict and state_dict[key].size(1) == self.config.n_head:
state_dict[key] = state_dict[key].permute(0, 2, 1, 3)

for attr in ("weight", "bias"):
legacy_key = f"{prefix}attn.linear.{attr}"
current_key = f"{prefix}qkv.linear.{attr}"
if legacy_key in state_dict:
state_dict[current_key] = qkv_reassemble(state_dict.pop(legacy_key), self.config)

super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)


Expand Down
4 changes: 2 additions & 2 deletions litgpt/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,7 @@ def distribute(
model.eval()

if generate_strategy == "sequential":
state_dict = torch.load(str(self.checkpoint_dir / "lit_model.pth"), mmap=True, map_location="cpu")
state_dict = torch.load(str(self.checkpoint_dir / "lit_model.pth"), mmap=True, map_location="cpu", weights_only=False)
model.load_state_dict(state_dict, assign=True)
model = fabric.setup_module(model, move_to_device=False)

Expand All @@ -405,7 +405,7 @@ def distribute(
pbar = tqdm(total=fabric.world_size, desc="Loading model weights")
for rank in range(fabric.world_size):
if fabric.global_rank == rank:
state_dict = torch.load(str(self.checkpoint_dir / "lit_model.pth"), mmap=True, map_location="cpu")
state_dict = torch.load(str(self.checkpoint_dir / "lit_model.pth"), mmap=True, map_location="cpu", weights_only=False)
model.load_state_dict(state_dict, assign=True)

# cannot use `.setup_module` because it will wrap with DDP
Expand Down
Loading

0 comments on commit a5a15dd

Please sign in to comment.