Cap Mistral's context length at 2k (#495)
Temporary fix to prevent multiple TB of memory from being allocated just for attention masks.
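
For scale, a rough back-of-the-envelope sketch of where that memory would go (the per-head, per-layer dense mask is my own illustrative assumption, not stated in the commit): a float32 [n_ctx, n_ctx] mask at Mistral's native 32768-token context is about 4.3 GB, and one such mask per head per layer across the model lands in the multi-terabyte range.

```python
# Back-of-the-envelope estimate for dense attention-mask memory (illustrative
# assumption: one float32 [n_ctx, n_ctx] mask materialized per head per layer).
n_ctx = 32768        # Mistral's native context length (the value being capped)
n_heads = 32
n_layers = 32
bytes_per_elem = 4   # float32

per_mask = n_ctx * n_ctx * bytes_per_elem   # ~4.3e9 bytes for a single mask
total = per_mask * n_heads * n_layers       # ~4.4e12 bytes across all heads/layers
print(f"per mask: {per_mask / 1e9:.1f} GB, total: {total / 1e12:.1f} TB")
# With the cap at n_ctx = 2048, the same total drops to roughly 17 GB.
```

That is consistent with the "multiple TB" figure above and motivates the temporary 2048 cap below.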
collingray authored Jan 28, 2024
1 parent 19b3bc8 commit ba3fb3b
Showing 1 changed file with 1 addition and 1 deletion.
transformer_lens/loading_from_pretrained.py (1 addition, 1 deletion)

@@ -815,7 +815,7 @@ def convert_hf_model_config(model_name: str, **kwargs):
             "n_heads": 32,
             "d_mlp": 14336,
             "n_layers": 32,
-            "n_ctx": 32768,
+            "n_ctx": 2048,  # Capped due to memory issues
             "d_vocab": 32000,
             "act_fn": "silu",
             "normalization_type": "RMS",
