Added Utilities for Activation Patching + A Demo of how to use them (#165)

* Patching utils

* Adding Activation Patching utils and a demo
neelnanda-io authored Feb 4, 2023
1 parent 6054b6e commit 649d3be
Showing 4 changed files with 409 additions and 2 deletions.
1 change: 1 addition & 0 deletions activation_patching_in_TL_demo.py.ipynb

Large diffs are not rendered by default.

29 changes: 27 additions & 2 deletions transformer_lens/ActivationCache.py
@@ -264,7 +264,7 @@ def compute_head_results(

def stack_head_results(
self,
layer: int,
layer: int = -1,
return_labels: bool = False,
incl_remainder: bool = False,
pos_slice: Union[Slice, SliceInput] = None,
@@ -274,7 +274,7 @@ def stack_head_results(
Assumes that the model has been run with use_attn_results=True
Args:
layer (int): Layer index - heads at all layers strictly before this are included. layer must be in [1, n_layers]
layer (int): Layer index - heads at all layers strictly before this are included. layer must be in [1, n_layers-1], or any of (n_layers, -1, None), which all mean the final layer
return_labels (bool, optional): Whether to also return a list of labels of the form "L0H0" for the heads. Defaults to False.
incl_remainder (bool, optional): Whether to return a final term which is "the rest of the residual stream". Defaults to False.
pos_slice (Slice): A slice object to apply to the pos dimension. Defaults to None, do nothing.
@@ -325,6 +325,31 @@ def stack_head_results(
return components, labels
else:
return components

def stack_activation(
self,
activation_name: str,
layer: int = -1,
sublayer_type: Optional[str] = None,
) -> TT[T.layers_covered, ...]:
"""Returns a stack of all head results (ie residual stream contribution) up to layer L. A good way to decompose the outputs of attention layers into attribution by specific heads. The output shape is exactly the same shape as the activations, just with a leading layers dimension.
Args:
activation_name (str): The name of the activation to be stacked
layer (int): Layer index - heads at all layers strictly before this are included. layer must be in [1, n_layers-1], or any of (n_layers, -1, None), which all mean the final layer
sublayer_type (str, *optional*): The sub layer type of the activation, passed to utils.get_act_name. Can normally be inferred
incl_remainder (bool, optional): Whether to return a final term which is "the rest of the residual stream". Defaults to False.
"""

if layer is None or layer == -1:
# Default to the residual stream immediately pre unembed
layer = self.model.cfg.n_layers

components = []
for l in range(layer):
components.append(self[(activation_name, l, sublayer_type)])

return torch.stack(components, dim=0)

def get_neuron_results(
self,
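
A minimal usage sketch of the two ActivationCache changes above (an editor's illustration, not part of the commit): it assumes an arbitrary small checkpoint ("gpt2") and prompt, that run_with_cache and set_use_attn_result are the existing HookedTransformer APIs for caching and enabling per-head results, and that "mlp_out" is a standard activation name accepted by utils.get_act_name.

from transformer_lens import HookedTransformer

model = HookedTransformer.from_pretrained("gpt2")  # illustrative model choice
model.set_use_attn_result(True)  # stack_head_results needs per-head results in the cache
logits, cache = model.run_with_cache("The Eiffel Tower is in")

# New default: layer=-1 / None means the final layer, so no argument is needed.
per_head_resid, labels = cache.stack_head_results(return_labels=True)

# New helper: stack one named activation across layers; leading dimension is layers.
mlp_outs = cache.stack_activation("mlp_out")  # [n_layers, batch, pos, d_model]
print(per_head_resid.shape, mlp_outs.shape, labels[:3])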
1 change: 1 addition & 0 deletions transformer_lens/__init__.py
@@ -11,6 +11,7 @@
from .ActivationCache import ActivationCache
from .HookedTransformer import HookedTransformer
from . import loading_from_pretrained as loading
from . import patching
from . import train

from .past_key_value_caching import (
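
The patching module imported above holds the new activation patching utilities. A rough sketch of the workflow it supports (an editor's illustration; the helper name get_act_patch_resid_pre and its signature are assumptions here, so check transformer_lens/patching.py and the demo notebook for the actual API): cache activations on a clean prompt, run a corrupted prompt, patch the clean residual stream back in one (layer, position) at a time, and score each patch with a metric on the patched logits.

from transformer_lens import HookedTransformer, patching

model = HookedTransformer.from_pretrained("gpt2")  # illustrative model choice
clean_tokens = model.to_tokens("When John and Mary went to the store, John gave a drink to")
corrupted_tokens = model.to_tokens("When John and Mary went to the store, Mary gave a drink to")

_, clean_cache = model.run_with_cache(clean_tokens)

def patching_metric(patched_logits):
    # Toy metric: logit of the clean answer token (" Mary") at the final position.
    return patched_logits[0, -1, model.to_single_token(" Mary")]

# Assumed helper: patches clean resid_pre into the corrupted run per (layer, position).
results = patching.get_act_patch_resid_pre(model, corrupted_tokens, clean_cache, patching_metric)
print(results.shape)  # expected shape [n_layers, seq_len]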