
Commit b2c821f

Add replace and zero activations hooks (#111)
1 parent 9a0052a · commit b2c821f

5 files changed: +181 -26 lines changed
sparse_autoencoder/source_model/replace_activations_hook.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
"""Replace activations hook."""
from typing import TYPE_CHECKING

from torch import Tensor
from transformer_lens.hook_points import HookPoint

from sparse_autoencoder.autoencoder.abstract_autoencoder import AbstractAutoencoder


if TYPE_CHECKING:
    from sparse_autoencoder.tensor_types import InputOutputActivationBatch


def replace_activations_hook(
    value: Tensor,
    hook: HookPoint,  # noqa: ARG001
    sparse_autoencoder: AbstractAutoencoder,
) -> Tensor:
    """Replace activations hook.

    Args:
        value: The activations to replace.
        hook: The hook point.
        sparse_autoencoder: The sparse autoencoder. This should be pre-initialised with
            `functools.partial`.

    Returns:
        Replaced activations.
    """
    # Squash to just have a "*items" and a "batch" dimension
    original_shape = value.shape
    squashed_value: InputOutputActivationBatch = value.view(-1, value.size(-1))

    # Get the output activations from a forward pass of the SAE
    _learned_activations, output_activations = sparse_autoencoder.forward(squashed_value)

    # Reshape to the original shape
    return output_activations.view(*original_shape)
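
A minimal usage sketch follows (not part of the commit diff): it shows how this hook might be attached with `functools.partial`, as the docstring suggests. The model, SAE sizing, and the `blocks.0.hook_mlp_out` hook point are taken from the test added later in this commit; treat them as illustrative choices rather than prescribed values.

from functools import partial

from transformer_lens import HookedTransformer

from sparse_autoencoder.autoencoder.model import SparseAutoencoder
from sparse_autoencoder.source_model.replace_activations_hook import replace_activations_hook

# Load a small source model and an (untrained) SAE sized to the hooked activation width.
model = HookedTransformer.from_pretrained("tiny-stories-1M")
autoencoder = SparseAutoencoder(model.cfg.d_model, model.cfg.d_model * 2)

# Forward pass in which the hooked activations are replaced by the SAE's reconstruction.
loss = model.run_with_hooks(
    model.to_tokens("Hello world"),
    return_type="loss",
    fwd_hooks=[
        (
            "blocks.0.hook_mlp_out",
            partial(replace_activations_hook, sparse_autoencoder=autoencoder),
        )
    ],
)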

sparse_autoencoder/source_model/store_activations_hook.py

Lines changed: 27 additions & 26 deletions
@@ -15,32 +15,33 @@ def store_activations_hook(
     Useful for getting just the specific activations wanted, rather than the full cache.
 
     Example:
-    First we'll need a source model from TransformerLens and an activation store.
-
-    >>> from functools import partial
-    >>> from transformer_lens import HookedTransformer
-    >>> from sparse_autoencoder.activation_store.list_store import ListActivationStore
-    >>> store = ListActivationStore()
-    >>> model = HookedTransformer.from_pretrained("tiny-stories-1M")
-    Loaded pretrained model tiny-stories-1M into HookedTransformer
-
-    Next we can add the hook to specific neurons (in this case the first MLP neurons), and create
-    the tokens for a forward pass.
-
-    >>> model.add_hook(
-    ...     "blocks.0.mlp.hook_post", partial(store_activations_hook, store=store)
-    ... )
-    >>> tokens = model.to_tokens("Hello world")
-    >>> tokens.shape
-    torch.Size([1, 3])
-
-    Then when we run the model, we should get one activation vector for each token (as we just have
-    one batch item). Note we also set `stop_at_layer=1` as we don't need the logits or any other
-    activations after the hook point that we've specified (in this case the first MLP layer).
-
-    >>> _output = model.forward("Hello world", stop_at_layer=1) # Change this layer as required
-    >>> len(store)
-    3
+        First we'll need a source model from TransformerLens and an activation store.
+
+        >>> from functools import partial
+        >>> from transformer_lens import HookedTransformer
+        >>> from sparse_autoencoder.activation_store.list_store import ListActivationStore
+        >>> store = ListActivationStore()
+        >>> model = HookedTransformer.from_pretrained("tiny-stories-1M")
+        Loaded pretrained model tiny-stories-1M into HookedTransformer
+
+        Next we can add the hook to specific neurons (in this case the first MLP neurons), and
+        create the tokens for a forward pass.
+
+        >>> model.add_hook(
+        ...     "blocks.0.mlp.hook_post", partial(store_activations_hook, store=store)
+        ... )
+        >>> tokens = model.to_tokens("Hello world")
+        >>> tokens.shape
+        torch.Size([1, 3])
+
+        Then when we run the model, we should get one activation vector for each token (as we just
+        have one batch item). Note we also set `stop_at_layer=1` as we don't need the logits or any
+        other activations after the hook point that we've specified (in this case the first MLP
+        layer).
+
+        >>> _output = model.forward("Hello world", stop_at_layer=1) # Change this layer as required
+        >>> len(store)
+        3
 
     Args:
         value: The activations to store.
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
"""Replace activations hook tests."""
from functools import partial

import torch
from transformer_lens import HookedTransformer

from sparse_autoencoder.autoencoder.model import SparseAutoencoder
from sparse_autoencoder.source_model.replace_activations_hook import replace_activations_hook
from sparse_autoencoder.tensor_types import BatchTokenizedPrompts


def test_hook_replaces_activations() -> None:
    """Test that the hook replaces activations."""
    torch.random.manual_seed(0)
    source_model = HookedTransformer.from_pretrained("tiny-stories-1M", device="cpu")
    autoencoder = SparseAutoencoder(source_model.cfg.d_model, source_model.cfg.d_model * 2)

    tokens: BatchTokenizedPrompts = source_model.to_tokens("Hello world")
    loss_without_hook = source_model.forward(tokens, return_type="loss")
    loss_with_hook = source_model.run_with_hooks(
        tokens,
        return_type="loss",
        fwd_hooks=[
            (
                "blocks.0.hook_mlp_out",
                partial(replace_activations_hook, sparse_autoencoder=autoencoder),
            )
        ],
    )

    # Check it decreases performance (as the SAE is untrained, so it will output nonsense).
    assert torch.all(torch.gt(loss_with_hook, loss_without_hook))
Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
"""Test the zero ablate hook."""
import pytest
import torch
from transformer_lens.hook_points import HookPoint

from sparse_autoencoder.source_model.zero_ablate_hook import zero_ablate_hook


class MockHookPoint(HookPoint):
    """Mock HookPoint class."""


@pytest.fixture()
def mock_hook_point() -> MockHookPoint:
    """Fixture to provide a mock HookPoint instance."""
    return MockHookPoint()


def test_zero_ablate_hook_with_standard_tensor(mock_hook_point: MockHookPoint) -> None:
    """Test zero_ablate_hook with a standard tensor.

    Args:
        mock_hook_point: A mock HookPoint instance.
    """
    value = torch.ones(3, 4)
    expected = torch.zeros(3, 4)
    result = zero_ablate_hook(value, mock_hook_point)
    assert torch.equal(result, expected), "The output tensor should contain only zeros."


@pytest.mark.parametrize("shape", [(10,), (5, 5), (2, 3, 4)])
def test_zero_ablate_hook_with_various_shapes(
    mock_hook_point: MockHookPoint, shape: tuple[int, ...]
) -> None:
    """Test zero_ablate_hook with tensors of various shapes.

    Args:
        mock_hook_point: A mock HookPoint instance.
        shape: A tuple representing the shape of the tensor.
    """
    value = torch.ones(*shape)
    expected = torch.zeros(*shape)
    result = zero_ablate_hook(value, mock_hook_point)
    assert torch.equal(
        result, expected
    ), f"The output tensor should be of shape {shape} with zeros."


def test_float_dtype_maintained(mock_hook_point: MockHookPoint) -> None:
    """Test that the float dtype is maintained.

    Args:
        mock_hook_point: A mock HookPoint instance.
    """
    value = torch.ones(3, 4, dtype=torch.float)
    result = zero_ablate_hook(value, mock_hook_point)
    assert result.dtype == torch.float, "The output tensor should be of dtype float."
sparse_autoencoder/source_model/zero_ablate_hook.py

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
"""Zero ablate hook."""
import torch
from torch import Tensor
from transformer_lens.hook_points import HookPoint


def zero_ablate_hook(
    value: Tensor,
    hook: HookPoint,  # noqa: ARG001
) -> Tensor:
    """Zero ablate hook.

    Args:
        value: The activations to zero ablate.
        hook: The hook point.

    Example:
        >>> dummy_hook_point = HookPoint()
        >>> value = torch.ones(2, 3)
        >>> zero_ablate_hook(value, dummy_hook_point)
        tensor([[0., 0., 0.],
                [0., 0., 0.]])

    Returns:
        Zeroed activations.
    """
    return torch.zeros_like(value)
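
As a companion sketch (again, not part of the commit diff), the zero ablate hook can be passed straight to `run_with_hooks`, since it takes no extra arguments. The model and the `blocks.0.hook_mlp_out` hook point below are illustrative, mirroring the replace-hook test in this commit.

from transformer_lens import HookedTransformer

from sparse_autoencoder.source_model.zero_ablate_hook import zero_ablate_hook

model = HookedTransformer.from_pretrained("tiny-stories-1M")
tokens = model.to_tokens("Hello world")

# Baseline loss without any intervention.
baseline_loss = model.forward(tokens, return_type="loss")

# Loss with the first block's MLP output zero-ablated.
ablated_loss = model.run_with_hooks(
    tokens,
    return_type="loss",
    fwd_hooks=[("blocks.0.hook_mlp_out", zero_ablate_hook)],
)

# Zeroing the activations usually degrades performance, so ablated_loss is typically
# higher than baseline_loss.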

0 commit comments