
Commit 55566bb

Implement Capacity Metric and W&B Histogram Logging (#48)
1 parent 7464977 commit 55566bb

File tree

5 files changed: +142 -1 lines changed


.vscode/cspell.json

Lines changed: 3 additions & 0 deletions

@@ -23,6 +23,7 @@
 "dunder",
 "earlyterminate",
 "einops",
+"einsum",
 "endoftext",
 "gelu",
 "githistory",
@@ -55,6 +56,7 @@
 "penality",
 "polysemantic",
 "polysemantically",
+"polysemanticity",
 "precommit",
 "pyproject",
 "pyright",
@@ -69,6 +71,7 @@
 "randperm",
 "relu",
 "resid",
+"rtol",
 "runcap",
 "sharded",
 "snapshottest",

poetry.lock

Lines changed: 2 additions & 1 deletion
Some generated files are not rendered by default.
sparse_autoencoder/train/metrics/capacity.py

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
"""Capacity metrics for sets of learned features."""
import einops
from jaxtyping import Float
from numpy import histogram
import numpy as np
from numpy.typing import NDArray
import torch
from torch import Tensor
import wandb


def calc_capacities(features: Float[Tensor, "n_feats feat_dim"]) -> Float[Tensor, " n_feats"]:
    """Calculate capacities.

    Measure the capacity of a set of features as defined in [Polysemanticity and Capacity in
    Neural Networks](https://arxiv.org/pdf/2210.01892.pdf).

    Capacity intuitively measures the 'proportion of a dimension' assigned to a feature.
    Formally it is the ratio of the squared dot product of a feature with itself to the sum of
    its squared dot products with all features.

    If the features are orthogonal, the capacity is 1. If they are all the same, the capacity is
    1/n.

    Example:
        >>> import torch
        >>> orthogonal_features = torch.tensor([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])
        >>> orthogonal_caps = calc_capacities(orthogonal_features)
        >>> orthogonal_caps
        tensor([1., 1., 1.])

    Args:
        features: A collection of features.

    Returns:
        A 1D tensor of capacities, where each element is the capacity of the corresponding
        feature.
    """
    # Squared dot products between every pair of features (n_feats x n_feats).
    squared_dot_products = (
        einops.einsum(
            features, features, "n_feats1 feat_dim, n_feats2 feat_dim -> n_feats1 n_feats2"
        )
        ** 2
    )
    sum_of_sq_dot = squared_dot_products.sum(dim=-1)
    return torch.diag(squared_dot_products) / sum_of_sq_dot


def wandb_capacities_histogram(
    capacities: Float[Tensor, " n_feats"],
) -> wandb.Histogram:
    """Create a W&B histogram of the capacities.

    This can be logged with Weights & Biases using e.g. `wandb.log({"capacities_histogram":
    wandb_capacities_histogram(capacities)})`.

    Args:
        capacities: Capacity of each feature. Can be calculated using :func:`calc_capacities`.

    Returns:
        Weights & Biases histogram for logging with `wandb.log`.
    """
    numpy_capacities: NDArray[np.float_] = capacities.detach().cpu().numpy()

    # `numpy.histogram` returns (counts, bin_edges), which is the pair `np_histogram` expects.
    counts, bin_edges = histogram(numpy_capacities, bins=20, range=(0, 1))
    return wandb.Histogram(np_histogram=(counts, bin_edges))
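A minimal usage sketch (not part of the commit), assuming the learned feature directions are the rows of a decoder weight matrix with shape (n_feats, feat_dim) and that a Weights & Biases run is already active; `decoder_weight` and the project name are hypothetical:

import torch
import wandb

from sparse_autoencoder.train.metrics.capacity import calc_capacities, wandb_capacities_histogram

wandb.init(project="sparse-autoencoder-demo")  # hypothetical project name

# Stand-in for the autoencoder's learned feature directions (n_feats x feat_dim).
decoder_weight = torch.randn(512, 64)

capacities = calc_capacities(decoder_weight)
wandb.log({"capacities_histogram": wandb_capacities_histogram(capacities)})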
Syrupy snapshot file

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
# serializer version: 1
# name: test_wandb_capacity_histogram
  list([
    0,
    0,
    1,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    1,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    3,
  ])
# ---
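The 20 counts above are what `numpy.histogram` produces for the test capacities [0.5, 0.1, 1, 1, 1] with 20 bins over the range (0, 1): 0.1 falls in the third bin (index 2), 0.5 in the eleventh (index 10), and the three values of 1.0 in the final, right-closed bin (index 19). A quick check (not part of the commit):

import numpy as np

capacities = np.array([0.5, 0.1, 1.0, 1.0, 1.0])
counts, bin_edges = np.histogram(capacities, bins=20, range=(0, 1))
print(counts.tolist())
# [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 3]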
Test file for the capacity metrics

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
"""Tests for the capacity calculation and histogram creation."""

import math

from jaxtyping import Float
import pytest
from syrupy.session import SnapshotSession
import torch
from torch import Tensor

from sparse_autoencoder.train.metrics.capacity import calc_capacities, wandb_capacities_histogram


@pytest.mark.parametrize(
    ("features", "expected_capacities"),
    [
        (
            torch.tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]),
            torch.tensor([1.0, 1.0]),
        ),
        (
            torch.tensor([[-0.8, -0.8, -0.8], [-0.8, -0.8, -0.8]]),
            torch.ones(2) / 2,
        ),
        (
            torch.tensor(
                [[1.0, 0.0, 0.0], [1 / math.sqrt(2), 1 / math.sqrt(2), 0.0], [0.0, 0.0, 1.0]]
            ),
            torch.tensor([2 / 3, 2 / 3, 1.0]),
        ),
    ],
)
def test_calc_capacities(
    features: Float[Tensor, "n_feats feat_dim"], expected_capacities: Float[Tensor, " n_feats"]
) -> None:
    """Check that the capacity calculation is correct."""
    capacities = calc_capacities(features)
    assert torch.allclose(
        capacities, expected_capacities, rtol=1e-3
    ), "Capacity calculation is incorrect."


def test_wandb_capacity_histogram(snapshot: SnapshotSession) -> None:
    """Check the Weights & Biases histogram is created correctly."""
    capacities = torch.tensor([0.5, 0.1, 1, 1, 1])
    res = wandb_capacities_histogram(capacities)

    assert res.histogram == snapshot
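The third parametrized case can be checked by hand: for f1 = [1, 0, 0], f2 = [1/√2, 1/√2, 0] and f3 = [0, 0, 1], the squared dot products of f1 with (f1, f2, f3) are 1, 1/2 and 0, giving a capacity of 1 / (1 + 1/2) = 2/3; f2 gets 2/3 by symmetry, and f3 is orthogonal to both, so its capacity is 1. A quick reproduction of that expectation with a plain matrix product instead of einops (not part of the commit):

import math

import torch

features = torch.tensor(
    [[1.0, 0.0, 0.0], [1 / math.sqrt(2), 1 / math.sqrt(2), 0.0], [0.0, 0.0, 1.0]]
)
squared_dot_products = (features @ features.T) ** 2  # pairwise squared dot products
capacities = torch.diag(squared_dot_products) / squared_dot_products.sum(dim=-1)
print(capacities)  # tensor([0.6667, 0.6667, 1.0000])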
