Add feature density metric (#11)

alan-cooney · web-flow · commit a2349ebefe67 · 2023-11-06T10:50:15.000+08:00
diff --git a/sparse_autoencoder/train/metrics/feature_density.py b/sparse_autoencoder/train/metrics/feature_density.py
@@ -0,0 +1,60 @@
+"""Feature density metrics & histogram."""
+
+import einops
+from jaxtyping import Float
+from numpy import histogram
+import numpy as np
+from numpy.typing import NDArray
+import torch
+from torch import Tensor
+import wandb
+
+
+def calc_feature_density(
+    activations: Float[Tensor, "sample activation"], threshold: float = 0.001
+) -> Float[Tensor, " activation"]:
+    """Calculate the fraction of samples in which each feature was active.
+
+    A feature is active (i.e. the neuron has "fired") when it is strictly above `threshold`.
+
+    Example:
+        >>> import torch
+        >>> activations = torch.tensor([[0.5, 0.5, 0.0], [0.5, 0.0, 0.0001]])
+        >>> calc_feature_density(activations).tolist()
+        [1.0, 0.5, 0.0]
+
+    Args:
+        activations: Sample of cached activations (the Autoencoder's learned features).
+        threshold: Threshold for considering a feature active (i.e. the neuron has "fired"). This
+            should be close to zero.
+
+    Returns:
+        Fraction of samples (between 0 and 1) in which each feature was active, as float64.
+    """
+    has_fired: Float[Tensor, "sample activation"] = torch.gt(activations, threshold).to(
+        # Use float as einops requires this (64 as some features are very sparse)
+        dtype=torch.float64
+    )
+
+    return einops.reduce(has_fired, "sample activation -> activation", "mean")
+
+
+def wandb_feature_density_histogram(
+    feature_density: Float[Tensor, " activation"],
+) -> wandb.Histogram:
+    """Create a W&B histogram of the feature density.
+
+    This can be logged with Weights & Biases using e.g. `wandb.log({"feature_density_histogram":
+    wandb_feature_density_histogram(feature_density)})`.
+
+    Args:
+        feature_density: Fraction of samples in which each feature was active. Can be calculated
+            using :func:`calc_feature_density`.
+
+    Returns:
+        Weights & Biases histogram for logging with `wandb.log`.
+    """
+    numpy_feature_density: NDArray[np.float64] = feature_density.detach().cpu().numpy()
+
+    counts, bin_edges = histogram(numpy_feature_density, bins="auto")  # (hist, bin_edges)
+    return wandb.Histogram(np_histogram=(counts, bin_edges))
diff --git a/sparse_autoencoder/train/metrics/tests/test_feature_density.py b/sparse_autoencoder/train/metrics/tests/test_feature_density.py
@@ -0,0 +1,31 @@
+"""Test the feature density metric."""
+
+import torch
+
+from sparse_autoencoder.train.metrics.feature_density import (
+    calc_feature_density,
+    wandb_feature_density_histogram,
+)
+
+
+def test_calc_feature_density() -> None:
+    """Check that the feature density matches an alternative way of doing the calc."""
+    activations = torch.tensor([[0.5, 0.5, 0.0], [0.5, 0.0, 0.0001], [0.0, 0.1, 0.0]])
+
+    # Use different approach to check (the same threshold is passed to the function under test)
+    threshold = 0.01
+    above_threshold = activations > threshold
+    expected = above_threshold.sum(dim=0, dtype=torch.float64) / above_threshold.shape[0]
+
+    res = calc_feature_density(activations, threshold=threshold)
+    assert torch.allclose(res, expected), "Output does not match the expected result."
+
+
+def test_wandb_feature_density_histogram() -> None:
+    """Check the first bin of the Weights & Biases histogram holds the expected count."""
+    feature_density = torch.tensor([0.001, 0.001, 0.001, 0.5, 0.5, 1.0])
+    res = wandb_feature_density_histogram(feature_density)
+
+    # The three 0.001 entries are the smallest values, so the first bin should count all of them
+    expected_first_bin_value = 3
+    assert res.histogram[0] == expected_first_bin_value
diff --git a/sparse_autoencoder/train/train_autoencoder.py b/sparse_autoencoder/train/train_autoencoder.py
@@ -65,15 +65,14 @@ def train_autoencoder(
                 l1_loss_learned_activations,
                 sweep_parameters.l1_coefficient,
             )
-            # TODO: Log dead neurons metric (get_frequencies in Neel's code)
+
+            # TODO: Store the learned activations (default every 25k steps)
 
             # Backwards pass
             total_loss.backward()
 
             optimizer.step()
 
-            # TODO: Enable neuron resampling here
-
             # Log
             if step % log_interval == 0 and wandb.run is not None:
                 wandb.log(
@@ -84,6 +83,10 @@ def train_autoencoder(
                     },
                 )
 
+            # TODO: Get the feature density & also log to wandb
+
+            # TODO: Apply neuron resampling if enabled
+
             progress_bar.update(batch_size)
 
         progress_bar.close()