ai-safety-foundation
diff --git a/‎.vscode/settings.json‎
Lines changed: 1 addition & 1 deletion b/‎.vscode/settings.json‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎poetry.lock‎
Lines changed: 133 additions & 130 deletions b/‎poetry.lock‎
Lines changed: 133 additions & 130 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 3 additions & 0 deletions b/‎pyproject.toml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎sparse_autoencoder/autoencoder/abstract_autoencoder.py‎
Lines changed: 63 additions & 0 deletions b/‎sparse_autoencoder/autoencoder/abstract_autoencoder.py‎
Lines changed: 63 additions & 0 deletions
diff --git a/‎sparse_autoencoder/autoencoder/components/abstract_decoder.py‎
Lines changed: 68 additions & 0 deletions b/‎sparse_autoencoder/autoencoder/components/abstract_decoder.py‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎sparse_autoencoder/autoencoder/components/abstract_encoder.py‎
Lines changed: 85 additions & 0 deletions b/‎sparse_autoencoder/autoencoder/components/abstract_encoder.py‎
Lines changed: 85 additions & 0 deletions
diff --git a/‎sparse_autoencoder/autoencoder/components/abstract_outer_bias.py‎
Lines changed: 40 additions & 0 deletions b/‎sparse_autoencoder/autoencoder/components/abstract_outer_bias.py‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎sparse_autoencoder/autoencoder/components/linear_encoder.py‎
Lines changed: 105 additions & 0 deletions b/‎sparse_autoencoder/autoencoder/components/linear_encoder.py‎
Lines changed: 105 additions & 0 deletions
diff --git a/‎sparse_autoencoder/autoencoder/components/tests/__snapshots__/test_linear_encoder.ambr‎
Lines changed: 9 additions & 0 deletions b/‎sparse_autoencoder/autoencoder/components/tests/__snapshots__/test_linear_encoder.ambr‎
Lines changed: 9 additions & 0 deletions
@@ -43,4 +43,4 @@
   "rewrap.autoWrap.enabled": true,
   "rewrap.wrappingColumn": 100,
   "python.analysis.diagnosticMode": "workspace"
-}
+}
@@ -24,6 +24,7 @@
             pyright=">=1.1.334"
             pytest=">=7"
             pytest-cov=">=4"
+            pytest-timeout="^2.2.0"
             ruff=">=0.1.4"
             syrupy="^4.6.0"
 
@@ -116,6 +117,8 @@
 
 [tool.pytest]
     cache_dir=".cache/pytest"
 
     [tool.pytest.ini_options]
-        addopts="""--jaxtyping-packages=sparse_autoencoder,beartype.beartype --doctest-modules"""
+        # pytest 7/8 only read this table from pyproject.toml; `--durations` is a CLI flag.
+        addopts="""--jaxtyping-packages=sparse_autoencoder,beartype.beartype --doctest-modules --durations=3"""
+        timeout=60
 
@@ -0,0 +1,63 @@
+"""Abstract Sparse Autoencoder Model."""
+from abc import ABC, abstractmethod
+
+from torch.nn import Module
+
+from sparse_autoencoder.autoencoder.components.abstract_decoder import AbstractDecoder
+from sparse_autoencoder.autoencoder.components.abstract_encoder import AbstractEncoder
+from sparse_autoencoder.autoencoder.components.abstract_outer_bias import AbstractOuterBias
+from sparse_autoencoder.tensor_types import (
+    InputOutputActivationBatch,
+    LearnedActivationBatch,
+)
+
+
+class AbstractAutoencoder(Module, ABC):
+    """Interface shared by sparse autoencoder models.
+
+    Concrete models expose an encoder, a decoder and the two outer bias modules as properties, and
+    implement :meth:`forward` and :meth:`reset_parameters`.
+    """
+
+    @property
+    @abstractmethod
+    def encoder(self) -> AbstractEncoder:
+        """Encoder module."""
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def decoder(self) -> AbstractDecoder:
+        """Decoder module."""
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def pre_encoder_bias(self) -> AbstractOuterBias:
+        """Pre-encoder bias module."""
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def post_decoder_bias(self) -> AbstractOuterBias:
+        """Post-decoder bias module."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def forward(
+        self, x: InputOutputActivationBatch
+    ) -> tuple[LearnedActivationBatch, InputOutputActivationBatch]:
+        """Run the autoencoder on a batch of activations.
+
+        Args:
+            x: Input activations (e.g. activations from an MLP layer in a transformer model).
+
+        Returns:
+            Tuple of learned activations and decoded activations.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def reset_parameters(self) -> None:
+        """Reset the parameters of the model."""
+        raise NotImplementedError
@@ -0,0 +1,68 @@
+"""Abstract Decoder."""
+from abc import ABC, abstractmethod
+from typing import final
+
+import torch
+from torch.nn import Module
+
+from sparse_autoencoder.tensor_types import (
+    DeadDecoderNeuronWeightUpdates,
+    DecoderWeights,
+    InputOutputActivationBatch,
+    InputOutputNeuronIndices,
+    LearnedActivationBatch,
+)
+
+
+class AbstractDecoder(Module, ABC):
+    """Base class for decoder modules.
+
+    Typically includes just a :attr:`weight` parameter. Subclasses implement :meth:`forward` and
+    :meth:`reset_parameters`, while :meth:`update_dictionary_vectors` is shared and final.
+    """
+
+    @property
+    @abstractmethod
+    def weight(self) -> DecoderWeights:
+        """Decoder weight."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def forward(self, x: LearnedActivationBatch) -> InputOutputActivationBatch:
+        """Decode a batch of learned activations.
+
+        Args:
+            x: Learned activations.
+
+        Returns:
+            Decoded activations.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def reset_parameters(self) -> None:
+        """Reset the parameters of the decoder."""
+        raise NotImplementedError
+
+    @final
+    def update_dictionary_vectors(
+        self,
+        dictionary_vector_indices: InputOutputNeuronIndices,
+        updated_weights: DeadDecoderNeuronWeightUpdates,
+    ) -> None:
+        """Overwrite selected decoder dictionary vectors in place.
+
+        Replaces the rows of :attr:`weight` selected by the indices with the given values.
+        Typically this is used when resampling neurons (dictionary vectors) that have died.
+
+        Does nothing when no indices are given.
+
+        Args:
+            dictionary_vector_indices: Indices of the dictionary vectors to update.
+            updated_weights: Updated weights for just these dictionary vectors.
+        """
+        # Nothing to overwrite when no indices are given.
+        if len(dictionary_vector_indices) > 0:
+            # Write under no_grad so autograd does not record the in-place assignment.
+            with torch.no_grad():
+                self.weight[dictionary_vector_indices, :] = updated_weights
@@ -0,0 +1,85 @@
+"""Abstract Encoder."""
+from abc import ABC, abstractmethod
+from typing import final
+
+import torch
+from torch.nn import Module
+
+from sparse_autoencoder.tensor_types import (
+    DeadEncoderNeuronWeightUpdates,
+    EncoderWeights,
+    InputOutputActivationBatch,
+    InputOutputNeuronIndices,
+    LearnedActivationBatch,
+    LearntActivationVector,
+)
+
+
+class AbstractEncoder(Module, ABC):
+    """Base class for encoder modules.
+
+    Typically includes :attr:`weight` and :attr:`bias` parameters, as well as an activation
+    function.
+    """
+
+    @property
+    @abstractmethod
+    def weight(self) -> EncoderWeights:
+        """Encoder weight."""
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def bias(self) -> LearntActivationVector:
+        """Encoder bias."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def forward(self, x: InputOutputActivationBatch) -> LearnedActivationBatch:
+        """Encode a batch of input activations.
+
+        Args:
+            x: Input activations.
+
+        Returns:
+            Resulting activations.
+        """
+        raise NotImplementedError
+
+    @final
+    def update_dictionary_vectors(
+        self,
+        dictionary_vector_indices: InputOutputNeuronIndices,
+        updated_dictionary_weights: DeadEncoderNeuronWeightUpdates,
+    ) -> None:
+        """Overwrite selected encoder dictionary vectors in place.
+
+        Replaces the columns of :attr:`weight` selected by the indices with the given values.
+
+        Args:
+            dictionary_vector_indices: Indices of the dictionary vectors to update.
+            updated_dictionary_weights: Updated weights for just these dictionary vectors.
+        """
+        # NOTE(review): this writes along axis 1 of the weight, whereas `LinearEncoder` allocates
+        # its weight as (learnt_features, input_features). Confirm which axis the indices and
+        # updates are meant to address.
+        if len(dictionary_vector_indices) > 0:
+            with torch.no_grad():
+                self.weight[:, dictionary_vector_indices] = updated_dictionary_weights
+
+    @final
+    def update_bias(
+        self,
+        update_parameter_indices: InputOutputNeuronIndices,
+        updated_bias_features: LearntActivationVector | float,
+    ) -> None:
+        """Overwrite selected entries of the encoder bias in place.
+
+        Args:
+            update_parameter_indices: Indices of the bias features to update.
+            updated_bias_features: Updated bias features for just these indices.
+        """
+        # Skip the write entirely when there is nothing to update.
+        if len(update_parameter_indices) > 0:
+            with torch.no_grad():
+                self.bias[update_parameter_indices] = updated_bias_features
@@ -0,0 +1,40 @@
+"""Abstract Outer Bias.
+
+This can be extended to create e.g. a pre-encoder and post-decoder bias.
+"""
+from abc import ABC, abstractmethod
+
+from torch.nn import Module
+
+from sparse_autoencoder.tensor_types import (
+    InputOutputActivationBatch,
+    InputOutputActivationVector,
+)
+
+
+class AbstractOuterBias(Module, ABC):
+    """Base class for pre-encoder and post-decoder bias modules.
+
+    Subclasses provide the :attr:`bias` property and implement :meth:`forward`.
+    """
+
+    @property
+    @abstractmethod
+    def bias(self) -> InputOutputActivationVector:
+        """Bias used by this module.
+
+        May be a reference to a bias parameter in the parent module, if using e.g. a tied bias.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def forward(self, x: InputOutputActivationBatch) -> InputOutputActivationBatch:
+        """Apply the module to a batch of activations.
+
+        Args:
+            x: Input activations (e.g. activations from an MLP layer in a transformer model).
+
+        Returns:
+            Resulting activations.
+        """
+        raise NotImplementedError
@@ -0,0 +1,105 @@
+"""Linear encoder layer."""
+import math
+from typing import final
+
+import einops
+import torch
+from torch.nn import Parameter, ReLU, init
+
+from sparse_autoencoder.autoencoder.components.abstract_encoder import AbstractEncoder
+from sparse_autoencoder.tensor_types import (
+    EncoderWeights,
+    InputOutputActivationBatch,
+    LearnedActivationBatch,
+    LearntActivationVector,
+)
+
+
+@final
+class LinearEncoder(AbstractEncoder):
+    """Linear encoder layer.
+
+    Computes ``ReLU(x @ weight.T + bias)``.
+    """
+
+    _learnt_features: int
+    """Number of learnt features (outputs from this layer)."""
+
+    _input_features: int
+    """Number of input features (inputs to this layer)."""
+
+    _weight: EncoderWeights
+    """Weight parameter, allocated as ``(learnt_features, input_features)``."""
+
+    _bias: LearntActivationVector
+    """Bias parameter, allocated as ``(learnt_features,)``."""
+
+    @property
+    def weight(self) -> EncoderWeights:
+        """Weight."""
+        return self._weight
+
+    @property
+    def bias(self) -> LearntActivationVector:
+        """Bias."""
+        return self._bias
+
+    activation_function: ReLU
+
+    def __init__(
+        self,
+        input_features: int,
+        learnt_features: int,
+    ) -> None:
+        """Initialize the linear encoder layer.
+
+        Args:
+            input_features: Number of features in each input activation vector.
+            learnt_features: Number of learnt features to output.
+        """
+        super().__init__()
+        self._learnt_features = learnt_features
+        self._input_features = input_features
+        self.activation_function = ReLU()
+
+        self._weight = Parameter(torch.empty((learnt_features, input_features)))
+        self._bias = Parameter(torch.zeros(learnt_features))
+
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        """Initialize or reset the parameters."""
+        # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with
+        # uniform(-1/sqrt(in_features), 1/sqrt(in_features)). For details, see
+        # https://github.com/pytorch/pytorch/issues/57109
+        init.kaiming_uniform_(self._weight, a=math.sqrt(5))
+
+        # Bias (approach from nn.Linear)
+        fan_in = self._weight.size(1)
+        bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+        init.uniform_(self._bias, -bound, bound)
+
+    def forward(self, x: InputOutputActivationBatch) -> LearnedActivationBatch:
+        """Forward pass.
+
+        Args:
+            x: Input activations.
+
+        Returns:
+            Learned activations, i.e. ``ReLU(x @ weight.T + bias)``.
+        """
+        # Both operands must name the contracted axis identically; einops sums differently named
+        # axes independently, which would not be a matrix product.
+        pre_activation: LearnedActivationBatch = einops.einsum(
+            x,
+            self.weight,
+            "batch input_output_feature, learnt_feature input_output_feature "
+            "-> batch learnt_feature",
+        )
+
+        # The bias is an additive offset (broadcast over the batch), not a per-feature scale.
+        return self.activation_function(pre_activation + self.bias)
+
+    def extra_repr(self) -> str:
+        """String extra representation of the module."""
+        return f"in_features={self._input_features}, out_features={self._learnt_features}"
@@ -0,0 +1,9 @@
+# serializer version: 1
+# name: test_extra_repr
+  '''
+  LinearEncoder(
+    in_features=10, out_features=5
+    (activation_function): ReLU()
+  )
+  '''
+# ---