@@ -3,9 +3,10 @@

 import torch

-from sparse_autoencoder.loss.abstract_loss import AbstractLoss
+from sparse_autoencoder.loss.abstract_loss import AbstractLoss, LossLogType, LossReductionType
 from sparse_autoencoder.tensor_types import (
     InputOutputActivationBatch,
+    ItemTensor,
     LearnedActivationBatch,
     TrainBatchStatistic,
 )
@@ -23,13 +24,21 @@ class LearnedActivationsL1Loss(AbstractLoss):
         >>> learned_activations = torch.tensor([[2.0, -3], [2.0, -3]])
         >>> unused_activations = torch.zeros_like(learned_activations)
         >>> # Returns loss and metrics to log
-        >>> l1_loss(unused_activations, learned_activations, unused_activations)
-        (tensor(0.5000), {'LearnedActivationsL1Loss': 0.5})
+        >>> l1_loss(unused_activations, learned_activations, unused_activations)[0]
+        tensor(0.5000)
     """

     l1_coefficient: float
     """L1 coefficient."""

+    def log_name(self) -> str:
+        """Log name.
+
+        Returns:
+            Name of the loss module for logging.
+        """
+        return "learned_activations_l1_loss_penalty"
+
     def __init__(self, l1_coefficient: float) -> None:
         """Initialize the absolute error loss.

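The updated doctest now checks only the returned loss tensor. As a sanity check on that value, here is a minimal standalone reproduction of the arithmetic in plain torch. The l1_loss constructor call sits above this hunk, so the coefficient of 0.1 is an assumption, inferred as the value that reproduces tensor(0.5000):

import torch

l1_coefficient = 0.1  # assumed; the construction line is not shown in this hunk
learned_activations = torch.tensor([[2.0, -3], [2.0, -3]])

# Per-item L1 norm: |2| + |-3| = 5 for each batch item.
itemwise_l1 = torch.abs(learned_activations).sum(dim=-1)  # tensor([5., 5.])

# Mean over the batch, scaled by the coefficient: 5 * 0.1 = 0.5.
penalty = itemwise_l1.mean() * l1_coefficient
print(penalty)  # tensor(0.5000)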
@@ -42,11 +51,33 @@ def __init__(self, l1_coefficient: float) -> None:
         self.l1_coefficient = l1_coefficient
         super().__init__()

-    def forward(
+    def _l1_loss(
         self,
         source_activations: InputOutputActivationBatch,  # noqa: ARG002
         learned_activations: LearnedActivationBatch,
         decoded_activations: InputOutputActivationBatch,  # noqa: ARG002
+    ) -> tuple[TrainBatchStatistic, TrainBatchStatistic]:
+        """Learned activations L1 (absolute error) loss.
+
+        Args:
+            source_activations: Source activations (input activations to the autoencoder from the
+                source model).
+            learned_activations: Learned activations (intermediate activations in the autoencoder).
+            decoded_activations: Decoded activations.
+
+        Returns:
+            Tuple of itemwise absolute loss, and itemwise absolute loss multiplied by the l1
+            coefficient.
+        """
+        absolute_loss = torch.abs(learned_activations).sum(dim=-1)
+        absolute_loss_penalty = absolute_loss * self.l1_coefficient
+        return absolute_loss, absolute_loss_penalty
+
+    def forward(
+        self,
+        source_activations: InputOutputActivationBatch,
+        learned_activations: LearnedActivationBatch,
+        decoded_activations: InputOutputActivationBatch,
     ) -> TrainBatchStatistic:
         """Learned activations L1 (absolute error) loss.

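The refactor moves the computation into a private _l1_loss helper that returns the per-item loss and its scaled penalty as a pair. A minimal sketch of the helper's arithmetic on a toy batch, with illustrative names and values only:

import torch

l1_coefficient = 0.01  # illustrative value
learned_activations = torch.tensor([[1.0, -2.0, 0.0], [0.5, 0.0, -0.5]])

# Per-item L1 norm, summed over the learned feature dimension.
absolute_loss = torch.abs(learned_activations).sum(dim=-1)  # tensor([3., 1.])

# The same per-item values scaled by the coefficient.
absolute_loss_penalty = absolute_loss * l1_coefficient  # tensor([0.0300, 0.0100])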
@@ -59,9 +90,48 @@ def forward(
         Returns:
             Loss per batch item.
         """
-        absolute_loss = torch.abs(learned_activations)
+        return self._l1_loss(source_activations, learned_activations, decoded_activations)[1]
+
+    # Override to add both the loss and the penalty to the log
+    def batch_scalar_loss_with_log(
+        self,
+        source_activations: InputOutputActivationBatch,
+        learned_activations: LearnedActivationBatch,
+        decoded_activations: InputOutputActivationBatch,
+        reduction: LossReductionType = LossReductionType.MEAN,
+    ) -> tuple[ItemTensor, LossLogType]:
+        """Learned activations L1 (absolute error) loss, with log.
+
+        Args:
+            source_activations: Source activations (input activations to the autoencoder from the
+                source model).
+            learned_activations: Learned activations (intermediate activations in the autoencoder).
+            decoded_activations: Decoded activations.
+            reduction: Loss reduction type. Typically you would choose LossReductionType.MEAN to
+                make the loss independent of the batch size.
+
+        Returns:
+            Tuple of the L1 absolute error batch scalar loss and a dict of the properties to log
+            (loss before and after the l1 coefficient).
+        """
+        absolute_loss, absolute_loss_penalty = self._l1_loss(
+            source_activations, learned_activations, decoded_activations
+        )
+
+        match reduction:
+            case LossReductionType.MEAN:
+                batch_scalar_loss = absolute_loss.mean().squeeze()
+                batch_scalar_loss_penalty = absolute_loss_penalty.mean().squeeze()
+            case LossReductionType.SUM:
+                batch_scalar_loss = absolute_loss.sum().squeeze()
+                batch_scalar_loss_penalty = absolute_loss_penalty.sum().squeeze()
+
+        metrics = {
+            "learned_activations_l1_loss": batch_scalar_loss.item(),
+            self.log_name(): batch_scalar_loss_penalty.item(),
+        }

-        return absolute_loss.sum(dim=-1) * self.l1_coefficient
+        return batch_scalar_loss_penalty, metrics

     def extra_repr(self) -> str:
         """Extra representation string."""