# Copyright (c) 2020. Lightly AG and its affiliates.
# All Rights Reserved

- from enum import Enum
- from typing import Optional, Tuple
+ from typing import Optional

import torch
import torch.nn.functional as F
from lightly.utils import dist

- def divide_no_nan(numerator: Tensor, denominator: Tensor) -> Tensor:
-     """Performs tensor division, setting result to zero where denominator is zero.
-
-     Args:
-         numerator:
-             Numerator tensor.
-         denominator:
-             Denominator tensor with possible zeroes.
-
-     Returns:
-         Result with zeros where denominator is zero.
-     """
-     result = torch.zeros_like(numerator)
-     nonzero_mask = denominator != 0
-     result[nonzero_mask] = numerator[nonzero_mask] / denominator[nonzero_mask]
-     return result
-
-
- class ContrastMode(Enum):
-     """Contrast Mode Enum for SupCon Loss.
-
-     Offers the three contrast modes as enum for the SupCon loss. The three modes are:
-
-     - ContrastMode.ALL: Uses all positives and negatives.
-     - ContrastMode.ONE_POSITIVE: Uses only one positive, and all negatives.
-     - ContrastMode.ONLY_NEGATIVES: Uses no positives, only negatives.
-     """
-
-     ALL = 1
-     ONE_POSITIVE = 2
-     ONLY_NEGATIVES = 3
-
-
- VALID_CONTRAST_MODES = set(item.name for item in ContrastMode)
-
-
class SupConLoss(nn.Module):
    """Implementation of the Supervised Contrastive Loss.

@@ -61,64 +24,55 @@ class SupConLoss(nn.Module):
    Attributes:
        temperature:
            Scale logits by the inverse of the temperature.
-         contrast_mode:
-             Whether to use all positives, one positive, or none. All negatives are
-             used in all cases.
        gather_distributed:
            If True then negatives from all GPUs are gathered before the
-             loss calculation.
-
+             loss calculation.
+         rescale:
+             Optionally rescale the final loss by the temperature for stability.
    Raises:
        ValueError: If abs(temperature) < 1e-8 to prevent divide by zero.
-         ValueError: If gather_distributed is True but torch.distributed is not available.
-         ValueError: If contrast_mode is outside the accepted ContrastMode values.

    Examples:
-         >>> # initialize loss function
-         >>> loss_fn = SupConLoss()
+         >>> # initialize loss function
+         >>> loss_fn = SupConLoss()
        >>>
-         >>> # generate two or more views of images
+         >>> # generate two random transforms of images
        >>> t0 = transforms(images)
        >>> t1 = transforms(images)
        >>>
-         >>> # feed through SimCLR model
+         >>> # feed through SimCLR or MoCo model
        >>> out0, out1 = model(t0), model(t1)
        >>>
-         >>> # Stack views along the 2nd dimension
-         >>> features = torch.stack([out0, out1], dim=1)
-         >>>
        >>> # calculate loss
-         >>> loss = loss_fn(features, labels)
+         >>> loss = loss_fn(out0, out1)

    """
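A minimal supervised usage sketch, assuming the class is exported as lightly.loss.SupConLoss; the tensor shapes, temperature value, and number of classes are illustrative only:

    import torch
    from lightly.loss import SupConLoss  # assumed export path

    loss_fn = SupConLoss(temperature=0.1)
    out0 = torch.randn(8, 128)           # projections of the first view
    out1 = torch.randn(8, 128)           # projections of the second view
    labels = torch.randint(0, 3, (8,))   # one class index per image
    loss = loss_fn(out0, out1, labels=labels)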

    def __init__(
        self,
        temperature: float = 0.5,
-         contrast_mode: ContrastMode = ContrastMode.ALL,
        gather_distributed: bool = False,
+         rescale: bool = True,
    ):
        """Initializes the SupConLoss module with the specified parameters.

        Args:
            temperature:
                Scale logits by the inverse of the temperature.
-             contrast_mode:
-                 Whether to use all positives, one positive, or none. All negatives are
-                 used in all cases.
            gather_distributed:
                If True, negatives from all GPUs are gathered before the loss calculation.
+             rescale:
+                 Optionally rescale the final loss by the temperature for stability.

        Raises:
            ValueError: If temperature is less than 1e-8 to prevent divide by zero.
            ValueError: If gather_distributed is True but torch.distributed is not available.
-             ValueError: If contrast_mode is outside the accepted ContrastMode values.
        """
        super().__init__()
        self.temperature = temperature
-         self.contrast_mode = contrast_mode
-         self.positives_cap = -1  # Unused at the moment
        self.gather_distributed = gather_distributed
+         self.rescale = rescale
        self.cross_entropy = nn.CrossEntropyLoss(reduction="mean")
        self.eps = 1e-8

@@ -133,206 +87,82 @@ def __init__(
13387 "distributed support."
13488 )
13589
136- if contrast_mode .name not in VALID_CONTRAST_MODES :
137- raise ValueError (
138- f"contrast_mode is { contrast_mode } but must be one of ContrastMode.{ VALID_CONTRAST_MODES } "
139- )
140-
-     def forward(self, features: Tensor, labels: Optional[Tensor] = None) -> Tensor:
+     def forward(
+         self, out0: Tensor, out1: Tensor, labels: Optional[Tensor] = None
+     ) -> Tensor:
        """Forward pass through Supervised Contrastive Loss.

        Computes the loss based on contrast_mode setting.

        Args:
-             features:
-                 Tensor of at least 3 dimensions, corresponding to
-                 (batch_size, num_views, ...)
+             out0:
+                 Output projections of the first set of transformed images.
+                 Shape: (batch_size, embedding_size)
+             out1:
+                 Output projections of the second set of transformed images.
+                 Shape: (batch_size, embedding_size)
            labels:
-                 Onehot labels for each sample. Must match shape
-                 (batch_size, num_classes)
-
-         Raises:
-             ValueError: If features does not have at least 3 dimensions.
-             ValueError: If number of labels does not match batch_size.
-             ValueError: If labels is not one-hot encoded.
+                 Class labels for each sample. Must be a vector of length `batch_size`.

        Returns:
            Supervised Contrastive Loss value.
        """
+         # Stack the views for efficient computation
+         # Allows for more views to be added easily
+         features = (out0, out1)
+         n_views = len(features)
+         out_small = torch.vstack(features)

-         if len(features.shape) < 3:
-             raise ValueError(
-                 f"Features must have at least 3 dimensions, got {len(features.shape)}."
-             )
-
-         device = features.device
-         batch_size, num_views = features.shape[:2]
-
-         if labels is not None and labels.size(0) != batch_size:
-             raise ValueError(
-                 f"When setting labels, labels must match batch_size {batch_size}, got {labels.size(0)}."
-             )
-
-         if labels is not None:
-             if not self._is_one_hot(labels):
-                 raise ValueError(
-                     "labels must be a 2D matrix representing the one-hot encoded classes."
-                 )
-
-         # Flatten the features in case they are still images or other
-         features = features.flatten(2)
-
-         # Normalize the features to length 1
-         features = F.normalize(features, dim=2)
-
-         # Memory bank could be used here but labelled samples are not yet supported.
+         device = out_small.device
+         batch_size = out_small.shape[0] // n_views

-         # Use cosine similarity (dot product) as all vectors are normalized to unit length
+         # Normalize the output to length 1
+         out_small = nn.functional.normalize(out_small, dim=1)

-         # Use other samples from different classes in batch as negatives
-         # and create diagonal mask that only selects similarities between
-         # views of the same image / same class
+         # Gather hidden representations from other processes if distributed
+         # and compute the diagonal self-contrast mask
        if self.gather_distributed and dist.world_size() > 1:
-             # Gather hidden representations and optional labels from other processes
-             global_features = torch.cat(dist.gather(features), 0)
-             diag_mask = dist.eye_rank(batch_size, device=device)
-             if labels is not None:
-                 global_labels = torch.cat(dist.gather(labels), 0)
+             out_large = torch.cat(dist.gather(out_small), 0)
+             diag_mask = dist.eye_rank(n_views * batch_size, device=device)
        else:
            # Single process
-             global_features = features
-             diag_mask = torch.eye(batch_size, device=device, dtype=torch.bool)
-             if labels is not None:
-                 global_labels = labels
-
-         # Use the diagonal mask if labels is None, else compute the mask based on labels
-         if labels is None:
-             # No labels, typical self-supervised contrastive learning like SimCLR
-             mask = diag_mask
-         else:
-             mask = (labels @ global_labels.T).to(device)
+             out_large = out_small
+             diag_mask = torch.eye(n_views * batch_size, device=device, dtype=torch.bool)

-         # Get features in shape [num_views * batch_size, c]
-         all_global_features = global_features.permute(1, 0, 2).reshape(
-             -1, global_features.size(-1)
-         )
-
-         if self.contrast_mode == ContrastMode.ONE_POSITIVE:
-             # We take only the first view as anchor
-             anchor_features = features[:, 0]
-             num_anchor_views = 1
-         else:
-             # We take all views as anchors in the same shape as the global features
-             anchor_features = features.permute(1, 0, 2).reshape(-1, features.size(-1))
-             num_anchor_views = num_views
-
-         # Obtain the logits between anchor features and features across all processes
-         # Logits will be shaped [local_batch_size * num_anchor_views, global_batch_size * num_views]
-         # We then temperature scale it and subtract the max to improve numerical stability
-         # In the einsum, n is local_batch_size * num_anchor_views, m is global_batch_size * num_views,
-         # and c is the flattened feature length
-         # Note: features are ordered by view first, i.e. first all samples of view 0, then all samples
-         # of view 1, and so on.
-         logits = torch.einsum("nc,mc->nm", anchor_features, all_global_features)
+         # Use cosine similarity (dot product) as all vectors are normalized to unit length
+         # Calculate similarities
+         logits = out_small @ out_large.T
        logits /= self.temperature
-         logits -= logits.max(dim=1, keepdim=True)[0].detach()
-         exp_logits = torch.exp(logits)

-         # Get the positive and negative masks for numerator & denominator
-         positives_mask, negatives_mask = self._create_tiled_masks(
-             mask.long(),
-             diag_mask.long(),
-             num_views,
-             num_anchor_views,
-             self.positives_cap,
-         )
-         num_positives_per_row = positives_mask.sum(dim=1)
+         # Set self-similarities to a very low value so the softmax ignores them
+         logits[diag_mask] = -1e9

-         # Calculate denominator based on contrast_mode
-         if self.contrast_mode == ContrastMode.ONE_POSITIVE:
-             denominator = exp_logits + (exp_logits * negatives_mask).sum(
-                 dim=1, keepdim=True
-             )
-         elif self.contrast_mode == ContrastMode.ALL:
-             denominator = (exp_logits * negatives_mask).sum(dim=1, keepdim=True)
-             denominator += (exp_logits * positives_mask).sum(dim=1, keepdim=True)
-         else:  # ContrastMode.ONLY_NEGATIVES
-             denominator = (exp_logits * negatives_mask).sum(dim=1, keepdim=True)
-
-         # num_positives_per_row can be zero iff 1 view is used. Here we use a safe
-         # dividing method setting those values to zero to prevent division by zero errors.
-
-         # Only implements SupCon_{out}.
-         log_probs = (logits - torch.log(denominator)) * positives_mask
-         log_probs = log_probs.sum(dim=1)
-         log_probs = divide_no_nan(log_probs, num_positives_per_row)
-
-         loss = -log_probs
-
-         # Adjust for num_positives_per_row being zero when using exactly 1 view
-         if num_views != 1:
-             loss = loss.mean(dim=0)
-         else:
-             num_valid_views_per_sample = num_positives_per_row.unsqueeze(0)
-             loss = divide_no_nan(loss, num_valid_views_per_sample).squeeze()
+         # Create labels if None
+         if labels is None:
+             labels = torch.arange(batch_size, device=device, dtype=torch.long)
+             if self.gather_distributed:
+                 labels = labels + dist.rank() * batch_size
+         labels = labels.repeat(n_views)
+
+         # Soft labels are 0 unless the logit represents a similarity
+         # between two of the same classes. We manually set self-similarity
+         # (same view of the same item) to 0. When not 0, the value is
+         # 1 / n, where n is the number of positive samples
+         # (different views of the same item, and all views of other items sharing
+         # classes with the item)
+         soft_labels = torch.eq(labels, labels.view(-1, 1)).float()
+         soft_labels.fill_diagonal_(0.0)
+         soft_labels /= soft_labels.sum(dim=1)
+
+         # Compute log probabilities
+         log_proba = F.log_softmax(logits, dim=-1)
+
+         # Compute soft cross-entropy loss
+         loss = (soft_labels * log_proba).sum(-1)
+         loss = -loss.mean()
+
+         # Optional: rescale for stable training
+         if self.rescale:
+             loss *= self.temperature

        return loss
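To make the soft-label construction in the new forward pass concrete, here is a small sketch of the target matrix for batch_size=2, two views, and no class labels; the values are reproduced by hand under the assumption that the steps match the code above:

    import torch

    labels = torch.arange(2).repeat(2)  # tensor([0, 1, 0, 1])
    soft_labels = torch.eq(labels, labels.view(-1, 1)).float()
    soft_labels.fill_diagonal_(0.0)
    soft_labels /= soft_labels.sum(dim=1)
    # soft_labels is now
    # [[0., 0., 1., 0.],
    #  [0., 0., 0., 1.],
    #  [1., 0., 0., 0.],
    #  [0., 1., 0., 0.]]
    # Each row places all probability mass on the other view of the same image,
    # which reduces to the usual NT-Xent targets when no class labels are given.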
-
-     def _create_tiled_masks(
-         self,
-         untiled_mask: Tensor,
-         diagonal_mask: Tensor,
-         num_views: int,
-         num_anchor_views: int,
-         positives_cap: int,
-     ) -> Tuple[Tensor, Tensor]:
-         # Get total batch size across all processes
-         global_batch_size = untiled_mask.size(1)
-
-         # Find index of the anchor for each sample
-         labels = torch.argmax(diagonal_mask, dim=1)
-
-         # Generate tiled labels across views
-         tiled_labels = []
-         for i in range(num_anchor_views):
-             tiled_labels.append(labels + global_batch_size * i)
-         tiled_labels_tensor = torch.cat(tiled_labels, 0)
-         tiled_diagonal_mask = F.one_hot(
-             tiled_labels_tensor, global_batch_size * num_views
-         )
-
-         # Mask to zero the diagonal at the end
-         all_but_diagonal_mask = 1 - tiled_diagonal_mask
-
-         # All tiled positives
-         uncapped_positives_mask = torch.tile(
-             untiled_mask, [num_anchor_views, num_views]
-         )
-
-         # The negatives are simply the bit-flipped positives
-         negatives_mask = 1.0 - uncapped_positives_mask
-
-         # For when positives_cap is implemented
-         if positives_cap > -1:
-             raise NotImplementedError("Capping positives is not yet implemented.")
-         else:
-             positives_mask = uncapped_positives_mask
-
-         # Zero out the self-contrast
-         positives_mask *= all_but_diagonal_mask
-
-         return positives_mask, negatives_mask
-
-     def _is_one_hot(self, tensor: Tensor) -> bool:
-         # Tensor is not a 2D matrix
-         if tensor.ndim != 2:
-             return False
-
-         # Check values are only 0 or 1
-         is_binary = ((tensor == 0) | (tensor == 1)).all()
-
-         # Check each row sums to 1
-         row_sums = tensor.sum(dim=1)
-         has_single_one = (row_sums == 1).all()
-
-         return bool(is_binary.item() and has_single_one.item())