Skip to content

Commit c7498f4

Browse files
committed
Implement masking
1 parent 03ae47b commit c7498f4

File tree

4 files changed

+86
-39
lines changed

4 files changed

+86
-39
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "stamp"
3-
version = "2.0.0-dev7"
3+
version = "2.0.0-dev8"
44
authors = [
55
{ name = "Omar El Nahhas", email = "omar.el_nahhas@tu-dresden.de" },
66
{ name = "Marko van Treeck", email = "markovantreeck@gmail.com" },

src/stamp/modeling/lightning_model.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import lightning
77
import numpy as np
8+
import torch
89
from jaxtyping import Float
910
from packaging.version import Version
1011
from torch import Tensor, nn, optim
@@ -71,7 +72,7 @@ def __init__(
7172
# Check if version is compatible.
7273
# This should only happen when the model is loaded,
7374
# otherwise the default value will make these checks pass.
74-
if stamp_version < Version("2.0.0.dev1"):
75+
if stamp_version < Version("2.0.0.dev8"):
7576
# Update this as we change our model in incompatible ways!
7677
raise ValueError(
7778
f"model has been built with stamp version {stamp_version} "
@@ -112,9 +113,14 @@ def _step(
112113
) -> Loss:
113114
_ = batch_idx # unused
114115

115-
bags, _, targets = batch
116+
bags, bag_sizes, targets = batch
116117

117-
logits = self.vision_transformer(bags)
118+
max_possible_bag_size = bags.size(1)
119+
mask = torch.arange(max_possible_bag_size).type_as(bag_sizes).unsqueeze(
120+
0
121+
).repeat(len(bags), 1) >= bag_sizes.unsqueeze(1)
122+
123+
logits = self.vision_transformer(bags, mask=mask)
118124

119125
loss = nn.functional.cross_entropy(
120126
logits, targets.type_as(logits), weight=self.class_weights.type_as(logits)

src/stamp/modeling/vision_transformer.py

Lines changed: 56 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,13 @@
22
In parts from https://github.com/lucidrains/vit-pytorch/blob/main/vit_pytorch/vit.py
33
"""
44

5-
# TODO implement masking
6-
7-
from typing import Iterable, cast
5+
from collections.abc import Iterable
6+
from typing import cast
87

98
import torch
109
from beartype import beartype
1110
from einops import repeat
12-
from jaxtyping import Float, jaxtyped
11+
from jaxtyping import Bool, Float, jaxtyped
1312
from torch import Tensor, nn
1413

1514

@@ -42,10 +41,31 @@ def __init__(
4241

4342
@jaxtyped(typechecker=beartype)
4443
def forward(
45-
self, x: Float[Tensor, "batch sequence proj_feature"]
44+
self,
45+
x: Float[Tensor, "batch sequence proj_feature"],
46+
*,
47+
attn_mask: Bool[Tensor, "batch sequence sequence"] | None,
4648
) -> Float[Tensor, "batch sequence proj_feature"]:
49+
"""
50+
Args:
51+
attn_mask:
52+
Which of the features to ignore during self-attention.
53+
`attn_mask[b,q,k] == False` means that
54+
query `q` of batch `b` can attend to key `k`.
55+
If `attn_mask` is `None`, all tokens can attend to all others.
56+
"""
4757
x = self.norm(x)
48-
attn_output, _ = self.mhsa(x, x, x, need_weights=False)
58+
attn_output, _ = self.mhsa(
59+
x,
60+
x,
61+
x,
62+
need_weights=False,
63+
attn_mask=(
64+
attn_mask.repeat(self.mhsa.num_heads, 1, 1)
65+
if attn_mask is not None
66+
else None
67+
),
68+
)
4969
return attn_output
5070

5171

@@ -83,10 +103,13 @@ def __init__(
83103

84104
@jaxtyped(typechecker=beartype)
85105
def forward(
86-
self, x: Float[Tensor, "batch sequence proj_feature"]
106+
self,
107+
x: Float[Tensor, "batch sequence proj_feature"],
108+
*,
109+
attn_mask: Bool[Tensor, "batch sequence sequence"] | None,
87110
) -> Float[Tensor, "batch sequence proj_feature"]:
88111
for attn, ff in cast(Iterable[tuple[nn.Module, nn.Module]], self.layers):
89-
x_attn = attn(x)
112+
x_attn = attn(x, attn_mask=attn_mask)
90113
x = x_attn + x
91114
x = ff(x) + x
92115

@@ -127,18 +150,36 @@ def __init__(
127150

128151
@jaxtyped(typechecker=beartype)
129152
def forward(
130-
self, bags: Float[Tensor, "batch tile feature"]
153+
self,
154+
bags: Float[Tensor, "batch tile feature"],
155+
*,
156+
mask: Bool[Tensor, "batch tile"] | None,
131157
) -> Float[Tensor, "batch logit"]:
132158
batch_size, _n_tiles, _n_features = bags.shape
133159

134-
# map input sequence to latent space of TransMIL
160+
# Map input sequence to latent space of TransMIL
135161
bags = self.project_features(bags)
136162

163+
# Prepend a class token to every bag,
164+
# include it in the mask.
165+
# TODO should the tiles be able to refer to the class token? Test!
137166
cls_tokens = repeat(self.class_token, "d -> b 1 d", b=batch_size)
138-
bags = torch.cat((cls_tokens, bags), dim=1)
139-
140-
bags = self.transformer(bags)
141-
142-
bags = bags[:, 0] # only take class token
167+
bags = torch.cat([cls_tokens, bags], dim=1)
168+
if mask is not None:
169+
mask_with_class_token = torch.cat(
170+
[torch.zeros(mask.shape[0], 1).type_as(mask), mask], dim=1
171+
)
172+
square_attn_mask = torch.einsum(
173+
"bq,bk->bqk", mask_with_class_token, mask_with_class_token
174+
)
175+
# Don't allow other tiles to reference the class token
176+
square_attn_mask[:, 1:, 0] = True
177+
178+
bags = self.transformer(bags, attn_mask=square_attn_mask)
179+
else:
180+
bags = self.transformer(bags, attn_mask=None)
181+
182+
# Only take class token
183+
bags = bags[:, 0]
143184

144185
return self.mlp_head(bags)

uv.lock

Lines changed: 20 additions & 20 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments (0)