release orthogonal regularization loss, from https://arxiv.org/abs/2112.00384

lucidrains · lucidrains · commit 0e8aa7687d6b · 2021-12-02T21:29:33.000-08:00
diff --git a/README.md b/README.md
@@ -126,9 +126,31 @@ x = torch.randn(1, 1024, 256)
 quantized, indices, commit_loss = vq(x)
 ```
 
+## Orthogonal regularization loss
+
+VQ-VAE / VQ-GAN are quickly gaining popularity. A <a href="https://arxiv.org/abs/2112.00384">recent paper</a> proposes that, when using vector quantization on images, enforcing the codebook to be orthogonal leads to translation equivariance of the discretized codes, which in turn yields large improvements in downstream text-to-image generation tasks.
+
+You can use this feature by setting `orthogonal_reg_weight` to a value greater than `0`, in which case the orthogonal regularization loss will be added to the auxiliary loss returned by the module.
+
+```python
+import torch
+from vector_quantize_pytorch import VectorQuantize
+
+vq = VectorQuantize(
+    dim = 256,
+    codebook_size = 256,
+    orthogonal_reg_weight = 10  # in paper, they recommended a value of 10
+)
+
+x = torch.randn(1, 1024, 256)
+quantized, indices, loss = vq(x)
+
+# loss now also includes the orthogonal regularization term, scaled by orthogonal_reg_weight (added in training mode only)
+```
+
+
 ## Todo
 
-- [ ] add orthogonality loss on codebook, from https://arxiv.org/abs/2112.00384
 - [ ] allow for multi-headed codebooks, from https://openreview.net/forum?id=GxjCYmQAody
 
 ## Citations
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'vector_quantize_pytorch',
   packages = find_packages(),
-  version = '0.3.11',
+  version = '0.4.0',
   license='MIT',
   description = 'Vector Quantization - Pytorch',
   author = 'Phil Wang',
diff --git a/vector_quantize_pytorch/vector_quantize_pytorch.py b/vector_quantize_pytorch/vector_quantize_pytorch.py
@@ -57,6 +57,16 @@ def kmeans(samples, num_clusters, num_iters = 10, use_cosine_sim = False):
 
     return means, bins
 
+# regularization losses
+
+def orthgonal_loss_fn(t):
+    # eq (2) from https://arxiv.org/abs/2112.00384 - mean squared difference between the pairwise similarity matrix of the l2norm-ed rows of t and the identity
+    n = t.shape[0]  # number of rows in t (presumably one row per code - confirm against caller)
+    normed_codes = l2norm(t)
+    identity = torch.eye(n, device = t.device)
+    cosine_sim = einsum('i d, j d -> i j', normed_codes, normed_codes)  # (n, n) dot products between every pair of rows
+    return ((cosine_sim - identity) ** 2).sum() / (n ** 2)
+
 # distance types
 
 class EuclideanCodebook(nn.Module):
@@ -244,6 +254,7 @@ def __init__(
         codebook_dim = None,
         decay = 0.8,
         commitment = 1.,
+        orthogonal_reg_weight = 0.,
         eps = 1e-5,
         kmeans_init = False,
         kmeans_iters = 10,
@@ -263,6 +274,7 @@ def __init__(
 
         self.eps = eps
         self.commitment = commitment
+        self.orthogonal_reg_weight = orthogonal_reg_weight
 
         codebook_class = EuclideanCodebook if not use_cosine_sim \
                          else CosineSimCodebook
@@ -285,6 +297,8 @@ def codebook(self):
         return self._codebook.embed
 
     def forward(self, x):
+        device, codebook_size = x.device, self.codebook_size
+
         need_transpose = not self.channel_last
 
         if need_transpose:
@@ -295,14 +309,22 @@ def forward(self, x):
         quantize, embed_ind = self._codebook(x)
 
         if self.training:
-            commit_loss = F.mse_loss(quantize.detach(), x) * self.commitment
             quantize = x + (quantize - x).detach()
-        else:
-            commit_loss = torch.tensor([0.], device = x.device)
+
+        loss = torch.tensor([0.], device = device)
+
+        if self.training:
+            if self.commitment > 0:
+                commit_loss = F.mse_loss(quantize.detach(), x)
+                loss = loss + commit_loss * self.commitment
+
+            if self.orthogonal_reg_weight > 0:
+                orthogonal_reg_loss = orthgonal_loss_fn(self.codebook)
+                loss = loss + orthogonal_reg_loss * self.orthogonal_reg_weight
 
         quantize = self.project_out(quantize)
 
         if need_transpose:
             quantize = rearrange(quantize, 'b d n -> b n d')
 
-        return quantize, embed_ind, commit_loss
+        return quantize, embed_ind, loss