
Commit dd5e4a3

allow for passing in image feature maps, taking care of transposes

1 parent 1c29b92

3 files changed: +20 −9 lines

README.md

Lines changed: 4 additions & 4 deletions
@@ -139,16 +139,16 @@ from vector_quantize_pytorch import VectorQuantize
 vq = VectorQuantize(
     dim = 256,
     codebook_size = 256,
-    orthogonal_reg_weight = 10  # in paper, they recommended a value of 10
+    accept_image_fmap = True,   # set this true to be able to pass in an image feature map
+    orthogonal_reg_weight = 10, # in paper, they recommended a value of 10
 )
 
-x = torch.randn(1, 1024, 256)
-quantized, indices, loss = vq(x)
+img_fmap = torch.randn(1, 256, 32, 32)
+quantized, indices, loss = vq(img_fmap) # (1, 256, 32, 32), (1, 32, 32), (1,)
 
 # loss now contains the orthogonal regularization loss with the weight as assigned
 ```
 
-
 ## Todo
 
 - [ ] allow for multi-headed codebooks, from https://openreview.net/forum?id=GxjCYmQAody
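
For reference, here is the updated README example as a self-contained script; this assumes torch and vector-quantize-pytorch at this commit's version (0.4.3) are installed, and the shape comments mirror those in the diff:

```python
import torch
from vector_quantize_pytorch import VectorQuantize

vq = VectorQuantize(
    dim = 256,
    codebook_size = 256,
    accept_image_fmap = True,   # set this true to be able to pass in an image feature map
    orthogonal_reg_weight = 10, # in paper, they recommended a value of 10
)

img_fmap = torch.randn(1, 256, 32, 32)  # (batch, dim, height, width)

# quantized: (1, 256, 32, 32), indices: (1, 32, 32), loss: (1,)
quantized, indices, loss = vq(img_fmap)
```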

setup.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 setup(
   name = 'vector_quantize_pytorch',
   packages = find_packages(),
-  version = '0.4.2',
+  version = '0.4.3',
   license='MIT',
   description = 'Vector Quantization - Pytorch',
   author = 'Phil Wang',

vector_quantize_pytorch/vector_quantize_pytorch.py

Lines changed: 15 additions & 4 deletions
@@ -261,6 +261,7 @@ def __init__(
         use_cosine_sim = False,
         threshold_ema_dead_code = 0,
         channel_last = True,
+        accept_image_fmap = False,
         commitment = 1. # deprecate in next version, turn off by default
     ):
         super().__init__()
@@ -291,19 +292,25 @@ def __init__(
         )
 
         self.codebook_size = codebook_size
+
+        self.accept_image_fmap = accept_image_fmap
         self.channel_last = channel_last
 
     @property
     def codebook(self):
         return self._codebook.embed
 
     def forward(self, x):
-        device, codebook_size = x.device, self.codebook_size
+        shape, device, codebook_size = x.shape, x.device, self.codebook_size
+
+        need_transpose = not self.channel_last and not self.accept_image_fmap
 
-        need_transpose = not self.channel_last
+        if self.accept_image_fmap:
+            height, width = x.shape[-2:]
+            x = rearrange(x, 'b c h w -> b (h w) c')
 
         if need_transpose:
-            x = rearrange(x, 'b n d -> b d n')
+            x = rearrange(x, 'b d n -> b n d')
 
         x = self.project_in(x)
 
@@ -326,6 +333,10 @@ def forward(self, x):
         quantize = self.project_out(quantize)
 
         if need_transpose:
-            quantize = rearrange(quantize, 'b d n -> b n d')
+            quantize = rearrange(quantize, 'b n d -> b d n')
+
+        if self.accept_image_fmap:
+            quantize = rearrange(quantize, 'b (h w) c -> b c h w', h = height, w = width)
+            embed_ind = rearrange(embed_ind, 'b (h w) -> b h w', h = height, w = width)
 
         return quantize, embed_ind, loss
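
The feature-map path wraps the existing sequence quantizer in an einops round trip: the (height, width) grid is flattened into the sequence axis on the way in, and both the quantized output and the indices are unflattened on the way out. (The flipped `b n d -> b d n` patterns are behaviorally identical permutations; the renaming makes the axis labels match the actual channel-first layout.) A minimal sketch of the round trip in isolation, with illustrative tensor sizes:

```python
import torch
from einops import rearrange

fmap = torch.randn(2, 256, 32, 32)             # (batch, channels, height, width)
height, width = fmap.shape[-2:]

# flatten the spatial grid into a channel-last sequence of height * width vectors
seq = rearrange(fmap, 'b c h w -> b (h w) c')  # (2, 1024, 256)

# ... quantization operates on this (batch, seq, dim) layout ...

# restore the image layout, as forward() now does for both quantize and embed_ind
restored = rearrange(seq, 'b (h w) c -> b c h w', h = height, w = width)
assert torch.equal(restored, fmap)
```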
