
Commit abff91c

Allow for >1 batch size in splatfacto
1 parent 189328e commit abff91c

3 files changed: +86 -49 lines


nerfstudio/cameras/camera_optimizers.py

Lines changed: 1 addition & 1 deletion
@@ -152,7 +152,7 @@ def apply_to_raybundle(self, raybundle: RayBundle) -> None:
         raybundle.origins = raybundle.origins + correction_matrices[:, :3, 3]
         raybundle.directions = torch.bmm(correction_matrices[:, :3, :3], raybundle.directions[..., None]).squeeze()
 
-    def apply_to_camera(self, camera: Cameras) -> torch.Tensor:
+    def apply_to_camera(self, camera: Cameras) -> Float[Tensor, "b 3 4"]:
         """Apply the pose correction to the world-to-camera matrix in a Camera object"""
         if self.config.mode == "off":
             return camera.camera_to_worlds
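The new Float[Tensor, "b 3 4"] annotation makes the batched contract explicit: the optimizer hands back one corrected pose per camera. Below is a minimal sketch of composing per-camera corrections with a batch of [b, 3, 4] poses, mirroring the apply_to_raybundle pattern shown in the context above; the tensors are illustrative placeholders, not the nerfstudio implementation.

import torch

b = 4
# Illustrative [b, 3, 4] camera-to-world poses and per-camera corrections (identity here).
camera_to_worlds = torch.eye(4).expand(b, 4, 4).clone()[:, :3, :]
correction_matrices = torch.eye(4).expand(b, 4, 4).clone()[:, :3, :]

# Rotations compose via bmm, translations add, as apply_to_raybundle does for rays.
corrected_rot = torch.bmm(correction_matrices[:, :3, :3], camera_to_worlds[:, :3, :3])
corrected_trans = camera_to_worlds[:, :3, 3] + correction_matrices[:, :3, 3]
corrected = torch.cat([corrected_rot, corrected_trans[..., None]], dim=-1)
assert corrected.shape == (b, 3, 4)  # matches the Float[Tensor, "b 3 4"] annotation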

nerfstudio/data/datamanagers/full_images_datamanager.py

Lines changed: 39 additions & 17 deletions
@@ -79,6 +79,8 @@ class FullImageDatamanagerConfig(DataManagerConfig):
     fps_reset_every: int = 100
     """The number of iterations before one resets fps sampler repeatly, which is essentially drawing fps_reset_every
     samples from the pool of all training cameras without replacement before a new round of sampling starts."""
+    batch_size: int = 1
+    """The batch size for the dataloader."""
 
 
 class FullImageDatamanager(DataManager, Generic[TDataset]):
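For context, a minimal sketch of setting the new field from Python; everything except batch_size itself is standard config plumbing rather than part of this commit.

from nerfstudio.data.datamanagers.full_images_datamanager import FullImageDatamanagerConfig

# Ask the datamanager for 4 full images (and their cameras) per training step.
datamanager_config = FullImageDatamanagerConfig(batch_size=4)

Through nerfstudio's tyro-based CLI this should also surface as a --pipeline.datamanager.batch-size flag, but that flag name is an inference from the config plumbing, not something shown in this diff.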
@@ -336,31 +338,51 @@ def get_train_rays_per_batch(self):
         if len(self.cached_train) != 0:
             h = self.cached_train[0]["image"].shape[0]
             w = self.cached_train[0]["image"].shape[1]
-            return h * w
+            return h * w * self.config.batch_size
         else:
             return 800 * 800
 
     def next_train(self, step: int) -> Tuple[Cameras, Dict]:
         """Returns the next training batch
 
         Returns a Camera instead of raybundle"""
-        image_idx = self.train_unseen_cameras.pop(0)
-        # Make sure to re-populate the unseen cameras list if we have exhausted it
-        if len(self.train_unseen_cameras) == 0:
-            self.train_unseen_cameras = self.sample_train_cameras()
 
-        data = self.cached_train[image_idx]
-        # We're going to copy to make sure we don't mutate the cached dictionary.
-        # This can cause a memory leak: https://github.com/nerfstudio-project/nerfstudio/issues/3335
-        data = data.copy()
-        data["image"] = data["image"].to(self.device)
-
-        assert len(self.train_cameras.shape) == 1, "Assumes single batch dimension"
-        camera = self.train_cameras[image_idx : image_idx + 1].to(self.device)
-        if camera.metadata is None:
-            camera.metadata = {}
-        camera.metadata["cam_idx"] = image_idx
-        return camera, data
+        image_indices = []
+        for _ in range(self.config.batch_size):
+            # Make sure to re-populate the unseen cameras list if we have exhausted it
+            if len(self.train_unseen_cameras) == 0:
+                self.train_unseen_cameras = self.sample_train_cameras()
+            image_indices.append(self.train_unseen_cameras.pop(0))
+
+        all_keys = self.cached_train[0].keys()
+
+        data = {}
+        for key in all_keys:
+            if key == "image":
+                data[key] = torch.stack([self.cached_train[i][key] for i in image_indices]).to(self.device)
+            else:
+                data[key] = [self.cached_train[i][key] for i in image_indices]
+
+        cameras = Cameras(
+            camera_to_worlds=self.train_cameras.camera_to_worlds[image_indices],
+            fx=self.train_cameras.fx[image_indices],
+            fy=self.train_cameras.fy[image_indices],
+            cx=self.train_cameras.cx[image_indices],
+            cy=self.train_cameras.cy[image_indices],
+            width=self.train_cameras.width[image_indices],
+            height=self.train_cameras.height[image_indices],
+            camera_type=self.train_cameras.camera_type[image_indices],
+        ).to(self.device)
+
+        if self.train_cameras.distortion_params is not None:
+            cameras.distortion_params = self.train_cameras.distortion_params[image_indices]
+
+        if cameras.metadata is None:
+            cameras.metadata = {}
+
+        cameras.metadata["cam_idx"] = image_indices
+
+        return cameras, data
 
     def next_eval(self, step: int) -> Tuple[Cameras, Dict]:
         """Returns the next evaluation batch

nerfstudio/models/splatfacto.py

Lines changed: 46 additions & 31 deletions
@@ -46,20 +46,28 @@
 from nerfstudio.utils.spherical_harmonics import RGB2SH, SH2RGB, num_sh_bases
 
 
-def resize_image(image: torch.Tensor, d: int):
+def resize_image(image: torch.Tensor, d: int) -> torch.Tensor:
     """
     Downscale images using the same 'area' method in opencv
 
-    :param image shape [H, W, C]
+    :param image shape [B, H, W, C]
     :param d downscale factor (must be 2, 4, 8, etc.)
 
-    return downscaled image in shape [H//d, W//d, C]
+    return downscaled image in shape [B, H//d, W//d, C]
     """
     import torch.nn.functional as tf
 
-    image = image.to(torch.float32)
     weight = (1.0 / (d * d)) * torch.ones((1, 1, d, d), dtype=torch.float32, device=image.device)
-    return tf.conv2d(image.permute(2, 0, 1)[:, None, ...], weight, stride=d).squeeze(1).permute(1, 2, 0)
+
+    B, H, W, C = image.shape
+    image = image.permute(0, 3, 1, 2)  # [B, C, H, W]
+    image = image.reshape(B * C, 1, H, W)  # Combine batch and channel dimensions for Conv2D
+
+    downscaled = tf.conv2d(image, weight, stride=d)
+    downscaled = downscaled.reshape(B, C, downscaled.shape[-2], downscaled.shape[-1])
+    downscaled = downscaled.permute(0, 2, 3, 1)  # [B, H//d, W//d, C]
+
+    return downscaled
 
 
 @torch_compile()
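Because a uniform d x d kernel with stride d is plain area averaging, the batched path can be sanity-checked against avg_pool2d. The standalone sketch below reimplements the same reshuffle (it is not an import of the patched function, and float inputs are assumed):

import torch
import torch.nn.functional as F

def batched_area_downscale(image: torch.Tensor, d: int) -> torch.Tensor:
    # Same idea as the patched resize_image: fold batch and channels together,
    # run one single-channel conv with a uniform d x d kernel, then restore shape.
    B, H, W, C = image.shape
    weight = torch.full((1, 1, d, d), 1.0 / (d * d), dtype=image.dtype, device=image.device)
    x = image.permute(0, 3, 1, 2).reshape(B * C, 1, H, W)
    y = F.conv2d(x, weight, stride=d)
    return y.reshape(B, C, H // d, W // d).permute(0, 2, 3, 1)

imgs = torch.rand(4, 64, 96, 3)
ref = F.avg_pool2d(imgs.permute(0, 3, 1, 2), kernel_size=2).permute(0, 2, 3, 1)
assert torch.allclose(batched_area_downscale(imgs, 2), ref, atol=1e-5)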
@@ -482,32 +490,31 @@ def _apply_bilateral_grid(self, rgb: torch.Tensor, cam_idx: int, H: int, W: int)
         )
         return out["rgb"]
 
-    def get_outputs(self, camera: Cameras) -> Dict[str, Union[torch.Tensor, List]]:
-        """Takes in a camera and returns a dictionary of outputs.
+    def get_outputs(self, cameras: Cameras) -> Dict[str, Union[torch.Tensor, List]]:
+        """Takes in cameras and returns a dictionary of outputs.
 
         Args:
-            camera: The camera(s) for which output images are rendered. It should have
+            cameras: The camera(s) for which output images are rendered. It should have
             all the needed information to compute the outputs.
 
         Returns:
             Outputs of model. (ie. rendered colors)
         """
-        if not isinstance(camera, Cameras):
+        if not isinstance(cameras, Cameras):
             print("Called get_outputs with not a camera")
             return {}
 
         if self.training:
-            assert camera.shape[0] == 1, "Only one camera at a time"
-            optimized_camera_to_world = self.camera_optimizer.apply_to_camera(camera)
+            optimized_camera_to_world = self.camera_optimizer.apply_to_camera(cameras)
         else:
-            optimized_camera_to_world = camera.camera_to_worlds
+            optimized_camera_to_world = cameras.camera_to_worlds
 
         # cropping
         if self.crop_box is not None and not self.training:
             crop_ids = self.crop_box.within(self.means).squeeze()
             if crop_ids.sum() == 0:
                 return self.get_empty_outputs(
-                    int(camera.width.item()), int(camera.height.item()), self.background_color
+                    int(cameras.width.item()), int(cameras.height.item()), self.background_color
                 )
         else:
             crop_ids = None
@@ -530,12 +537,16 @@ def get_outputs(self, camera: Cameras) -> Dict[str, Union[torch.Tensor, List]]:
         colors_crop = torch.cat((features_dc_crop[:, None, :], features_rest_crop), dim=1)
 
         camera_scale_fac = self._get_downscale_factor()
-        camera.rescale_output_resolution(1 / camera_scale_fac)
-        viewmat = get_viewmat(optimized_camera_to_world)
-        K = camera.get_intrinsics_matrices().cuda()
-        W, H = int(camera.width.item()), int(camera.height.item())
+        cameras.rescale_output_resolution(1 / camera_scale_fac)
+        viewmats = get_viewmat(optimized_camera_to_world)
+        Ks = cameras.get_intrinsics_matrices().cuda()
+
+        W, H = (
+            int(cameras.width[0]),
+            int(cameras.height[0]),
+        )  # assume all cameras have the same resolution
         self.last_size = (H, W)
-        camera.rescale_output_resolution(camera_scale_fac)  # type: ignore
+        cameras.rescale_output_resolution(camera_scale_fac)  # type: ignore
 
         # apply the compensation of screen space blurring to gaussians
         if self.config.rasterize_mode not in ["antialiased", "classic"]:
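get_viewmat now receives the whole [b, 3, 4] batch and must return one [4, 4] world-to-camera matrix per camera. A generic sketch of that batched inversion follows; any axis-convention flips nerfstudio applies for gsplat are deliberately omitted, so this illustrates the shapes rather than the project's exact implementation.

import torch

def c2w_to_w2c(camera_to_worlds: torch.Tensor) -> torch.Tensor:
    # Invert each [3, 4] camera-to-world pose via the R^T / -R^T t identity.
    R = camera_to_worlds[:, :3, :3]   # [b, 3, 3]
    t = camera_to_worlds[:, :3, 3:4]  # [b, 3, 1]
    R_inv = R.transpose(1, 2)         # rotation inverse is its transpose
    t_inv = -torch.bmm(R_inv, t)
    viewmats = torch.zeros(camera_to_worlds.shape[0], 4, 4, dtype=R.dtype, device=R.device)
    viewmats[:, :3, :3] = R_inv
    viewmats[:, :3, 3:4] = t_inv
    viewmats[:, 3, 3] = 1.0
    return viewmats

poses = torch.eye(4)[:3].expand(4, 3, 4).clone()  # [b, 3, 4] identity poses
assert c2w_to_w2c(poses).shape == (4, 4, 4)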
@@ -558,8 +569,8 @@ def get_outputs(self, camera: Cameras) -> Dict[str, Union[torch.Tensor, List]]:
             scales=torch.exp(scales_crop),
             opacities=torch.sigmoid(opacities_crop).squeeze(-1),
             colors=colors_crop,
-            viewmats=viewmat,  # [1, 4, 4]
-            Ks=K,  # [1, 3, 3]
+            viewmats=viewmats,  # [1, 4, 4]
+            Ks=Ks,  # [1, 3, 3]
             width=W,
             height=H,
             packed=False,
@@ -585,24 +596,28 @@ def get_outputs(self, camera: Cameras) -> Dict[str, Union[torch.Tensor, List]]:
 
         # apply bilateral grid
         if self.config.use_bilateral_grid and self.training:
-            if camera.metadata is not None and "cam_idx" in camera.metadata:
-                rgb = self._apply_bilateral_grid(rgb, camera.metadata["cam_idx"], H, W)
+            if cameras.metadata is not None and "cam_idx" in cameras.metadata:
+                rgb = self._apply_bilateral_grid(rgb, cameras.metadata["cam_idx"], H, W)
 
         if render_mode == "RGB+ED":
             depth_im = render[:, ..., 3:4]
-            depth_im = torch.where(alpha > 0, depth_im, depth_im.detach().max()).squeeze(0)
+            depth_im = torch.where(alpha > 0, depth_im, depth_im.detach().max())
         else:
             depth_im = None
 
         if background.shape[0] == 3 and not self.training:
             background = background.expand(H, W, 3)
 
-        return {
-            "rgb": rgb.squeeze(0),  # type: ignore
-            "depth": depth_im,  # type: ignore
-            "accumulation": alpha.squeeze(0),  # type: ignore
-            "background": background,  # type: ignore
-        }  # type: ignore
+        outputs = {
+            "rgb": rgb,
+            "depth": depth_im,
+            "accumulation": alpha,
+            "background": background,
+        }
+
+        if self.training:
+            return outputs
+        return {k: v.squeeze(0) if k != "background" else v for k, v in outputs.items()}
 
     def get_gt_img(self, image: torch.Tensor):
         """Compute groundtruth image with iteration dependent downscale factor for evaluation purpose
@@ -622,7 +637,7 @@ def composite_with_background(self, image, background) -> torch.Tensor:
             image: the image to composite
             background: the background color
         """
-        if image.shape[2] == 4:
+        if image.shape[-1] == 4:
             alpha = image[..., -1].unsqueeze(-1).repeat((1, 1, 3))
             return alpha * image[..., :3] + (1 - alpha) * background
         else:
@@ -671,7 +686,7 @@ def get_loss_dict(self, outputs, batch, metrics_dict=None) -> Dict[str, torch.Te
             pred_img = pred_img * mask
 
         Ll1 = torch.abs(gt_img - pred_img).mean()
-        simloss = 1 - self.ssim(gt_img.permute(2, 0, 1)[None, ...], pred_img.permute(2, 0, 1)[None, ...])
+        simloss = 1 - self.ssim(gt_img.permute(0, 3, 1, 2), pred_img.permute(0, 3, 1, 2))
         if self.config.use_scale_regularization and self.step % 10 == 0:
             scale_exp = torch.exp(self.scales)
             scale_reg = (
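With gt_img and pred_img now laid out as [B, H, W, C], the SSIM term permutes to channels-first and runs the whole batch in one call instead of adding a [None, ...] batch dim per image. A sketch assuming a torchmetrics-style SSIM module, which is what self.ssim is presumed to be here; sizes are illustrative:

import torch
from torchmetrics.image import StructuralSimilarityIndexMeasure

ssim = StructuralSimilarityIndexMeasure(data_range=1.0)

# Batch of ground-truth and predicted renders in [B, H, W, C] layout.
gt_img = torch.rand(4, 32, 48, 3)
pred_img = torch.rand(4, 32, 48, 3)

# Channels-first for the metric; no extra batch dim needed since one already exists.
Ll1 = torch.abs(gt_img - pred_img).mean()
simloss = 1 - ssim(gt_img.permute(0, 3, 1, 2), pred_img.permute(0, 3, 1, 2))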
