diff --git a/benchmarks/ucf101/vjepa-vit-base.py b/benchmarks/ucf101/vjepa-vit-base.py new file mode 100644 index 000000000..d6124e88e --- /dev/null +++ b/benchmarks/ucf101/vjepa-vit-base.py @@ -0,0 +1,367 @@ +"""V-JEPA ViT-Base on UCF-101. + +Self-supervised video representation learning via tube-masked spatio-temporal +prediction. A ViT-Base is trained with the V-JEPA objective: a lightweight +predictor recovers teacher representations of masked spatio-temporal tubes from +the surrounding context encoded by the student. + +The UCF-101 dataset is downloaded automatically from HuggingFace +(MichiganNLP/ucf-101) on the first run and extracted to the configured data +directory. + +References: + Bardes et al. "V-JEPA: Latent Video Prediction for Visual Representation + Learning." ICLR 2024. https://arxiv.org/abs/2404.08471 +""" + +import sys +import types +import zipfile +from pathlib import Path + +import lightning as pl +import torch +import torch.nn as nn +import torchmetrics +import torchvision +import torchvision.transforms.functional as TF + +import stable_pretraining as spt +from stable_pretraining.methods.vjepa import VJEPA + +NUM_FRAMES = 8 +SPATIAL_SIZE = 224 +NUM_CLASSES = 101 +EMBED_DIM = 768 # ViT-Base + + +# --------------------------------------------------------------------------- +# Dataset helpers +# --------------------------------------------------------------------------- + + +def _download_and_extract(data_dir: Path) -> tuple[Path, Path]: + """Download UCF-101 zip files from HuggingFace and extract them. + + Returns: + Tuple of (videos_root_dir, annotation_dir) paths. 
+ """ + from huggingface_hub import hf_hub_download + + videos_dir = data_dir / "UCF-101" + splits_dir = data_dir / "ucfTrainTestlist" + + if not videos_dir.exists(): + print("Downloading UCF101.zip from HuggingFace (MichiganNLP/ucf-101)...") + zip_path = hf_hub_download( + repo_id="MichiganNLP/ucf-101", + filename="UCF101.zip", + repo_type="dataset", + cache_dir=str(data_dir / "hf_cache"), + ) + print(f"Extracting {zip_path} -> {data_dir}") + with zipfile.ZipFile(zip_path, "r") as zf: + zf.extractall(data_dir) + + if not splits_dir.exists(): + print( + "Downloading UCF101TrainTestSplits-RecognitionTask.zip from HuggingFace..." + ) + splits_zip = hf_hub_download( + repo_id="MichiganNLP/ucf-101", + filename="UCF101TrainTestSplits-RecognitionTask.zip", + repo_type="dataset", + cache_dir=str(data_dir / "hf_cache"), + ) + print(f"Extracting {splits_zip} -> {data_dir}") + with zipfile.ZipFile(splits_zip, "r") as zf: + zf.extractall(data_dir) + + return videos_dir, splits_dir + + +class UCF101VideoDataset(torch.utils.data.Dataset): + """UCF-101 wrapper that returns ``{"video": (C,T,H,W), "label": int}`` dicts. + + Videos are loaded via :class:`torchvision.datasets.UCF101`. Each clip is + returned as a ``(C, T, H, W)`` float32 tensor normalised with UCF-101 + channel statistics. Spatial augmentations (random-resized crop + flip for + train; resize + centre-crop for val) are applied **consistently across all + frames** by computing crop parameters once and reapplying them per frame. + + Args: + root: Path to the extracted ``UCF-101/`` video directory. + annotation_path: Path to the ``ucfTrainTestlist/`` annotation directory. + train: If ``True``, use the training split with augmentations; + otherwise use the validation split. + frames_per_clip: Number of frames per returned clip. + step_between_clips: Temporal stride between consecutive clips. + fold: UCF-101 split fold index (1, 2, or 3). 
+ """ + + _mean = torch.tensor([0.43216, 0.394666, 0.37645]).view(3, 1, 1, 1) + _std = torch.tensor([0.22803, 0.22145, 0.216989]).view(3, 1, 1, 1) + + def __init__( + self, + root: str, + annotation_path: str, + train: bool, + frames_per_clip: int = NUM_FRAMES, + step_between_clips: int = 4, + fold: int = 1, + ): + self.dataset = torchvision.datasets.UCF101( + root=root, + annotation_path=annotation_path, + frames_per_clip=frames_per_clip, + step_between_clips=step_between_clips, + fold=fold, + train=train, + ) + self.train = train + + def __len__(self) -> int: + return len(self.dataset) + + def __getitem__(self, idx: int) -> dict: + video, _audio, label = self.dataset[idx] + # torchvision UCF101 returns video as (T, H, W, C) uint8 + video = video.permute(3, 0, 1, 2).float() / 255.0 # (C, T, H, W) + + if self.train: + video = self._train_transform(video) + else: + video = self._val_transform(video) + + video = (video - self._mean) / self._std + return {"video": video, "label": label, "sample_idx": idx} + + @staticmethod + def _train_transform(video: torch.Tensor) -> torch.Tensor: + """Consistent random-resized crop + horizontal flip across all T frames.""" + C, T, H, W = video.shape + # Reference frame for computing crop parameters + ref = video[:, 0] # (C, H, W) + i, j, h, w = torchvision.transforms.RandomResizedCrop.get_params( + ref, scale=(0.5, 1.0), ratio=(0.75, 1.333) + ) + # Apply the same crop to every frame + frames = torch.stack( + [ + TF.resized_crop(video[:, t], i, j, h, w, [SPATIAL_SIZE, SPATIAL_SIZE]) + for t in range(T) + ] + ) # (T, C, H, W) + if torch.rand(1).item() > 0.5: + frames = TF.hflip(frames) + return frames.permute(1, 0, 2, 3) # (C, T, H, W) + + @staticmethod + def _val_transform(video: torch.Tensor) -> torch.Tensor: + """Consistent centre-crop across all T frames.""" + C, T, H, W = video.shape + scale = round(SPATIAL_SIZE * 256 / 224) + frames = torch.stack( + [ + TF.center_crop( + TF.resize(video[:, t], [scale]), [SPATIAL_SIZE, 
def vjepa_forward(self, batch, stage):
    """V-JEPA forward step for the Lightning training loop.

    Args:
        self: VJEPA module instance.
        batch: Dict with ``"video"`` ``(B, C, T, H, W)`` and optional ``"label"``.
        stage: Training stage string (``"fit"``, ``"validate"``, etc.).

    Returns:
        Dict with ``"loss"``, ``"embedding"``, and optionally ``"label"``.
    """
    out = VJEPA.forward(self, batch["video"], embedding_source="teacher")

    # During training the embedding feeds online probes only, so cut the graph.
    features = out.embedding.detach() if self.training else out.embedding

    self.log(f"{stage}/loss", out.loss, on_step=True, on_epoch=True, sync_dist=True)
    self.log(
        f"{stage}/num_targets",
        float(out.num_targets),
        on_step=False,
        on_epoch=True,
        sync_dist=True,
    )

    result = {"loss": out.loss, "embedding": features}
    if "label" in batch:
        result["label"] = batch["label"].long()
    return result
def main():
    """Pretrain V-JEPA ViT-Base on UCF-101 with online linear/KNN probes."""
    sys.path.append(str(Path(__file__).parent.parent))
    from utils import get_data_dir

    num_gpus = torch.cuda.device_count() or 1
    batch_size = 16  # video batches are memory-intensive; adjust per GPU VRAM
    num_workers = 8
    max_epochs = 200
    lr = 1.5e-4

    data_dir = Path(get_data_dir("ucf101"))
    videos_dir, splits_dir = _download_and_extract(data_dir)

    # ------------------------------------------------------------------
    # Data
    # ------------------------------------------------------------------
    def build_loader(train: bool, stride: int, **loader_kwargs):
        # Shared DataLoader construction for both splits.
        return torch.utils.data.DataLoader(
            dataset=UCF101VideoDataset(
                root=str(videos_dir),
                annotation_path=str(splits_dir),
                train=train,
                frames_per_clip=NUM_FRAMES,
                step_between_clips=stride,
            ),
            batch_size=batch_size,
            num_workers=num_workers,
            persistent_workers=num_workers > 0,
            **loader_kwargs,
        )

    data = spt.data.DataModule(
        train=build_loader(True, 4, drop_last=True, shuffle=True),
        val=build_loader(False, 8),
    )

    # ------------------------------------------------------------------
    # Model
    # ------------------------------------------------------------------
    module = VJEPA(
        encoder_name="vit_base_patch16_224",
        num_frames=NUM_FRAMES,
        predictor_embed_dim=384,
        predictor_depth=6,
        num_targets=8,
        target_scale=(0.15, 0.2),
        target_aspect_ratio=(0.75, 1.5),
        context_scale=(1.0, 1.0),
        ema_decay_start=0.996,
        ema_decay_end=1.0,
        pretrained=False,
    )
    module.forward = types.MethodType(vjepa_forward, module)
    module.optim = {
        "optimizer": {
            "type": "AdamW",
            "lr": lr,
            "weight_decay": 0.05,
            "betas": (0.9, 0.95),
        },
        "scheduler": {
            "type": "LinearWarmupCosineAnnealing",
            "peak_step": 15 / max_epochs,
            "start_factor": 0.01,
            "end_lr": lr / 100,
            "total_steps": (len(data.train) // num_gpus) * max_epochs,
        },
        "interval": "step",
    }

    # ------------------------------------------------------------------
    # Trainer
    # ------------------------------------------------------------------
    callbacks = [
        spt.callbacks.TeacherStudentCallback(
            update_frequency=1,
            update_after_backward=True,
        ),
        spt.callbacks.OnlineProbe(
            module,
            name="linear_probe",
            input="embedding",
            target="label",
            probe=nn.Linear(EMBED_DIM, NUM_CLASSES),
            loss=nn.CrossEntropyLoss(),
            metrics={
                "top1": torchmetrics.classification.MulticlassAccuracy(NUM_CLASSES),
                "top5": torchmetrics.classification.MulticlassAccuracy(
                    NUM_CLASSES, top_k=5
                ),
            },
            optimizer={"type": "AdamW", "lr": 0.03, "weight_decay": 0.0},
        ),
        spt.callbacks.OnlineKNN(
            name="knn_probe",
            input="embedding",
            target="label",
            queue_length=4096,
            metrics={
                "top1": torchmetrics.classification.MulticlassAccuracy(NUM_CLASSES)
            },
            input_dim=EMBED_DIM,
            k=20,
        ),
        spt.callbacks.RankMe(
            name="rankme",
            target="embedding",
            queue_length=1000,
            target_shape=EMBED_DIM,
        ),
        pl.pytorch.callbacks.ModelCheckpoint(
            dirpath=str(Path(__file__).parent / "checkpoints" / "vjepa-vitb"),
            filename="vjepa-vitb-{epoch:03d}",
            save_top_k=-1,
            every_n_epochs=50,
            save_last=True,
        ),
        pl.pytorch.callbacks.LearningRateMonitor(logging_interval="step"),
    ]

    trainer = pl.Trainer(
        max_epochs=max_epochs,
        num_sanity_val_steps=0,
        callbacks=callbacks,
        logger=pl.pytorch.loggers.WandbLogger(
            entity="stable-ssl",
            project="ucf101-methods",
            name="vjepa-vitb-ucf101",
            log_model=False,
        ),
        precision="16-mixed",
        devices=num_gpus,
        accelerator="gpu",
        strategy="ddp_find_unused_parameters_true" if num_gpus > 1 else "auto",
    )

    manager = spt.Manager(trainer=trainer, module=module, data=data)
    manager()
@dataclass
class VJEPAMaskOutput:
    """Output from V-JEPA tube masking operation.

    :ivar context_idx: Indices of context (visible) patches [B, N_ctx]
        (flat indices into T*N_spatial)
    :ivar target_idx: Combined indices of all target patches [B, N_tgt]
    :ivar target_block_masks: Per-tube boolean masks [num_targets × (B, T*N_spatial)]
    :ivar mask: Full mask where 1 = target, 0 = context [B, T*N_spatial]
    """

    context_idx: torch.Tensor
    target_idx: torch.Tensor
    target_block_masks: List[torch.Tensor]
    mask: torch.Tensor


class VJEPAMasking(nn.Module):
    """V-JEPA multi-block tube masking for video joint-embedding predictive architecture.

    Extends I-JEPA multi-block masking to video by replicating each 2D spatial
    block across **all** temporal frames, forming *tubes*. Target tubes share
    the same spatial footprint across every frame; context is all non-target
    spatio-temporal tokens (optionally subsampled).

    Strategy:
        1. Sample M spatial blocks (identical scale / aspect-ratio rules as I-JEPA)
        2. Extend each spatial block to a tube: same region across all T frames
        3. Context = all T×N_spatial tokens NOT in any tube
        4. Optionally subsample context to ``context_scale`` ratio

    :param num_targets: Number of target tubes to sample (default: 8)
    :param target_scale: (min, max) fraction of *spatial* patches per block
    :param target_aspect_ratio: (min, max) aspect ratio of spatial blocks
    :param context_scale: (min, max) fraction of non-target patches kept as context.
        V-JEPA uses the full context so the default is ``(1.0, 1.0)``.
    :param allow_target_overlap: Allow tubes to overlap spatially (default: False)

    Example::

        masking = VJEPAMasking(num_targets=8)

        # x: video patch embeddings [B, T*N_spatial, D]
        output = masking(x, grid_t=8, grid_h=14, grid_w=14)

        context = x.gather(1, output.context_idx.unsqueeze(-1).expand(-1, -1, D))
        targets = x.gather(1, output.target_idx.unsqueeze(-1).expand(-1, -1, D))

    References:
        Bardes et al. "Revisiting Feature Prediction for Learning Visual
        Representations from Video." TMLR 2024. https://arxiv.org/abs/2404.08471
    """

    def __init__(
        self,
        num_targets: int = 8,
        target_scale: Tuple[float, float] = (0.15, 0.2),
        target_aspect_ratio: Tuple[float, float] = (0.75, 1.5),
        context_scale: Tuple[float, float] = (1.0, 1.0),
        allow_target_overlap: bool = False,
    ):
        super().__init__()

        if num_targets < 1:
            raise ValueError(f"num_targets must be >= 1, got {num_targets}")
        if not (0 < target_scale[0] <= target_scale[1] < 1):
            raise ValueError(f"target_scale must be in (0, 1), got {target_scale}")
        if not (0 < target_aspect_ratio[0] <= target_aspect_ratio[1]):
            raise ValueError("target_aspect_ratio values must be positive")
        if not (0 < context_scale[0] <= context_scale[1] <= 1):
            raise ValueError(f"context_scale must be in (0, 1], got {context_scale}")

        self.num_targets = num_targets
        self.target_scale = target_scale
        self.target_aspect_ratio = target_aspect_ratio
        self.context_scale = context_scale
        self.allow_target_overlap = allow_target_overlap

    def _sample_block_params(
        self,
        grid_h: int,
        grid_w: int,
        device: torch.device,
    ) -> Tuple[int, int, int, int]:
        """Sample (top, left, block_h, block_w) for a single spatial target block.

        The block area is a uniform fraction of the spatial grid in
        ``target_scale``; the aspect ratio is log-uniform in
        ``target_aspect_ratio`` (log-uniform keeps tall and wide blocks
        equally likely).
        """
        num_patches = grid_h * grid_w
        scale = torch.empty(1, device=device).uniform_(*self.target_scale).item()
        log_lo = torch.log(torch.tensor(self.target_aspect_ratio[0])).item()
        log_hi = torch.log(torch.tensor(self.target_aspect_ratio[1])).item()
        log_ar = torch.empty(1, device=device).uniform_(log_lo, log_hi).item()
        aspect_ratio = torch.tensor(log_ar).exp().item()
        block_area = num_patches * scale
        block_h = int(round((block_area / aspect_ratio) ** 0.5))
        block_w = int(round((block_area * aspect_ratio) ** 0.5))
        # Clamp so the block always fits inside the grid.
        block_h = max(1, min(block_h, grid_h))
        block_w = max(1, min(block_w, grid_w))
        top = torch.randint(0, max(1, grid_h - block_h + 1), (1,), device=device).item()
        left = torch.randint(
            0, max(1, grid_w - block_w + 1), (1,), device=device
        ).item()
        return top, left, block_h, block_w

    def _create_spatial_mask(
        self,
        top: int,
        left: int,
        block_h: int,
        block_w: int,
        grid_h: int,
        grid_w: int,
        device: torch.device,
    ) -> torch.Tensor:
        """Create 2D spatial block mask [grid_h, grid_w], True = in block."""
        mask = torch.zeros(grid_h, grid_w, dtype=torch.bool, device=device)
        mask[top : top + block_h, left : left + block_w] = True
        return mask

    def forward(
        self,
        x: torch.Tensor,
        grid_t: int,
        grid_h: int,
        grid_w: int,
    ) -> VJEPAMaskOutput:
        """Apply V-JEPA tube masking.

        :param x: Video patch embeddings [B, T*N_spatial, D]
        :param grid_t: Number of temporal frames (T)
        :param grid_h: Spatial grid height
        :param grid_w: Spatial grid width
        :return: VJEPAMaskOutput with context/target indices

        In eval mode all patches are returned as context (no masking).
        Tube masks are sampled once per batch (shared across samples) to keep
        the spatiotemporal structure consistent within a batch.
        """
        if x.dim() != 3:
            raise ValueError(f"Expected 3D input (B, T*N, D), got {x.dim()}D")

        B, TN, D = x.shape
        N_spatial = grid_h * grid_w
        N_total = grid_t * N_spatial
        device = x.device

        if TN != N_total:
            raise ValueError(
                f"TN={TN} doesn't match grid_t*grid_h*grid_w="
                f"{grid_t}*{grid_h}*{grid_w}={N_total}"
            )

        # Eval mode: all patches as context, no masking
        if not self.training:
            all_idx = torch.arange(N_total, device=device).unsqueeze(0).expand(B, -1)
            empty_masks = [
                torch.zeros(B, N_total, dtype=torch.bool, device=device)
                for _ in range(self.num_targets)
            ]
            return VJEPAMaskOutput(
                context_idx=all_idx,
                target_idx=torch.empty(B, 0, dtype=torch.long, device=device),
                target_block_masks=empty_masks,
                mask=torch.zeros(B, N_total, device=device),
            )

        # Sample spatial blocks (shared across the batch for consistent structure)
        spatial_masks: List[torch.Tensor] = []
        combined_spatial = torch.zeros(grid_h, grid_w, dtype=torch.bool, device=device)

        for _ in range(self.num_targets):
            block_mask = None
            for _ in range(100):  # max attempts per block
                top, left, bh, bw = self._sample_block_params(grid_h, grid_w, device)
                candidate = self._create_spatial_mask(
                    top, left, bh, bw, grid_h, grid_w, device
                )
                if (
                    self.allow_target_overlap
                    or not (candidate & combined_spatial).any()
                ):
                    block_mask = candidate
                    break
            if block_mask is not None:
                spatial_masks.append(block_mask)
                combined_spatial = combined_spatial | block_mask
            else:
                # Could not place a non-overlapping block within the attempt
                # budget; append an empty mask so downstream shapes stay fixed.
                spatial_masks.append(
                    torch.zeros(grid_h, grid_w, dtype=torch.bool, device=device)
                )

        assert len(spatial_masks) == self.num_targets

        # Extend each spatial mask into a tube: replicate across T frames.
        # Flat index for token at (t, h, w): t * N_spatial + h * grid_w + w
        tube_masks: List[torch.Tensor] = []
        for s_mask in spatial_masks:
            # s_mask: [grid_h, grid_w] -> tube: [grid_t, grid_h, grid_w] -> [T*N]
            tube = s_mask.unsqueeze(0).expand(grid_t, -1, -1).reshape(-1)
            tube_masks.append(tube)

        # Combined target mask over all T*N_spatial positions
        combined_tube = torch.zeros(N_total, dtype=torch.bool, device=device)
        for tube in tube_masks:
            combined_tube = combined_tube | tube

        # Per-batch tube masks [B, T*N_spatial]
        target_block_masks_batch = [t.unsqueeze(0).expand(B, -1) for t in tube_masks]

        # Target indices [B, N_tgt]
        target_idx = combined_tube.nonzero(as_tuple=True)[0].unsqueeze(0).expand(B, -1)

        # Context indices: non-target patches, subsampled per-sample
        available_idx = (~combined_tube).nonzero(as_tuple=True)[0]
        n_available = len(available_idx)

        if n_available == 0:
            # Degenerate: all patches are targets → fall back to full context
            context_idx = (
                torch.arange(N_total, device=device).unsqueeze(0).expand(B, -1)
            )
        else:
            ctx_ratio = (
                torch.empty(1, device=device).uniform_(*self.context_scale).item()
            )
            n_context = max(1, int(n_available * ctx_ratio))

            if n_context == n_available:
                # Full context (the V-JEPA default, context_scale=(1,1)):
                # permuting all indices and re-sorting is the identity, so
                # skip the per-sample randperm entirely.
                context_idx = available_idx.unsqueeze(0).expand(B, -1)
            else:
                context_idx_list = []
                for _ in range(B):
                    perm = torch.randperm(n_available, device=device)[:n_context]
                    ctx_idx = available_idx[perm].sort().values
                    context_idx_list.append(ctx_idx)
                context_idx = torch.stack(context_idx_list)  # [B, N_ctx]

        mask = combined_tube.float().unsqueeze(0).expand(B, -1)  # [B, N_total]

        return VJEPAMaskOutput(
            context_idx=context_idx,
            target_idx=target_idx,
            target_block_masks=target_block_masks_batch,
            mask=mask,
        )

    def extra_repr(self) -> str:
        return (
            f"num_targets={self.num_targets}, "
            f"target_scale={self.target_scale}, "
            f"target_aspect_ratio={self.target_aspect_ratio}, "
            f"context_scale={self.context_scale}"
        )
context_idx_list = [] + for _ in range(B): + perm = torch.randperm(n_available, device=device)[:n_context] + ctx_idx = available_idx[perm].sort().values + context_idx_list.append(ctx_idx) + context_idx = torch.stack(context_idx_list) # [B, N_ctx] + + mask = combined_tube.float().unsqueeze(0).expand(B, -1) # [B, N_total] + + return VJEPAMaskOutput( + context_idx=context_idx, + target_idx=target_idx, + target_block_masks=target_block_masks_batch, + mask=mask, + ) + + def extra_repr(self) -> str: + return ( + f"num_targets={self.num_targets}, " + f"target_scale={self.target_scale}, " + f"target_aspect_ratio={self.target_aspect_ratio}, " + f"context_scale={self.context_scale}" + ) diff --git a/stable_pretraining/methods/__init__.py b/stable_pretraining/methods/__init__.py index a17a6c493..7f9152da2 100644 --- a/stable_pretraining/methods/__init__.py +++ b/stable_pretraining/methods/__init__.py @@ -2,5 +2,6 @@ from .mae import MAE from .lejepa import LeJEPA from .nepa import NEPA +from .vjepa import VJEPA -__all__ = ["IJEPA", "MAE", "LeJEPA", "NEPA"] +__all__ = ["IJEPA", "MAE", "LeJEPA", "NEPA", "VJEPA"] diff --git a/stable_pretraining/methods/vjepa.py b/stable_pretraining/methods/vjepa.py new file mode 100644 index 000000000..12bef432f --- /dev/null +++ b/stable_pretraining/methods/vjepa.py @@ -0,0 +1,444 @@ +"""V-JEPA: Video Joint-Embedding Predictive Architecture. + +Self-supervised learning on video via predicting target spatio-temporal patch +representations from context patches using a lightweight predictor. Masking +uses *tube* masking: the same spatial block is masked across **all** temporal +frames, so the predictor must recover coherent motion and appearance across +time from surrounding context. + +References: + Bardes et al. "V-JEPA: Latent Video Prediction for Visual Representation + Learning." ICLR 2024. 
@dataclass
class VJEPAOutput:
    """Result bundle produced by a :meth:`VJEPA.forward` call.

    :ivar loss: Prediction loss (0 in eval mode)
    :ivar embedding: Mean-pooled spatio-temporal embeddings [B, D] for downstream use
    :ivar predictions: Predicted target representations [B, N_tgt, D]
    :ivar targets: Target representations from teacher [B, N_tgt, D]
    :ivar num_targets: Number of target patches (0 in eval mode)
    :ivar num_context: Number of context patches
    """

    loss: torch.Tensor
    embedding: torch.Tensor
    predictions: torch.Tensor
    targets: torch.Tensor
    num_targets: int
    num_context: int
Each frame + is tokenised by the ViT patch embedding into ``N_spatial = (H/p) × (W/p)`` + tokens; the full clip thus yields ``T × N_spatial`` tokens. Spatial and + temporal sinusoidal position embeddings are summed and added to every token + before the transformer blocks run over the full spatiotemporal sequence. + + Masking uses :class:`~stable_pretraining.backbone.VJEPAMasking`: spatial + blocks are sampled and replicated across all frames to form *tubes*. The + context encoder only sees non-tube tokens; the teacher encodes the full + sequence (all tokens visible) and the target is the tube subset. + + The context encoder is wrapped with :class:`TeacherStudentWrapper`, enabling + automatic EMA updates via :class:`TeacherStudentCallback`. + + :param encoder_name: timm model name (e.g., ``"vit_base_patch16_224"``) + :param num_frames: Number of video frames per clip (default: 8) + :param predictor_embed_dim: Predictor hidden dimension (default: 384) + :param predictor_depth: Number of predictor transformer blocks (default: 6) + :param num_targets: Number of target tubes to sample (default: 8) + :param target_scale: (min, max) fraction of *spatial* patches per tube block + :param target_aspect_ratio: (min, max) aspect ratio of spatial blocks + :param context_scale: (min, max) fraction of non-target tokens kept as context + :param ema_decay_start: Initial EMA decay (default: 0.996) + :param ema_decay_end: Final EMA decay (default: 1.0) + :param pretrained: Load pretrained encoder weights + + Example:: + + model = VJEPA("vit_small_patch16_224", num_frames=8) + + # Training mode: tube-masked prediction + model.train() + videos = torch.randn(4, 3, 8, 224, 224) + output = model(videos) + output.loss.backward() + + # Eval mode: encode all patches, zero loss + model.eval() + output = model(videos) + features = output.embedding # [B, D] + + Example with Lightning:: + + import types + import lightning as pl + from stable_pretraining.callbacks import TeacherStudentCallback + 
+ + def vjepa_forward(self, batch, stage): + output = VJEPA.forward(self, batch["video"]) + self.log( + f"{stage}/loss", + output.loss, + on_step=True, + on_epoch=True, + sync_dist=True, + ) + return { + "loss": output.loss, + "embedding": output.embedding.detach() + if self.training + else output.embedding, + **({"label": batch["label"].long()} if "label" in batch else {}), + } + + + module = VJEPA("vit_base_patch16_224", num_frames=8) + module.forward = types.MethodType(vjepa_forward, module) + + trainer = pl.Trainer( + max_epochs=200, + callbacks=[ + TeacherStudentCallback(update_frequency=1, update_after_backward=True) + ], + ) + + Note: + - Use :class:`TeacherStudentCallback` for EMA updates (same as I-JEPA) + - In eval mode, ``num_targets=0`` and all tokens are returned as context + - Access trained encoder via ``model.encoder.student`` + - ``embedding`` is the mean-pooled teacher (or student) representation + """ + + def __init__( + self, + encoder_name: str = "vit_base_patch16_224", + num_frames: int = 8, + predictor_embed_dim: int = 384, + predictor_depth: int = 6, + num_targets: int = 8, + target_scale: Tuple[float, float] = (0.15, 0.2), + target_aspect_ratio: Tuple[float, float] = (0.75, 1.5), + context_scale: Tuple[float, float] = (1.0, 1.0), + ema_decay_start: float = 0.996, + ema_decay_end: float = 1.0, + pretrained: bool = False, + ): + super().__init__() + + self.num_frames = num_frames + + # 2D ViT encoder shared across frames (student, wrapped with EMA teacher) + base_encoder = MaskedEncoder( + encoder_name, + masking=None, + pretrained=pretrained, + ) + self.encoder = TeacherStudentWrapper( + base_encoder, + warm_init=True, + base_ema_coefficient=ema_decay_start, + final_ema_coefficient=ema_decay_end, + ) + + embed_dim = base_encoder.embed_dim + default_grid_h = base_encoder.default_grid_h + default_grid_w = base_encoder.default_grid_w + num_spatio_temporal = num_frames * default_grid_h * default_grid_w + + # Lightweight predictor over the full 
T*N_spatial sequence + self.predictor = FlexibleTransformer( + input_dim=embed_dim, + hidden_dim=predictor_embed_dim, + output_dim=embed_dim, + num_patches=num_spatio_temporal, + depth=predictor_depth, + num_heads=max(1, predictor_embed_dim // 64), + self_attn=True, + cross_attn=False, + add_mask_token=True, + use_adaln=False, + num_prefix_tokens=0, + pos_embed_type="sincos_1d", + zero_init_output=False, + ) + + # V-JEPA tube masking (spatial blocks replicated across all frames) + self.masking = VJEPAMasking( + num_targets=num_targets, + target_scale=target_scale, + target_aspect_ratio=target_aspect_ratio, + context_scale=context_scale, + ) + + self.embed_dim = embed_dim + self._fix_init_weight() + + def _encode_video( + self, + videos: torch.Tensor, + indices: torch.Tensor, + grid_t: int, + grid_h: int, + grid_w: int, + encoder: MaskedEncoder, + ) -> torch.Tensor: + """Encode selected spatio-temporal patches from a video clip. + + Applies the 2D ViT patch embedding frame-by-frame, adds spatial and + temporal sinusoidal position embeddings, then selects the desired + patches by flat index and runs them through the transformer blocks. + + :param videos: Video tensor [B, C, T, H, W] + :param indices: Flat spatio-temporal patch indices [B, K], + where index ``t * grid_h * grid_w + h * grid_w + w`` refers to + frame ``t``, row ``h``, column ``w``. + :param grid_t: Number of frames (T) + :param grid_h: Spatial grid height + :param grid_w: Spatial grid width + :param encoder: MaskedEncoder (student or teacher) + :return: Encoded representations [B, K, D] + """ + B, C, T, H, W = videos.shape + N_spatial = grid_h * grid_w + D = encoder.embed_dim + device = videos.device + dtype = next(encoder.parameters()).dtype + + # 1. 
Patch-embed all frames: (B*T, C, H, W) -> (B, T, N_spatial, D) + frames = videos.permute(0, 2, 1, 3, 4).reshape(B * T, C, H, W) + patches = encoder.patch_embed(frames) # (B*T, N_spatial, D) + patches = patches.reshape(B, T, N_spatial, D) # (B, T, N_spatial, D) + + # 2. Spatial positional embedding (position within a frame, same for all t) + # _get_pos_embed returns (prefix_pos, patch_pos); we only need patch_pos. + _, spatial_pos = encoder._get_pos_embed(grid_h, grid_w) # (1, N_spatial, D) + + # 3. Temporal positional embedding (which frame, broadcast over spatial) + temp_pos_np = get_1d_sincos_pos_embed(D, grid_t) # (T, D) tensor + temp_pos = temp_pos_np.to(device=device, dtype=dtype) # (T, D) + + # 4. Add both PEs to patch tokens + # spatial_pos: (1, N_spatial, D) -> unsqueeze(1) -> (1, 1, N_spatial, D) + # temp_pos: (T, D) -> view -> (1, T, 1, D) + patches = ( + patches + + spatial_pos.unsqueeze(1) # broadcast (B, T, N_spatial, D) + + temp_pos.view(1, T, 1, D) # broadcast (B, T, N_spatial, D) + ) + + # 5. Flatten to (B, T*N_spatial, D) and select patches by index + x = patches.reshape(B, T * N_spatial, D) + x = torch.gather(x, 1, indices.unsqueeze(-1).expand(-1, -1, D)) + + # 6. Run through ViT trunk (dropout -> blocks -> norm) + x = encoder.vit.pos_drop(x) + x = encoder.vit.blocks(x) + return encoder.vit.norm(x) + + def forward( + self, videos: torch.Tensor, embedding_source: str = "teacher" + ) -> VJEPAOutput: + """Forward pass. 
+ + In **training** mode: + - Samples target tubes and context via :class:`VJEPAMasking` + - Student encoder sees only context tokens + - Teacher encoder (EMA, no grad) encodes the full sequence; tube + tokens are selected as targets + - Predictor attends over ``[context + masked query tokens]`` and + outputs predictions at target positions + - Smooth L1 loss between predictions and (layer-normalised) targets + + In **eval** mode: + - No masking; all tokens treated as context + - Returns zero loss and mean-pooled student embeddings + + :param videos: Input video clips [B, C, T, H, W]. + T must equal ``self.num_frames`` (or the model's default grid). + :param embedding_source: Which encoder to use for the ``embedding`` + field: ``"teacher"`` (default) or ``"student"``. Eval mode always + uses student. + :return: :class:`VJEPAOutput` + """ + if embedding_source not in ("teacher", "student"): + raise ValueError( + f"embedding_source must be 'teacher' or 'student', " + f"got '{embedding_source}'" + ) + + B, C, T, H, W = videos.shape + grid_h, grid_w = self.encoder.student._get_grid_size( + videos[:, :, 0] # single-frame spatial grid + ) + grid_t = T + N_spatial = grid_h * grid_w + N_total = grid_t * N_spatial + + # Compute patch embeddings via the student's patch_embed for masking + # (Masking only needs the shape; we pass a dummy tensor of correct size) + device = videos.device + dtype = next(self.encoder.student.parameters()).dtype + dummy = torch.empty(B, N_total, self.embed_dim, device=device, dtype=dtype) + mask_out = self.masking(dummy, grid_t, grid_h, grid_w) + + if self.training: + # --- Context: student encodes only non-target tokens --- + context = self._encode_video( + videos, + mask_out.context_idx, + grid_t, + grid_h, + grid_w, + self.encoder.student, + ) + + with torch.no_grad(): + # --- Teacher: encode full sequence, then select target tokens --- + all_idx = ( + torch.arange(N_total, device=device).unsqueeze(0).expand(B, -1) + ) + teacher_full = 
self._encode_video( + videos, + all_idx, + grid_t, + grid_h, + grid_w, + self.encoder.teacher, + ) # [B, T*N, D] + + # Extra LayerNorm on targets (affine-free, as in I-JEPA) + teacher_normed = F.layer_norm(teacher_full, [teacher_full.size(-1)]) + + # Gather tube target tokens + D = teacher_full.size(-1) + targets = torch.gather( + teacher_normed, + 1, + mask_out.target_idx.unsqueeze(-1).expand(-1, -1, D), + ) # [B, N_tgt, D] + + # Embedding for downstream probes + if embedding_source == "teacher": + embedding = teacher_full.mean(dim=1) # [B, D] + else: + embedding = self._encode_video( + videos, + all_idx, + grid_t, + grid_h, + grid_w, + self.encoder.student, + ).mean(dim=1) + + # --- Predictor: joint attention over [context + masked queries] --- + N_tgt = mask_out.target_idx.shape[1] + queries = torch.zeros( + B, N_tgt, self.embed_dim, device=device, dtype=context.dtype + ) + query_mask = torch.ones(B, N_tgt, device=device, dtype=torch.bool) + predictions = self.predictor( + context=context, + queries=queries, + context_idx=mask_out.context_idx, + query_idx=mask_out.target_idx, + query_mask=query_mask, + ) # [B, N_tgt, D] + + loss = F.smooth_l1_loss(predictions, targets, beta=1.0) + + else: + # Eval: encode all tokens through student (no masking) + with torch.no_grad(): + all_idx = ( + torch.arange(N_total, device=device).unsqueeze(0).expand(B, -1) + ) + context = self._encode_video( + videos, + all_idx, + grid_t, + grid_h, + grid_w, + self.encoder.student, + ) + predictions = context + targets = context + embedding = context.mean(dim=1) + loss = torch.tensor(0.0, device=device) + + return VJEPAOutput( + loss=loss, + embedding=embedding, + predictions=predictions, + targets=targets, + num_targets=mask_out.target_idx.shape[1], + num_context=mask_out.context_idx.shape[1], + ) + + def _fix_init_weight(self): + """Rescale attention-proj and MLP output weights by depth (I-JEPA init).""" + + def rescale(param, layer_id): + param.div_(math.sqrt(2.0 * layer_id)) + + for 
enc in (self.encoder.student, self.encoder.teacher): + for layer_id, block in enumerate(enc.vit.blocks): + rescale(block.attn.proj.weight.data, layer_id + 1) + rescale(block.mlp.fc2.weight.data, layer_id + 1) + + for layer_id, block in enumerate(self.predictor.blocks): + rescale(block.attn.proj.weight.data, layer_id + 1) + rescale(block.mlp.fc2.weight.data, layer_id + 1) diff --git a/stable_pretraining/tests/integration/test_vjepa_smoke.py b/stable_pretraining/tests/integration/test_vjepa_smoke.py new file mode 100644 index 000000000..7f95fe945 --- /dev/null +++ b/stable_pretraining/tests/integration/test_vjepa_smoke.py @@ -0,0 +1,135 @@ +"""Deterministic smoke test for the V-JEPA training pipeline.""" + +import types + +import lightning as pl +import pytest +import torch + +import stable_pretraining as spt +from stable_pretraining.methods.vjepa import VJEPA + + +class SyntheticVideoDataset(torch.utils.data.Dataset): + """Fixed synthetic video dataset for deterministic testing. + + All tensors are pre-generated from a seeded RNG so the dataset is fully + reproducible with no external downloads required. 
+ """ + + def __init__(self, num_samples: int = 64, num_frames: int = 4, seed: int = 0): + rng = torch.Generator() + rng.manual_seed(seed) + self.videos = torch.randn(num_samples, 3, num_frames, 224, 224, generator=rng) + self.labels = torch.randint(0, 10, (num_samples,), generator=rng) + + def __len__(self) -> int: + return len(self.videos) + + def __getitem__(self, idx: int): + return {"video": self.videos[idx], "label": self.labels[idx]} + + +@pytest.mark.integration +@pytest.mark.filterwarnings("ignore:`isinstance.treespec, LeafSpec.` is deprecated") +@pytest.mark.filterwarnings("ignore:.*does not have many workers") +@pytest.mark.filterwarnings("ignore:Trying to infer the `batch_size`") +class TestVJEPASmoke: + """Run VJEPA (vit_tiny) on synthetic video for 3 steps and check determinism.""" + + def test_vjepa_3_steps(self): + """Train VJEPA for 3 steps and assert loss matches expected value.""" + pl.seed_everything(42, workers=True) + + num_frames = 4 + + data = spt.data.DataModule( + train=torch.utils.data.DataLoader( + dataset=SyntheticVideoDataset( + num_samples=64, num_frames=num_frames, seed=0 + ), + batch_size=4, + num_workers=0, + drop_last=True, + shuffle=True, + ), + val=torch.utils.data.DataLoader( + dataset=SyntheticVideoDataset( + num_samples=16, num_frames=num_frames, seed=1 + ), + batch_size=4, + num_workers=0, + ), + ) + + def vjepa_forward(self, batch, stage): + output = VJEPA.forward(self, batch["video"]) + embedding = output.embedding.detach() if self.training else output.embedding + + self.log( + f"{stage}/loss", + output.loss, + on_step=True, + on_epoch=True, + sync_dist=True, + ) + + return { + "loss": output.loss, + "embedding": embedding, + "label": batch["label"].long(), + } + + module = VJEPA( + encoder_name="vit_tiny_patch16_224", + num_frames=num_frames, + predictor_embed_dim=192, + predictor_depth=4, + num_targets=4, + target_scale=(0.15, 0.2), + target_aspect_ratio=(0.75, 1.5), + context_scale=(1.0, 1.0), + ema_decay_start=0.996, + 
ema_decay_end=1.0,
+            pretrained=False,
+        )
+
+        module.forward = types.MethodType(vjepa_forward, module)
+        module.optim = {
+            "optimizer": {
+                "type": "AdamW",
+                "lr": 6e-4,
+                "weight_decay": 0.05,
+                "betas": (0.9, 0.95),
+            },
+            "scheduler": {"type": "LinearWarmupCosineAnnealing"},
+            "interval": "epoch",
+        }
+
+        trainer = pl.Trainer(
+            max_steps=3,
+            num_sanity_val_steps=0,
+            callbacks=[
+                spt.callbacks.TeacherStudentCallback(
+                    update_frequency=1,
+                    update_after_backward=True,
+                ),
+            ],
+            logger=False,
+            enable_checkpointing=False,
+            devices=1,
+            accelerator="cpu",
+            enable_progress_bar=False,
+        )
+
+        manager = spt.Manager(trainer=trainer, module=module, data=data, seed=42)
+        manager()
+
+        final_loss = trainer.callback_metrics.get("fit/loss_step")
+        assert final_loss is not None, "No loss logged"
+        print(f"\nVJEPA final loss after 3 steps: {final_loss.item():.6f}")
+        # NOTE: Value calibrated on the reference run; recalibrate if the pipeline changes.
+        expected = torch.tensor(0.422097)  # calibrated on first run
+        assert torch.isclose(final_loss.cpu(), expected, atol=1e-4), (
+            f"VJEPA loss {final_loss.item():.6f} != expected {expected.item():.6f}"
+        )