7 changes: 7 additions & 0 deletions vllm/config/vllm.py
@@ -538,6 +538,13 @@ def __post_init__(self):
             self.compilation_config.cudagraph_mode.has_full_cudagraphs()
         )

+        # enable mrope custom op with triton kernel when model uses mrope
+        if (
+            self.model_config.uses_mrope
+            and "-mrope" not in self.compilation_config.custom_ops
+        ):
+            self.compilation_config.custom_ops.append("+mrope")
+
         if self.parallel_config.enable_dbo:
             a2a_backend = self.parallel_config.all2all_backend
             assert a2a_backend in ["deepep_low_latency", "deepep_high_throughput"], (
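For context: entries in compilation_config.custom_ops follow a "+name"/"-name" prefix convention, where "-mrope" is an explicit user opt-out, so the hunk above appends "+mrope" only when the model uses mrope and the user has not disabled the op. Below is a minimal sketch of that toggle convention; the helper is_op_enabled is hypothetical and not part of vLLM's API:

def is_op_enabled(custom_ops: list[str], name: str, default: bool = False) -> bool:
    # Hypothetical helper illustrating the "+name"/"-name" convention used
    # by compilation_config.custom_ops; here an explicit "-name" wins.
    if f"-{name}" in custom_ops:
        return False
    if f"+{name}" in custom_ops:
        return True
    return default

custom_ops: list[str] = []
if "-mrope" not in custom_ops:  # mirrors the __post_init__ hunk above
    custom_ops.append("+mrope")
assert is_op_enabled(custom_ops, "mrope")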
2 changes: 2 additions & 0 deletions vllm/model_executor/layers/rotary_embedding/mrope.py
@@ -5,6 +5,7 @@
 import numpy as np
 import torch

+from vllm.model_executor.custom_op import CustomOp
 from vllm.triton_utils import tl, triton

 from .base import RotaryEmbedding
@@ -199,6 +200,7 @@ def apply_interleaved_rope(x: torch.Tensor, mrope_section: list[int]) -> torch.Tensor:
     return x_t


+@CustomOp.register(name="mrope")
 class MRotaryEmbedding(RotaryEmbedding):
     """Rotary Embedding with Multimodal Sections."""

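Registering the class under the name "mrope" (second hunk) is what lets the config hunk above toggle it: CustomOp keeps a registry keyed by op name and dispatches to the custom kernel only when the op is enabled. A rough, simplified sketch of that register-and-dispatch pattern follows; forward_native/forward_cuda mirror vLLM's naming convention, but the registry and dispatch logic here are illustrative stand-ins, not vLLM's exact implementation:

import torch

class CustomOp:
    # Simplified registry-and-dispatch sketch; not vLLM's actual class.
    op_registry: dict[str, type] = {}
    enabled_ops: set[str] = set()  # stand-in for compilation_config.custom_ops

    @classmethod
    def register(cls, name: str):
        def decorator(op_cls: type) -> type:
            cls.op_registry[name] = op_cls
            op_cls.op_name = name
            return op_cls
        return decorator

    def forward(self, *args, **kwargs):
        # Dispatch: custom Triton/CUDA kernel if enabled, else PyTorch fallback.
        if getattr(self, "op_name", None) in self.enabled_ops:
            return self.forward_cuda(*args, **kwargs)
        return self.forward_native(*args, **kwargs)

@CustomOp.register(name="mrope")
class MRopeSketch(CustomOp):
    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        return x  # placeholder for the pure-PyTorch rotary path

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        return x  # placeholder for the Triton mrope kernel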