diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index fa7310f13b03..42b0c1873084 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -538,6 +538,13 @@ def __post_init__(self):
             self.compilation_config.cudagraph_mode.has_full_cudagraphs()
         )
 
+        # Enable the mrope custom op (Triton kernel) unless explicitly disabled.
+        if (
+            self.model_config.uses_mrope
+            and "-mrope" not in self.compilation_config.custom_ops
+        ):
+            self.compilation_config.custom_ops.append("+mrope")
+
         if self.parallel_config.enable_dbo:
             a2a_backend = self.parallel_config.all2all_backend
             assert a2a_backend in ["deepep_low_latency", "deepep_high_throughput"], (
diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py
index d269733083d8..cffe6b875072 100644
--- a/vllm/model_executor/layers/rotary_embedding/mrope.py
+++ b/vllm/model_executor/layers/rotary_embedding/mrope.py
@@ -5,6 +5,7 @@
 import numpy as np
 import torch
 
+from vllm.model_executor.custom_op import CustomOp
 from vllm.triton_utils import tl, triton
 
 from .base import RotaryEmbedding
@@ -199,6 +200,7 @@ def apply_interleaved_rope(x: torch.Tensor, mrope_section: list[int]) -> torch.T
     return x_t
 
 
+@CustomOp.register(name="mrope")
 class MRotaryEmbedding(RotaryEmbedding):
     """Rotary Embedding with Multimodal Sections."""
 
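
Taken together, the two hunks wire mrope into vLLM's `CustomOp` dispatch: the config hook appends `"+mrope"` to `compilation_config.custom_ops` for any model that uses mrope while respecting an explicit `"-mrope"` opt-out, and the decorator registers `MRotaryEmbedding` under the name `"mrope"`. The sketch below is a minimal, self-contained illustration of the `"+name"`/`"-name"` toggle semantics this relies on; the `CustomOp` class and its `enabled_in` helper here are simplified stand-ins written for this example (the real class lives in `vllm/model_executor/custom_op.py` and reads the active vLLM config), not the actual implementation.

```python
# Hypothetical, simplified stand-in for vLLM's CustomOp registry, written
# only to illustrate the "+name"/"-name" toggle semantics this diff relies on.


class CustomOp:
    # Maps op name -> implementing class.
    op_registry: dict[str, type] = {}

    name: str = ""

    @classmethod
    def register(cls, name: str):
        """Decorator that records an op class under `name`."""

        def decorator(op_cls: type) -> type:
            op_cls.name = name
            cls.op_registry[name] = op_cls
            return op_cls

        return decorator

    @classmethod
    def enabled_in(cls, custom_ops: list[str]) -> bool:
        # An explicit "-<name>" always wins; otherwise "+<name>" enables
        # the custom (e.g. Triton) kernel for this op.
        if f"-{cls.name}" in custom_ops:
            return False
        return f"+{cls.name}" in custom_ops


@CustomOp.register(name="mrope")
class MRotaryEmbedding(CustomOp):
    """Stand-in for the real MRotaryEmbedding."""


# Mirrors the config hook above: opt in by default, honor an explicit opt-out.
custom_ops: list[str] = []  # imagine this is compilation_config.custom_ops
if "-mrope" not in custom_ops:
    custom_ops.append("+mrope")

assert MRotaryEmbedding.enabled_in(custom_ops)      # Triton mrope path is on
assert not MRotaryEmbedding.enabled_in(["-mrope"])  # user opt-out is respected
```

Note that the config hook only checks for `"-mrope"` before appending, so a user who passes `"-mrope"` in `custom_ops` keeps the native path while every other mrope model gets the Triton kernel by default.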