PaddlePaddle · Le-soleile · Oct 1, 2025 · Oct 1, 2025
diff --git a/paddle/phi/kernels/legacy/gpu/cal_aux_loss_kernel.cu b/paddle/phi/kernels/legacy/gpu/cal_aux_loss_kernel.cu
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/phi/kernels/legacy/gpu/cal_aux_loss_kernel.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/kernels/funcs/math_cuda_utils.h"
 
 namespace phi {

diff --git a/paddle/phi/kernels/legacy/gpu/cal_aux_loss_kernel.h b/paddle/phi/kernels/legacy/gpu/cal_aux_loss_kernel.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T>  // raw-pointer interface; T is gate_prob's element type
+void cal_aux_loss(const T* gate_prob,
+                  const int64_t row_gate_prob, /*seq_len*/
+                  const int64_t col_gate_prob, /*expert_num*/
+                  const int64_t* dispatch_mask,
+                  const int64_t row_dispatch_mask,
+                  const int64_t col_dispatch_mask,
+                  const T* tokens_mask,
+                  const bool* dispatch_tokens_mask,
+                  const int64_t dispatch_tokens_mask_len, /*global_seq_len*/
+                  const int64_t num_experts,              /*global_num_experts*/
+                  const bool use_group,
+                  const int64_t moe_k,
+                  const float clip_min,
+                  T* l_aux_loss,   /*output*/
+                  T* seqlen_float, /*output? TODO confirm*/
+                  T* ce,           /*output? TODO confirm*/
+                  cudaStream_t stream);
+
+template <typename T, typename Context>  // Context is dev_ctx's type
+void CalAuxLossKernel(const Context& dev_ctx,
+                      const DenseTensor& gate_prob,
+                      const DenseTensor& dispatch_mask,
+                      const paddle::optional<DenseTensor>& tokens_mask,
+                      const paddle::optional<DenseTensor>& dispatch_tokens_mask,
+                      int64_t num_experts,
+                      bool use_group,
+                      int64_t moe_k,
+                      float clip_min,
+                      DenseTensor* l_aux_loss,
+                      DenseTensor* seqlen_float,
+                      DenseTensor* ce);  // last three: outputs? TODO confirm
+
+}  // namespace phi