@@ -31,31 +31,34 @@ void batch_prefill(const torch::Tensor& query,
                   const torch::Tensor& mask,
                   const torch::Tensor& seq_len,
                   float scale,
-                   int num_heads,
-                   int num_kv_heads,
                   torch::Tensor& output) {
+  auto num_heads = query.size(-2);
+  auto num_kv_heads = key.size(-2);
   atb::_npu_flash_attention(
       query, key, value, mask, seq_len, scale, num_heads, num_kv_heads, output);
 }
 
 void batch_decode(const torch::Tensor& query,
                  const torch::Tensor& k_cache,
                  const torch::Tensor& v_cache,
-                  int num_kv_heads,
-                  int num_heads,
                  float scale,
                  const torch::Tensor& block_table,
                  const torch::Tensor& seq_lens,
                  torch::Tensor& output) {
-  atb::_npu_paged_attention(query,
+  auto head_size = query.size(-1);
+  auto num_heads = query.size(-2);
+  auto num_kv_heads = k_cache.size(-2);
+  auto q = query.view({-1, num_heads, head_size});
+  auto o = output.view({-1, num_heads, head_size});
+  atb::_npu_paged_attention(q,
                            k_cache,
                            v_cache,
                            num_kv_heads,
                            num_heads,
                            scale,
                            block_table,
                            seq_lens,
-                            output);
+                            o);
 }
 
 }  // namespace xllm::kernel::npu
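
For context, a minimal sketch of how a `batch_prefill` call site simplifies now that the head counts are derived from `size(-2)` instead of being passed in. The `key`/`value` parameters in the declaration are inferred from the function body, and all shapes and values below are illustrative assumptions, not taken from the PR:

```cpp
#include <cmath>
#include <torch/torch.h>

// Declaration matching the post-change signature; key/value parameters are
// inferred from the call inside batch_prefill and may differ in the header.
namespace xllm::kernel::npu {
void batch_prefill(const torch::Tensor& query,
                   const torch::Tensor& key,
                   const torch::Tensor& value,
                   const torch::Tensor& mask,
                   const torch::Tensor& seq_len,
                   float scale,
                   torch::Tensor& output);
}  // namespace xllm::kernel::npu

int main() {
  // Assumed layout: query is [num_tokens, num_heads, head_size] and
  // key/value are [num_tokens, num_kv_heads, head_size] (GQA), so that
  // size(-2) yields the head counts callers previously passed explicitly.
  auto query = torch::randn({256, 32, 128});
  auto key = torch::randn({256, 8, 128});
  auto value = torch::randn({256, 8, 128});
  auto mask = torch::ones({256, 256});
  auto seq_len = torch::tensor({256}, torch::kInt32);
  auto output = torch::empty_like(query);
  float scale = 1.0f / std::sqrt(128.0f);

  // Before: batch_prefill(query, key, value, mask, seq_len, scale,
  //                       /*num_heads=*/32, /*num_kv_heads=*/8, output);
  // After: head counts are read from the tensor shapes inside the function.
  xllm::kernel::npu::batch_prefill(query, key, value, mask, seq_len, scale,
                                   output);
  return 0;
}
```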
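On the `batch_decode` side, the change additionally flattens `query` and `output` before calling `atb::_npu_paged_attention`. A small sketch of what that `view` does, assuming a 4-D decode-time layout of `[num_seqs, 1, num_heads, head_size]` (the exact input rank is an assumption; the diff only requires the last two dimensions to be heads and head size):

```cpp
#include <cassert>
#include <torch/torch.h>

int main() {
  // Assumed decode layout: one new token per sequence.
  auto query = torch::randn({4, 1, 32, 128});
  auto head_size = query.size(-1);  // 128
  auto num_heads = query.size(-2);  // 32

  // view({-1, num_heads, head_size}) collapses every leading dimension into
  // a single token axis, producing the 3-D tensor that the paged-attention
  // call in this diff receives.
  auto q = query.view({-1, num_heads, head_size});
  assert(q.size(0) == 4 && q.size(1) == 32 && q.size(2) == 128);
  return 0;
}
```

The same reshape is applied to `output`, so the kernel writes into a 3-D view of the caller's buffer and no copy-back is needed.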