Commit 7c1a09e

Author: K11OntheBoat
Commit message: clean some log info
1 parent 4a11a68 · commit 7c1a09e

9 files changed: +3, -90 lines


examples/splitwise/stop.sh

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 pkill -9 -f python
 pkill -9 -f fastdeploy
 pkill -9 -f gunicorn
+# Kill redis-server if you need.
 #pkill -9 -f redis-server

 sleep 1

fastdeploy/cache_manager/cache_messager.py

Lines changed: 0 additions & 3 deletions
@@ -204,7 +204,6 @@ def __init__(

             elif protocol == "rdma":
                 logger.info(f"splitwise_role rdma: {self.splitwise_role}, rank: {self.rank}, gpu_id: {gpu_id}")
-                logger.info(f"====RyanDebug, the cache_v_ptr_list is:{cache_v_ptr_list}")
                 self.messager[protocol] = RDMACommManager(
                     splitwise_role,
                     rank,
@@ -217,7 +216,6 @@ def __init__(
                     nranks,
                     rank,
                 )
-                logger.info("===RyanDebug, #218 Finish RDMACommManager create!!!!!!!")

         self.gpu_id = gpu_id
         self.cache_info = dict()
@@ -825,7 +823,6 @@ def main():
     num_extra_layers = speculative_config.num_extra_cache_layer
     key_cache_shape_list = [int(i) for i in args.key_cache_shape.split(",")]
     value_cache_shape_list = []
-    print("===RyanDebug #786 of cache_messager,the args.value_cache_shape is:", args.value_cache_shape)
     if args.value_cache_shape:
         value_cache_shape_list = [int(i) for i in args.value_cache_shape.split(",")]
     total_gpu_blocks = key_cache_shape_list[0]
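
For context on the last hunk: main() parses the key/value cache shapes from comma-separated CLI strings, and the value shape may be empty because MLA-style models carry no value cache. A minimal, self-contained sketch of that parsing logic (the helper name parse_cache_shapes is illustrative, not part of cache_messager.py):

    def parse_cache_shapes(key_cache_shape: str, value_cache_shape: str = ""):
        """Parse comma-separated cache shapes, tolerating an empty value shape (MLA)."""
        key_cache_shape_list = [int(i) for i in key_cache_shape.split(",")]
        value_cache_shape_list = []
        if value_cache_shape:
            value_cache_shape_list = [int(i) for i in value_cache_shape.split(",")]
        total_gpu_blocks = key_cache_shape_list[0]
        return key_cache_shape_list, value_cache_shape_list, total_gpu_blocks

    # Example: a key cache of shape [1024, 8, 64, 128] and no value cache.
    print(parse_cache_shapes("1024,8,64,128"))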

fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_rdma.h

Lines changed: 1 addition & 1 deletion
@@ -149,7 +149,7 @@ class RDMACommunicator {
   struct ibv_pd* g_pd = NULL;  // fd
   int RDMACommunicator_status;  // Communicator status flag
   bool start_client_listener = false;  // Client listener flag
-  bool has_value_cache_;  // MLA doest not have value cache.
+  bool has_value_cache_;  // MLA does not have value cache.
 };

 #endif  // KVCACHE_RDMA_H

fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_rdma.cpp

Lines changed: 1 addition & 13 deletions
@@ -70,17 +70,13 @@ RDMACommunicator::RDMACommunicator(std::string& role,

     // Step 1: Initialize KV cache config
     KVCacheConfig::getInstance().displayConfiguration();
-    printf(
-        "====RyanDebugRDMA, Finish #69 KVCacheConfig::getInstance(). ===== "
-        "\n");

     // Step 2: Initialize KV cache structure
     // Validate and set number of layers
     layer_number = static_cast<int>(local_cache_key_ptr_layer_head_.size());
     if (layer_number <= 0) {
       throw std::runtime_error("Invalid layer number");
     }
-    printf("====RyanDebugRDMA, Finish #77 layer. ===== \n");

     if (local_cache_value_ptr_layer_head_.empty()) {
       has_value_cache_ = false;
@@ -94,20 +90,17 @@ RDMACommunicator::RDMACommunicator(std::string& role,
       }
     }

-    printf("====RyanDebugRDMA, Finish #91 layer. ===== \n");
     // Step 2: Setup cache vectors and pointers
     resize_vectors();
     assign_pointers();

-    printf("====RyanDebugRDMA, Finish #97 layer. ===== \n");
     // Step 3:Initialize the event channel
     rdma_event_channel_epoll_fd = epoll_create1(EPOLL_CLOEXEC);
     if (rdma_event_channel_epoll_fd < 0) {
       throw std::runtime_error("Failed to create epoll fd: " +
                                std::string(strerror(errno)));
     }

-    printf("====RyanDebugRDMA, Finish #105 layer. ===== \n");
     // Start the server thread (if in decode role)
     if (splitwise_role == "decode") {
       std::thread server_thread([this]() {
@@ -119,7 +112,6 @@ RDMACommunicator::RDMACommunicator(std::string& role,
       });
       server_thread.detach();
     }
-    printf("====RyanDebugRDMA, Finish #117 layer. ===== \n");
     RDMACommunicator_status = 1;
     INFO("RDMA communicator initialized successfully");
   } catch (const std::exception& e) {
@@ -884,10 +876,6 @@ bool RDMACommunicator::server_mr_register_per_layer(RdmaContext* ctx) {
     }
   }

-  // [Fix]: Always assign to ctx->conn, whether or not there is a Value Cache.
-  // If there is no Value Cache, write_cache_value_server_mr_list is empty and
-  // assigning an empty list is safe. If it is not assigned, the vector inside
-  // ctx->conn may be left in an undefined, dirty state.
   ctx->conn.write_cache_key_server_mr_list = write_cache_key_server_mr_list;
   ctx->conn.write_cache_value_server_mr_list = write_cache_value_server_mr_list;

@@ -972,7 +960,7 @@ int RDMACommunicator::write_cache(const std::string& ip,
     cache_key_remote_addr[block_index] = (uint64_t(
         char_ptr + remote_block_ids[block_index] * total_block_size_byte +
         offset_in_block));
-
+
     if (has_value_cache_) {
       char_ptr = static_cast<char*>(
           ctx->conn.write_cache_value_remote_ptr_list[layer_idx]);
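
The deleted comment above documented why server_mr_register_per_layer always copies both MR lists into ctx->conn: when the model has no value cache (MLA), the value list is simply empty, and copying an empty list is safe, whereas skipping the assignment could leave stale state behind. A rough Python sketch of that idea (the Connection class and register_layer helper are hypothetical, not the actual C++ API):

    from dataclasses import dataclass, field

    @dataclass
    class Connection:
        """Hypothetical stand-in for the per-connection state kept in ctx->conn."""
        write_cache_key_server_mr_list: list = field(default_factory=list)
        write_cache_value_server_mr_list: list = field(default_factory=list)

    def register_layer(conn: Connection, key_mrs: list, value_mrs: list) -> None:
        # Always assign both lists; value_mrs is empty when the model has no
        # value cache, and assigning an empty list avoids leaving the previous
        # (possibly stale) contents in place.
        conn.write_cache_key_server_mr_list = key_mrs
        conn.write_cache_value_server_mr_list = value_mrs

    conn = Connection()
    register_layer(conn, key_mrs=["mr_key_0", "mr_key_1"], value_mrs=[])
    print(conn)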

fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py

Lines changed: 0 additions & 6 deletions
@@ -40,16 +40,10 @@ def __init__(
         try:
             import rdma_comm
         except:
-            logger.error(
-                "The installation of the RDMA library failed."
-                "Confirm whether your network card supports RDMA transmission."
-            )
             raise RuntimeError(
                 "The installation of the RDMA library failed."
                 "Confirm whether your network card supports RDMA transmission."
             )
-        logger.info(f" # 499999999 init rdma messager {gpu_id} {rdma_port}")
-        logger.info(f" # == RyanDebug, Decode, the cache_v_ptr_list is: {cache_v_ptr_list}")
         self.messager = rdma_comm.RDMACommunicator(
             splitwise_role,
             gpu_id,
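
The code kept by this hunk follows a common optional-dependency pattern: try to import the native rdma_comm extension and fail fast with an actionable error when it is unavailable. A minimal standalone sketch of that pattern (the error text mirrors the diff; narrowing the bare except to ImportError is this sketch's choice, not the repository's):

    def load_rdma_comm():
        """Import the RDMA extension or raise a clear runtime error."""
        try:
            import rdma_comm  # native extension, present only when RDMA support is built
        except ImportError as exc:
            raise RuntimeError(
                "The installation of the RDMA library failed. "
                "Confirm whether your network card supports RDMA transmission."
            ) from exc
        return rdma_comm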

fastdeploy/config.py

Lines changed: 0 additions & 3 deletions
@@ -307,9 +307,6 @@ def override_name_from_config(self):
         if hasattr(self, "n_routed_experts") and getattr(self, "moe_num_experts") is None:
             self.moe_num_experts = self.n_routed_experts

-        if hasattr(self, "n_routed_experts") and getattr(self, "moe_num_experts") is None:
-            self.moe_num_experts = self.n_routed_experts
-
     def read_from_env(self):
         """
         Read configuration information from environment variables and update the object's attributes.
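
The deleted lines were an exact duplicate of the override directly above them. The surviving pattern maps a checkpoint-specific field name (n_routed_experts) onto the canonical attribute (moe_num_experts) only when the latter is still unset; a minimal sketch of that behaviour outside its real class (the ModelConfig name here is illustrative):

    class ModelConfig:
        def __init__(self, n_routed_experts=None, moe_num_experts=None):
            self.n_routed_experts = n_routed_experts
            self.moe_num_experts = moe_num_experts

        def override_name_from_config(self):
            # Fall back to the checkpoint's field only if the canonical one is unset.
            if hasattr(self, "n_routed_experts") and getattr(self, "moe_num_experts") is None:
                self.moe_num_experts = self.n_routed_experts

    cfg = ModelConfig(n_routed_experts=64)
    cfg.override_name_from_config()
    print(cfg.moe_num_experts)  # 64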

fastdeploy/model_executor/layers/attention/mla_attention_backend.py

Lines changed: 0 additions & 6 deletions
@@ -205,8 +205,6 @@ def init_attention_metadata(self, forward_meta: ForwardMeta):
             self.group_size,
             self.block_size,
         )
-        print("===RyanDebug, after ini attn meta, the max_len_tensor_cpu[1] is:", forward_meta.max_len_tensor_cpu[1])
-        print("===RyanDebug, after ini attn meta, the max_len_tensor_cpu[2] is:", forward_meta.max_len_tensor_cpu[2])
         # MLA
         metadata.max_enc_len_this_time = forward_meta.max_len_tensor_cpu[1]
         metadata.max_dec_len_this_time = forward_meta.max_len_tensor_cpu[2]
@@ -428,10 +426,6 @@ def forward_mixed(
             "none",
             self.max_seq_len,
         )
-        print(
-            "====After write cache, the metadata.kv_signal_data_list[layer.layer_id] is:",
-            metadata.kv_signal_data_list[layer.layer_id],
-        )
         # FA
         fmha_out = self.flash_attn_func(
             q,
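
For context on the two deleted probes: slot 1 of max_len_tensor_cpu is the maximum prefill (encoder) length for this step and slot 2 the maximum decode length, and downstream code (including the DeepSeek-V3 forward below) runs the prefill or decode attention path only when the corresponding slot is non-zero. A toy illustration of that gating, with everything except the two indices made up:

    def run_attention(max_len_tensor_cpu):
        """Run the prefill/decode attention paths only when this step has such tokens."""
        max_enc_len_this_time = max_len_tensor_cpu[1]
        max_dec_len_this_time = max_len_tensor_cpu[2]
        paths = []
        if max_enc_len_this_time:
            paths.append("prefill attention")
        if max_dec_len_this_time:
            paths.append("decode attention")
        return paths

    print(run_attention([0, 128, 0]))  # prefill-only step
    print(run_attention([0, 0, 1]))    # decode-only step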

fastdeploy/model_executor/models/deepseek_v3.py

Lines changed: 0 additions & 54 deletions
@@ -340,17 +340,7 @@ def forward(
         fmha_out = None

         # NOTE: (changwenbin) qkv_a_proj horizontal fusion
-        paddle.device.synchronize()
-        print("==RyanDebug, the hidden_states is:", hidden_states)  # This is an input; we assume it is fine, but a check could be added as well
-        print("==RyanDebug, hidden_states contains NaN:", paddle.any(paddle.isnan(hidden_states)).item())
-
         qkv_a_out = self.qkv_a_proj_with_mqa(hidden_states)
-        paddle.device.synchronize()
-
-        # --- NaN Check Start ---
-        print("===RyanDebug, the qkv_a_out is:", qkv_a_out)
-        print(" >>> RyanDebug, qkv_a_out contains NaN:", paddle.any(paddle.isnan(qkv_a_out)).item())
-        # --- NaN Check End ---

         query, compressed_kv, key_pe = qkv_a_out.split(
             [self.q_lora_rank, self.kv_lora_rank, self.qk_rope_head_dim], axis=-1
@@ -363,13 +353,10 @@

         key_pe.reshape_([-1, 1, self.qk_rope_head_dim])
         query_pe, key_pe = self.rotary_emb(position_ids, query_pe, key_pe)
-        paddle.device.synchronize()

         compressed_kv = self.kv_a_layernorm(compressed_kv)[0]

-        print("===RyanDebug, in #370, forward_meta.max_len_tensor_cpu[1] is:", forward_meta.max_len_tensor_cpu[1])
         if forward_meta.max_len_tensor_cpu[1]:  # max_enc_len_this_time
-            print("===RyanDebug, in #372, forward_meta.max_len_tensor_cpu[1] is:", forward_meta.max_len_tensor_cpu[1])
             key_value = self.kv_b_proj(compressed_kv)
             key_value.reshape_(
                 [
@@ -402,12 +389,8 @@
             fmha_out_prefill = fmha_out_prefill * mask_encoder_batch.cast(fmha_out_prefill.dtype)

             fmha_out = fmha_out_prefill
-            print("====RYanDebug, #404, fmha_out after MLA is: ", fmha_out)

         if forward_meta.max_len_tensor_cpu[2]:  # max_dec_len_this_time
-            print("===RyanDebug, D in dsv3 !!!!=====")
-            paddle.device.synchronize()
-
             q_nope_out = self.kv_b_proj_bmm(query_nope.transpose([1, 0, 2]), proj_type="k").transpose([1, 0, 2])

             q_input = paddle.concat([q_nope_out, query_pe], axis=-1)
@@ -418,18 +401,6 @@
                 ]
             )

-            print("===RyanDebug, the q_input # 435 is:", q_input)
-            print(" >>> RyanDebug, q_input # 435 contains NaN:", paddle.any(paddle.isnan(q_input)).item())
-
-            print("===RyanDebug, the compressed_kv # 435 is:", compressed_kv)
-            print(
-                " >>> RyanDebug, compressed_kv # 435 contains NaN:", paddle.any(paddle.isnan(compressed_kv)).item()
-            )
-
-            print("===RyanDebug, the key_pe # 435 is:", q_input)
-            print(" >>> RyanDebug, key_pe # 435 contains NaN:", paddle.any(paddle.isnan(key_pe)).item())
-
-            paddle.device.synchronize()
             fmha_out_decode = self.mla_attn(
                 q=q_input,
                 k=None,
@@ -439,39 +410,23 @@
                 k_pe=key_pe,
                 forward_meta=forward_meta,
             )
-            paddle.device.synchronize()
-            # --- NaN Check Start ---
-            print("===RyanDebug, the fmha_out_decode # 448 is:", fmha_out_decode)
-            print(
-                " >>> RyanDebug, fmha_out_decode # 448 contains NaN:",
-                paddle.any(paddle.isnan(fmha_out_decode)).item(),
-            )

             fmha_out_decode = fmha_out_decode.reshape([-1, self.num_attention_heads_tp, self.kv_lora_rank]).transpose(
                 [1, 0, 2]
             )

-            paddle.device.synchronize()
-
             fmha_out_decode = (
                 self.kv_b_proj_bmm(fmha_out_decode, proj_type="v")
                 .transpose([1, 0, 2])
                 .reshape([-1, self.num_attention_heads_tp * self.v_head_dim])
             )

-            # --- NaN Check Start ---
-            print("===RyanDebug, the fmha_out_decode is:", fmha_out_decode)
-            print(" >>> RyanDebug, fmha_out_decode contains NaN:", paddle.any(paddle.isnan(fmha_out_decode)).item())
-            # --- NaN Check End ---
-
-            paddle.device.synchronize()
             if fmha_out is None:
                 fmha_out = fmha_out_decode
             else:
                 fmha_out = fmha_out + fmha_out_decode

         output = self.o_proj(fmha_out)
-        paddle.device.synchronize()
         return output

     def load_state_dict(self, state_dict):
@@ -559,19 +514,11 @@ def forward(
             hidden_states, residual_input=residual, forward_meta=forward_meta
         )

-        print("===RyanDebug, the hidden_states before self_attn is :", hidden_states)
         hidden_states = self.self_attn(forward_meta, hidden_states, position_ids, mask_encoder_batch)

-        print("==RyanDebug, #563 hidden_states contains NaN:", paddle.any(paddle.isnan(hidden_states)).item())
-
         hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
-        print("==RyanDebug, #566 hidden_states contains NaN:", paddle.any(paddle.isnan(hidden_states)).item())
         hidden_states = self.mlp(hidden_states)

-        print("===RyanDebug, the hidden_states after mlp is :", hidden_states)
-        print(
-            "==RyanDebug, #570 hidden_states after mlp contains NaN:", paddle.any(paddle.isnan(hidden_states)).item()
-        )
         return hidden_states, residual


@@ -731,7 +678,6 @@ def load_weights(self, weights_iterator) -> None:
         process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers()), self.fd_config)
         for loaded_weight_name, loaded_weight in weights_iterator:
             loaded_weight_name = loaded_weight_name.replace("deepseek_v3", "model")
-            print(f"loaded_weight_name:{loaded_weight_name}")
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in loaded_weight_name:
                     continue
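
Every line removed from deepseek_v3.py is an ad-hoc NaN probe or device synchronize around the MLA attention path. If similar checks are needed again, one way to keep them out of the hot path is an opt-in helper gated by an environment variable; the sketch below is only a suggestion and is not part of this commit (the FD_DEBUG_NAN_CHECK flag and check_nan helper are hypothetical names):

    import os

    import paddle

    _NAN_CHECK = os.getenv("FD_DEBUG_NAN_CHECK", "0") == "1"  # hypothetical opt-in flag

    def check_nan(name: str, tensor: paddle.Tensor) -> None:
        """Report NaNs in a tensor; no-op unless the flag is enabled."""
        if not _NAN_CHECK:
            return
        if paddle.any(paddle.isnan(tensor)).item():
            print(f"[nan-check] {name} contains NaN, shape={tuple(tensor.shape)}")

    # Usage inside a forward pass, instead of scattered prints:
    # check_nan("qkv_a_out", qkv_a_out)
    # check_nan("fmha_out_decode", fmha_out_decode)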

fastdeploy/worker/gpu_model_runner.py

Lines changed: 0 additions & 4 deletions
@@ -1889,10 +1889,6 @@ def _dummy_run(
                     self.forward_meta,
                 )
             else:
-                print(
-                    "===RyanDebug #1813 of model runner, the self.share_inputs[ids_remove_padding] is:",
-                    self.share_inputs["ids_remove_padding"],
-                )
                 model_output = self.model(
                     self.forward_meta.ids_remove_padding,
                     self.forward_meta,
