
Commit 1539fd6

[BugFix] Set default OMP_NUM_THREADS=3 and fix extra GPU memory usage in DeepSeek (#5219)
* fix bug
* update
* update
* update
* fix copy
* update
1 parent 7dc06ca commit 1539fd6

6 files changed (+29, -16 lines)


fastdeploy/engine/async_llm.py

Lines changed: 1 addition & 1 deletion
@@ -722,7 +722,7 @@ def _setting_environ_variables(self):
             "FLAGS_use_append_attn": 1,
             "NCCL_ALGO": "Ring",
             "FLAGS_max_partition_size": int(os.getenv("FLAGS_max_partition_size", 1024)),
-            "OMP_NUM_THREADS": int(os.getenv("OMP_NUM_THREADS", 3)),
+            "OMP_NUM_THREADS": 3,
         }
         # environment variables needed by Dy2St
         variables.update(

fastdeploy/engine/engine.py

Lines changed: 1 addition & 1 deletion
@@ -453,7 +453,7 @@ def _setting_environ_variables(self):
             "PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION": "python",
             "NCCL_ALGO": "Ring",
             "FLAGS_max_partition_size": int(os.getenv("FLAGS_max_partition_size", 1024)),
-            "OMP_NUM_THREADS": int(os.getenv("OMP_NUM_THREADS", 3)),
+            "OMP_NUM_THREADS": 3,
             "FD_ENABLE_PDL": envs.FD_ENABLE_PDL,
         }
         # environment variables needed by Dy2St
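
Note (not part of the diff): both hunks above make the same change. The engine now pins OMP_NUM_THREADS to 3 in the worker environment instead of treating 3 as an overridable default. A minimal sketch of the difference, assuming ordinary os.environ semantics (the variables name mirrors the code above):

    import os

    # Before: a user-exported OMP_NUM_THREADS wins; 3 is only the fallback.
    old_value = int(os.getenv("OMP_NUM_THREADS", 3))

    # After: workers always receive OMP_NUM_THREADS=3, capping the OpenMP
    # thread pool each worker process creates for CPU-side kernels.
    variables = {"OMP_NUM_THREADS": 3}
    os.environ.update({k: str(v) for k, v in variables.items()})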

fastdeploy/model_executor/layers/linear.py

Lines changed: 15 additions & 7 deletions
@@ -356,10 +356,6 @@ def __init__(
         self.output_sizes = output_sizes

     def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
-        weight_need_transpose = getattr(param, "weight_need_transpose", False)
-        if weight_need_transpose:
-            loaded_weight = get_tensor(loaded_weight).transpose([1, 0])
-
         assert loaded_shard_id in ["q_a", "kv_a"]
         if not param._is_initialized():
             param.initialize()
@@ -385,7 +381,6 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
         else:
             loaded_weight = loaded_weight.cast(param.dtype)
         # (bukejiyu) After this fix, the early H2D copy for non-GPU devices is no longer needed and can be safely removed.
-        loaded_weight = get_tensor(loaded_weight)
         h2d_copy(param, loaded_weight)

@@ -452,7 +447,17 @@ def __init__(
         if self.with_bias:
             # col parallel
             _set_var_distributed(self.bias, split_axis=1)
-            set_weight_attrs(self.bias, {"output_dim": True})
+            set_weight_attrs(
+                self.bias,
+                {
+                    "output_dim": True,
+                    "weight_loader": (
+                        self.weight_loader
+                        if hasattr(self, "weight_loader")
+                        else default_weight_loader(self.fd_config)
+                    ),
+                },
+            )


 class MergedColumnParallelLinear(ColumnParallelLinear):
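
Note (illustrative, not part of the diff): the bias now carries an explicit weight_loader attribute because checkpoint loading resolves a per-parameter loader and falls back to default_weight_loader when none is attached. The loop below is a sketch of that pattern, not the project's actual loading code; the model_params and state_dict names are assumptions:

    from fastdeploy.model_executor.utils import default_weight_loader

    def load_weights_sketch(model_params: dict, state_dict: dict, fd_config):
        for name, param in model_params.items():
            loaded_weight = state_dict[name]
            # Prefer the loader attached via set_weight_attrs (as in the hunk above);
            # otherwise fall back to the generic default_weight_loader.
            loader = getattr(param, "weight_loader", default_weight_loader(fd_config))
            loader(param, loaded_weight)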
@@ -955,7 +960,10 @@ def __init__(
         self.num_heads_per_partition = divide(num_attention_heads, self.tp_size)
         self.local_rank = fd_config.parallel_config.tensor_parallel_rank
         self.fd_config = fd_config
-        self.kv_b_proj = kv_b_proj
+        if self.fd_config.load_config.load_choices == "default_v1":
+            self.kv_b_proj = kv_b_proj
+        else:
+            self.kv_b_proj = None

         self.weight_dtype = self._helper.get_default_dtype()
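
Note (inferred reading, not stated in the diff beyond the commit title's mention of extra GPU memory in DeepSeek): outside the "default_v1" load path the layer no longer keeps its own reference to kv_b_proj, so that projection's weights are not pinned in device memory by this wrapper. A minimal illustration of the reference-counting effect, with hypothetical sizes:

    import paddle

    kv_b_proj_weight = paddle.zeros([512, 4096], dtype="float16")  # hypothetical shape
    layer_refs = {"kv_b_proj": kv_b_proj_weight}

    # Holding the extra reference keeps the allocation alive for the layer's lifetime;
    # dropping it (as the non-default_v1 branch now does) lets Paddle's allocator
    # reclaim the memory once no other reference remains.
    layer_refs["kv_b_proj"] = None
    del kv_b_proj_weight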

fastdeploy/model_executor/layers/utils.py

Lines changed: 4 additions & 1 deletion
@@ -141,7 +141,10 @@ def get_tensor(input: Union[paddle.Tensor, np.ndarray, str], model_path=None) ->

     if isinstance(input, paddle.Tensor):
         if input.place.is_cpu_place():
-            return input.to(paddle.device.get_device())
+            if current_platform.is_cuda():
+                return input.cuda()
+            else:
+                return input.to(paddle.device.get_device())
         return input
     elif isinstance(input, np.ndarray):
         return paddle.to_tensor(input)
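
Note (illustrative only; tensor shape and dtype are arbitrary): on a CUDA build a CPU-resident paddle.Tensor now comes back via Tensor.cuda(), while other platforms keep the previous paddle.device.get_device() route.

    import paddle
    from fastdeploy.model_executor.layers.utils import get_tensor

    cpu_weight = paddle.ones([4, 4], dtype="float32").cpu()  # host-resident tensor
    device_weight = get_tensor(cpu_weight)  # .cuda() on CUDA builds, .to(get_device()) elsewhere
    print(device_weight.place)  # e.g. Place(gpu:0) on a CUDA machine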

fastdeploy/model_executor/models/qwen2_5_vl/dfnrope/modeling.py

Lines changed: 3 additions & 2 deletions
@@ -32,7 +32,7 @@
 from paddleformers.transformers.model_utils import PretrainedModel

 from fastdeploy.model_executor.layers.utils import divide, get_tensor
-from fastdeploy.model_executor.utils import fd_cast, h2d_copy, set_weight_attrs
+from fastdeploy.model_executor.utils import fd_cast, set_weight_attrs

 from .activation import ACT2FN
 from .configuration import DFNRopeVisionTransformerConfig
@@ -151,7 +151,8 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
         assert param.shape == shard_weight.shape, (
             f" Attempted to load weight ({shard_weight.shape}) " f"into parameter ({param.shape})"
         )
-        h2d_copy(param, shard_weight)
+        shard_weight = get_tensor(shard_weight)
+        param.copy_(shard_weight, False)

     def forward(
         self,

fastdeploy/model_executor/utils.py

Lines changed: 5 additions & 4 deletions
@@ -281,7 +281,6 @@ def default_weight_loader(fd_config: FDConfig = None) -> None:

     def fn(param, loaded_weight, shard_id: Optional[Union[int, str]] = None):
         """fn"""
-
         output_dim = getattr(param, "output_dim", None)
         weight_need_transpose = getattr(param, "weight_need_transpose", False)
         if weight_need_transpose:
@@ -310,7 +309,8 @@ def fn(param, loaded_weight, shard_id: Optional[Union[int, str]] = None):
         assert param.shape == loaded_weight.shape, (
             f" Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({param.shape})"
         )
-        h2d_copy(dst=param, src=loaded_weight)
+        loaded_weight = get_tensor(loaded_weight)
+        param.copy_(loaded_weight, False)

     return fn
@@ -369,8 +369,9 @@ def h2d_copy(dst, src, blocking=True):
     if not current_platform.is_cuda() or not is_paddle_support_new_h2d():
         # For non-GPU devices, data is transferred to device (H2D) in advance.
         src = get_tensor(src)
-    if not dst._is_initialized():
-        dst.initialize()
+    if len(src.shape) == 1:
+        # TODO (bukejiyu):A recently merged Paddle PR introduced a hang when copying 1-D non-contiguous tensors. This approach serves as a temporary workaround.
+        src = get_tensor(src)
     dst.copy_(src, blocking)
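
Note (not part of the diff): the rewritten branch keeps the existing non-CUDA H2D path and additionally routes any 1-D source through get_tensor before the copy, sidestepping the hang noted in the TODO. A hedged usage sketch (shapes are arbitrary; h2d_copy and get_tensor are the functions shown in this file's diff):

    import paddle
    from fastdeploy.model_executor.utils import h2d_copy

    dst_param = paddle.zeros([8], dtype="float32")      # device-side destination
    src_host = paddle.ones([8], dtype="float32").cpu()  # 1-D host-side source
    # With this change, a 1-D src is materialized via get_tensor before
    # dst.copy_(src, blocking) runs, avoiding the 1-D non-contiguous copy hang.
    h2d_copy(dst_param, src_host)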
