
Commit 1539fd6

[BugFix] Set default OMP_NUM_THREADS=3 and fix extra GPU memory usage in DeepSeek (#5219)
* fix bug
* update
* update
* update
* fix copy
* update
1 parent 7dc06ca commit 1539fd6

6 files changed (+29, -16 lines)


fastdeploy/engine/async_llm.py

Lines changed: 1 addition & 1 deletion
@@ -722,7 +722,7 @@ def _setting_environ_variables(self):
             "FLAGS_use_append_attn": 1,
             "NCCL_ALGO": "Ring",
             "FLAGS_max_partition_size": int(os.getenv("FLAGS_max_partition_size", 1024)),
-            "OMP_NUM_THREADS": int(os.getenv("OMP_NUM_THREADS", 3)),
+            "OMP_NUM_THREADS": 3,
         }
         # environment variables needed by Dy2St
         variables.update(

fastdeploy/engine/engine.py

Lines changed: 1 addition & 1 deletion
@@ -453,7 +453,7 @@ def _setting_environ_variables(self):
             "PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION": "python",
             "NCCL_ALGO": "Ring",
             "FLAGS_max_partition_size": int(os.getenv("FLAGS_max_partition_size", 1024)),
-            "OMP_NUM_THREADS": int(os.getenv("OMP_NUM_THREADS", 3)),
+            "OMP_NUM_THREADS": 3,
             "FD_ENABLE_PDL": envs.FD_ENABLE_PDL,
         }
         # environment variables needed by Dy2St
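
Note (not part of the diff): both hunks above make the same change. The engine now pins OMP_NUM_THREADS to 3 in the worker environment instead of treating 3 as an overridable default. A minimal sketch of the difference, assuming ordinary os.environ semantics (the variables name mirrors the code above):

    import os

    # Before: a user-exported OMP_NUM_THREADS wins; 3 is only the fallback.
    old_value = int(os.getenv("OMP_NUM_THREADS", 3))

    # After: workers always receive OMP_NUM_THREADS=3, capping the OpenMP
    # thread pool each worker process creates for CPU-side kernels.
    variables = {"OMP_NUM_THREADS": 3}
    os.environ.update({k: str(v) for k, v in variables.items()})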

fastdeploy/model_executor/layers/linear.py

Lines changed: 15 additions & 7 deletions
@@ -356,10 +356,6 @@ def __init__(
         self.output_sizes = output_sizes

     def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
-        weight_need_transpose = getattr(param, "weight_need_transpose", False)
-        if weight_need_transpose:
-            loaded_weight = get_tensor(loaded_weight).transpose([1, 0])
-
         assert loaded_shard_id in ["q_a", "kv_a"]
         if not param._is_initialized():
             param.initialize()
@@ -385,7 +381,6 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
         else:
             loaded_weight = loaded_weight.cast(param.dtype)
         # (bukejiyu) After this fix, the early H2D copy for non-GPU devices is no longer needed and can be safely removed.
-        loaded_weight = get_tensor(loaded_weight)
         h2d_copy(param, loaded_weight)

@@ -452,7 +447,17 @@ def __init__(
         if self.with_bias:
             # col parallel
             _set_var_distributed(self.bias, split_axis=1)
-            set_weight_attrs(self.bias, {"output_dim": True})
+            set_weight_attrs(
+                self.bias,
+                {
+                    "output_dim": True,
+                    "weight_loader": (
+                        self.weight_loader
+                        if hasattr(self, "weight_loader")
+                        else default_weight_loader(self.fd_config)
+                    ),
+                },
+            )


 class MergedColumnParallelLinear(ColumnParallelLinear):
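
Note (illustrative, not part of the diff): the bias now carries an explicit weight_loader attribute because checkpoint loading resolves a per-parameter loader and falls back to default_weight_loader when none is attached. The loop below is a sketch of that pattern, not the project's actual loading code; the model_params and state_dict names are assumptions:

    from fastdeploy.model_executor.utils import default_weight_loader

    def load_weights_sketch(model_params: dict, state_dict: dict, fd_config):
        for name, param in model_params.items():
            loaded_weight = state_dict[name]
            # Prefer the loader attached via set_weight_attrs (as in the hunk above);
            # otherwise fall back to the generic default_weight_loader.
            loader = getattr(param, "weight_loader", default_weight_loader(fd_config))
            loader(param, loaded_weight)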
@@ -955,7 +960,10 @@ def __init__(
         self.num_heads_per_partition = divide(num_attention_heads, self.tp_size)
         self.local_rank = fd_config.parallel_config.tensor_parallel_rank
         self.fd_config = fd_config
-        self.kv_b_proj = kv_b_proj
+        if self.fd_config.load_config.load_choices == "default_v1":
+            self.kv_b_proj = kv_b_proj
+        else:
+            self.kv_b_proj = None

         self.weight_dtype = self._helper.get_default_dtype()
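
Note (inferred reading, not stated in the diff beyond the commit title's mention of extra GPU memory in DeepSeek): outside the "default_v1" load path the layer no longer keeps its own reference to kv_b_proj, so that projection's weights are not pinned in device memory by this wrapper. A minimal illustration of the reference-counting effect, with hypothetical sizes:

    import paddle

    kv_b_proj_weight = paddle.zeros([512, 4096], dtype="float16")  # hypothetical shape
    layer_refs = {"kv_b_proj": kv_b_proj_weight}

    # Holding the extra reference keeps the allocation alive for the layer's lifetime;
    # dropping it (as the non-default_v1 branch now does) lets Paddle's allocator
    # reclaim the memory once no other reference remains.
    layer_refs["kv_b_proj"] = None
    del kv_b_proj_weight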

fastdeploy/model_executor/layers/utils.py

Lines changed: 4 additions & 1 deletion
@@ -141,7 +141,10 @@ def get_tensor(input: Union[paddle.Tensor, np.ndarray, str], model_path=None) ->

     if isinstance(input, paddle.Tensor):
         if input.place.is_cpu_place():
-            return input.to(paddle.device.get_device())
+            if current_platform.is_cuda():
+                return input.cuda()
+            else:
+                return input.to(paddle.device.get_device())
         return input
     elif isinstance(input, np.ndarray):
         return paddle.to_tensor(input)
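
Note (illustrative only; tensor shape and dtype are arbitrary): on a CUDA build a CPU-resident paddle.Tensor now comes back via Tensor.cuda(), while other platforms keep the previous paddle.device.get_device() route.

    import paddle
    from fastdeploy.model_executor.layers.utils import get_tensor

    cpu_weight = paddle.ones([4, 4], dtype="float32").cpu()  # host-resident tensor
    device_weight = get_tensor(cpu_weight)  # .cuda() on CUDA builds, .to(get_device()) elsewhere
    print(device_weight.place)  # e.g. Place(gpu:0) on a CUDA machine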

fastdeploy/model_executor/models/qwen2_5_vl/dfnrope/modeling.py

Lines changed: 3 additions & 2 deletions
@@ -32,7 +32,7 @@
 from paddleformers.transformers.model_utils import PretrainedModel

 from fastdeploy.model_executor.layers.utils import divide, get_tensor
-from fastdeploy.model_executor.utils import fd_cast, h2d_copy, set_weight_attrs
+from fastdeploy.model_executor.utils import fd_cast, set_weight_attrs

 from .activation import ACT2FN
 from .configuration import DFNRopeVisionTransformerConfig
@@ -151,7 +151,8 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
         assert param.shape == shard_weight.shape, (
             f" Attempted to load weight ({shard_weight.shape}) " f"into parameter ({param.shape})"
         )
-        h2d_copy(param, shard_weight)
+        shard_weight = get_tensor(shard_weight)
+        param.copy_(shard_weight, False)

     def forward(
         self,

fastdeploy/model_executor/utils.py

Lines changed: 5 additions & 4 deletions
@@ -281,7 +281,6 @@ def default_weight_loader(fd_config: FDConfig = None) -> None:

     def fn(param, loaded_weight, shard_id: Optional[Union[int, str]] = None):
         """fn"""
-
         output_dim = getattr(param, "output_dim", None)
         weight_need_transpose = getattr(param, "weight_need_transpose", False)
         if weight_need_transpose:
@@ -310,7 +309,8 @@ def fn(param, loaded_weight, shard_id: Optional[Union[int, str]] = None):
         assert param.shape == loaded_weight.shape, (
             f" Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({param.shape})"
         )
-        h2d_copy(dst=param, src=loaded_weight)
+        loaded_weight = get_tensor(loaded_weight)
+        param.copy_(loaded_weight, False)

     return fn
@@ -369,8 +369,9 @@ def h2d_copy(dst, src, blocking=True):
     if not current_platform.is_cuda() or not is_paddle_support_new_h2d():
         # For non-GPU devices, data is transferred to device (H2D) in advance.
         src = get_tensor(src)
-    if not dst._is_initialized():
-        dst.initialize()
+    if len(src.shape) == 1:
+        # TODO (bukejiyu):A recently merged Paddle PR introduced a hang when copying 1-D non-contiguous tensors. This approach serves as a temporary workaround.
+        src = get_tensor(src)
     dst.copy_(src, blocking)
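
Note (not part of the diff): the rewritten branch keeps the existing non-CUDA H2D path and additionally routes any 1-D source through get_tensor before the copy, sidestepping the hang noted in the TODO. A hedged usage sketch (shapes are arbitrary; h2d_copy and get_tensor are the functions shown in this file's diff):

    import paddle
    from fastdeploy.model_executor.utils import h2d_copy

    dst_param = paddle.zeros([8], dtype="float32")      # device-side destination
    src_host = paddle.ones([8], dtype="float32").cpu()  # 1-D host-side source
    # With this change, a 1-D src is materialized via get_tensor before
    # dst.copy_(src, blocking) runs, avoiding the 1-D non-contiguous copy hang.
    h2d_copy(dst_param, src_host)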
