From 76206446413a1c07006b9e6c023942e0407eabed Mon Sep 17 00:00:00 2001
From: lvyufeng
Date: Mon, 13 Oct 2025 12:01:14 +0800
Subject: [PATCH] fix qwen_vl and load_pretrained patch

---
 mindnlp/transformers/modeling_utils.py | 4 ++--
 mindtorch/nn/functional.py             | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mindnlp/transformers/modeling_utils.py b/mindnlp/transformers/modeling_utils.py
index 9fb31dc4d..b422e7577 100644
--- a/mindnlp/transformers/modeling_utils.py
+++ b/mindnlp/transformers/modeling_utils.py
@@ -184,8 +184,8 @@ def wrapper(
     pretrained_model_name_or_path,
     **kwargs,
 ):
-    device_map = kwargs.pop("device_map", None)
-    sharded_metadata = kwargs.pop("sharded_metadata", None)
+    device_map = kwargs.get("device_map", None)
+    sharded_metadata = kwargs.get("sharded_metadata", None)
 
     # if device_map is not None and not initialize distribute module, raise Error.
     if device_map is not None:
diff --git a/mindtorch/nn/functional.py b/mindtorch/nn/functional.py
index 67e5c2af0..01d0aa12d 100644
--- a/mindtorch/nn/functional.py
+++ b/mindtorch/nn/functional.py
@@ -1220,7 +1220,7 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.
 
     attn_weight = query @ key.transpose(-2, -1) * scale_factor
     attn_weight += attn_bias
-    attn_weight = softmax(attn_weight, dim=-1, dtype=mindtorch.float32).to(query.dtype)
+    attn_weight = softmax(attn_weight, dim=-1)
     attn_weight = dropout(attn_weight, dropout_p, training=True)
     return attn_weight @ value
 
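Note (not part of the patch, illustrative only): the first hunk switches from
kwargs.pop() to kwargs.get(), which reads the value while leaving the key in
kwargs, whereas pop() removes it. A minimal standalone sketch of that dict
behavior, using a hypothetical kwargs value:

    # Hypothetical kwargs dict, for illustration only.
    kwargs = {"device_map": "auto"}

    # get() returns the value and keeps the key available for later callers.
    assert kwargs.get("device_map", None) == "auto"
    assert "device_map" in kwargs

    # pop() returns the value but strips the key from kwargs.
    assert kwargs.pop("device_map", None) == "auto"
    assert "device_map" not in kwargs

The second hunk removes the explicit float32 upcast around softmax, so the
attention weights keep the dtype produced by the query/key matmul instead of
being computed in float32 and cast back.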