From 65b838e8dd09cdec1e27398077d4f053a97145d9 Mon Sep 17 00:00:00 2001
From: HangXU <945440358@qq.com>
Date: Wed, 23 Apr 2025 11:47:43 +0800
Subject: [PATCH 1/2] #fix train deepseek-distill-qwen-1.5b on bfloat16 fails
 because np1.26 does not support bfloat16

---
 mindnlp/core/ops/other.py | 88 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 86 insertions(+), 2 deletions(-)

diff --git a/mindnlp/core/ops/other.py b/mindnlp/core/ops/other.py
index 3e2f955d6..d601a8295 100644
--- a/mindnlp/core/ops/other.py
+++ b/mindnlp/core/ops/other.py
@@ -10,6 +10,9 @@
 from .reduction import any
 from .comparison import eq
 
+from mindspore._c_expression import typing
+from mindspore._c_expression.typing import Type
+
 
 # atleast_2d
 
@@ -698,8 +701,89 @@ def masked_fill(input, mask, value):
     masked_fill_ = _get_cache_prim(ops.MaskedFill)()
     return masked_fill_(input, mask, mindspore.tensor(value, dtype=input.dtype))
 
-def finfo(dtype):
-    return np.finfo(mindspore.dtype_to_nptype(dtype))
+def is_complex(weight):
+    return weight.dtype in (mindspore.complex64, mindspore.complex128)
+
+dtype = Type
+float16 = typing.kFloat16
+float32 = typing.kFloat32
+bfloat16 = typing.kBFloat16
+
+bits_map = {
+
+}
+
+min_map = {
+    float32: -3.40282e+38,
+    float16: -65504,
+    bfloat16: -3.38953e+38
+}
+
+max_map = {
+    float32: 3.40282e+38,
+    float16: 65504,
+    bfloat16: 3.38953e+38
+}
+
+eps_map = {
+    float32: 1.19209e-07,
+    float16: 0.000976562,
+    bfloat16: 0.0078125
+}
+
+tiny_map = {
+    float32: 1.17549e-38,
+    float16: 6.10352e-05,
+    bfloat16: 1.17549e-38
+}
+
+smallest_normal_map = {
+    float32: 1.17549e-38,
+    float16: 6.10352e-05,
+    bfloat16: 1.17549e-38
+}
+
+resolution_map = {
+    float32: 1e-06,
+    float16: 0.001,
+    bfloat16: 0.01
+}
+
+class finfo:
+    def __init__(self, dtype):
+        self._dtype = dtype
+
+    @property
+    def bits(self):
+        return bits_map[self._dtype]
+
+    @property
+    def min(self):
+        return min_map[self._dtype]
+
+    @property
+    def max(self):
+        return max_map[self._dtype]
+
+    @property
+    def eps(self):
+        return eps_map[self._dtype]
+
+    @property
+    def tiny(self):
+        return tiny_map[self._dtype]
+
+    @property
+    def smallest_normal(self):
+        return smallest_normal_map[self._dtype]
+
+    @property
+    def resolution(self):
+        return resolution_map[self._dtype]
+
+    @property
+    def dtype(self):
+        return str(self._dtype)
 
 def iinfo(dtype):
     return np.iinfo(mindspore.dtype_to_nptype(dtype))

From 8dd78a12ed35f3033785cb0dbdbf6c3d2af9c6dc Mon Sep 17 00:00:00 2001
From: HangXU <945440358@qq.com>
Date: Tue, 6 May 2025 16:44:35 +0800
Subject: [PATCH 2/2] #fix qwen2 abnormal loss caused by
 SoftmaxCrossEntropyWithLogits on 910A/B

---
 mindnlp/peft/peft_model.py                       |  2 +-
 .../models/qwen2/modeling_qwen2.py               | 47 +++++++++++++------
 2 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/mindnlp/peft/peft_model.py b/mindnlp/peft/peft_model.py
index 402e1076f..4f17d3f1f 100644
--- a/mindnlp/peft/peft_model.py
+++ b/mindnlp/peft/peft_model.py
@@ -125,7 +125,7 @@ def __init__(self, model, peft_config: PeftConfig, adapter_name="default"):
         # if hasattr(self.base_model, "config") and hasattr(self.base_model.config, "pretraining_tp"):
         #     self.base_model.config.pretraining_tp = 1
 
-    def save_pretrained(self, save_directory, safe_serialization=False, **kwargs):
+    def save_pretrained(self, save_directory, safe_serialization=True, **kwargs):
         r"""
         This function saves the adapter model and the adapter configuration files to a directory, so that it can be
         reloaded using the [`LoraModel.from_pretrained`] class method, and also used by the [`LoraModel.push_to_hub`]
diff --git a/mindnlp/transformers/models/qwen2/modeling_qwen2.py b/mindnlp/transformers/models/qwen2/modeling_qwen2.py
index c16c2b905..ac3bd80ca 100644
--- a/mindnlp/transformers/models/qwen2/modeling_qwen2.py
+++ b/mindnlp/transformers/models/qwen2/modeling_qwen2.py
@@ -826,13 +826,22 @@ def forward(
             # Shift so that tokens < n predict n
             shift_logits = logits[..., :-1, :]
             shift_labels = labels[..., 1:]
-            # Flatten the tokens
-            loss_fct = mindspore.ops.SoftmaxCrossEntropyWithLogits()
-            shift_logits = shift_logits.view(-1, self.config.vocab_size)
-            shift_labels = nn.functional.one_hot(shift_labels.view(-1), self.config.vocab_size)
-            # Enable model parallelism
-            loss, _ = loss_fct(shift_logits, shift_labels.to(shift_logits.dtype))
-            loss = loss.mean()
+            if ON_ORANGE_PI:
+                # Flatten the tokens
+                loss_fct = mindspore.ops.SoftmaxCrossEntropyWithLogits()
+                shift_logits = shift_logits.view(-1, self.config.vocab_size)
+                shift_labels = nn.functional.one_hot(shift_labels.view(-1), self.config.vocab_size)
+                # Enable model parallelism
+                loss, _ = loss_fct(shift_logits, shift_labels.to(shift_logits.dtype))
+                loss = loss.mean()
+            else:
+                # Flatten the tokens
+                loss_fct = CrossEntropyLoss()
+                shift_logits = shift_logits.view(-1, self.config.vocab_size)
+                shift_labels = shift_labels.view(-1)
+                # Enable model parallelism
+                loss = loss_fct(shift_logits, shift_labels)
+
 
         if not return_dict:
             output = (logits,) + outputs[1:]
@@ -1004,10 +1013,14 @@ def forward(
                 else:
                     loss = loss_fct(pooled_logits, labels)
             elif self.config.problem_type == "single_label_classification":
-                loss_fct = mindspore.ops.SoftmaxCrossEntropyWithLogits()
-                labels = nn.functional.one_hot(labels.view(-1), self.num_labels)
-                loss, _ = loss_fct(pooled_logits.view(-1, self.num_labels), labels.to(pooled_logits.dtype))
-                loss = loss.mean()
+                if ON_ORANGE_PI:
+                    loss_fct = mindspore.ops.SoftmaxCrossEntropyWithLogits()
+                    labels = nn.functional.one_hot(labels.view(-1), self.num_labels)
+                    loss, _ = loss_fct(pooled_logits.view(-1, self.num_labels), labels.to(pooled_logits.dtype))
+                    loss = loss.mean()
+                else:
+                    loss_fct = CrossEntropyLoss()
+                    loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
             elif self.config.problem_type == "multi_label_classification":
                 loss_fct = BCEWithLogitsLoss()
                 loss = loss_fct(pooled_logits, labels)
@@ -1086,10 +1099,14 @@ def forward(
 
         loss = None
         if labels is not None:
-            loss_fct = mindspore.ops.SoftmaxCrossEntropyWithLogits()
-            labels = nn.functional.one_hot(labels.view(-1), self.num_labels)
-            loss, _= loss_fct(logits.view(-1, self.num_labels), labels.to(logits.dtype))
-            loss = loss.mean()
+            if ON_ORANGE_PI:
+                loss_fct = mindspore.ops.SoftmaxCrossEntropyWithLogits()
+                labels = nn.functional.one_hot(labels.view(-1), self.num_labels)
+                loss, _= loss_fct(logits.view(-1, self.num_labels), labels.to(logits.dtype))
+                loss = loss.mean()
+            else:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
 
         if not return_dict:
             output = (logits,) + outputs[2:]
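
Note on PATCH 1/2 (illustrative, not part of the patch): the new finfo class replaces the
NumPy-backed lookup because NumPy 1.26 has no bfloat16 type, so the limits are hard-coded
per MindSpore dtype. A minimal usage sketch, assuming mindspore.bfloat16 resolves to the
same typing.kBFloat16 key used in the lookup tables; note that bits_map is left empty in
this patch, so finfo(dtype).bits would raise KeyError as written.

    import mindspore
    from mindnlp.core.ops.other import finfo  # module modified by this patch

    # assumes mindspore.bfloat16 is the same dtype object as typing.kBFloat16 above
    neg_fill = finfo(mindspore.bfloat16).min  # -3.38953e+38, e.g. as an attention-mask fill value
    eps = finfo(mindspore.bfloat16).eps       # 0.0078125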
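Note on PATCH 2/2 (illustrative, not part of the patch): off the Orange Pi path, the loss
switches from the one-hot SoftmaxCrossEntropyWithLogits primitive to integer-target
CrossEntropyLoss. A standalone sketch of why the two branches should agree numerically,
written against plain MindSpore APIs rather than the symbols imported in modeling_qwen2.py.

    import numpy as np
    import mindspore
    from mindspore import nn, ops

    vocab_size = 8
    logits = mindspore.Tensor(np.random.randn(4, vocab_size).astype(np.float32))
    labels = mindspore.Tensor([1, 3, 0, 7], mindspore.int32)

    # integer-target path (the new else branch)
    loss_a = nn.CrossEntropyLoss()(logits, labels)

    # one-hot path (kept for ON_ORANGE_PI)
    on_value = mindspore.Tensor(1.0, mindspore.float32)
    off_value = mindspore.Tensor(0.0, mindspore.float32)
    one_hot = ops.one_hot(labels, vocab_size, on_value, off_value)
    loss_b, _ = ops.SoftmaxCrossEntropyWithLogits()(logits, one_hot)
    loss_b = loss_b.mean()

    # loss_a and loss_b should match up to float error as long as no label equals
    # the CrossEntropyLoss ignore_index (-100)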