Skip to content

Commit 65a51cb

Browse files
authored
【开源实习】chatglm-4模型微调 (#1996)
1 parent f7475f6 commit 65a51cb

File tree

2 files changed

+150
-0
lines changed

2 files changed

+150
-0
lines changed
Binary file not shown.
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
import os
2+
import json
3+
import pandas as pd
4+
import numpy as np
5+
import mindspore
6+
from mindspore import nn, Tensor
7+
from mindspore.common.initializer import Normal
8+
from mindspore.train import Model, LossMonitor, TimeMonitor, Callback
9+
from mindspore.train.callback import CheckpointConfig, ModelCheckpoint
10+
from mindspore.nn import AdamWeightDecay
11+
from mindspore.dataset import GeneratorDataset
12+
from mindnlp.transformers import AutoTokenizer, AutoModelForCausalLM
13+
from mindspore import context
14+
from mindnlp.core import no_grad
15+
16+
# Configure the execution context: run on Ascend device 0.
context.set_context(
    device_target="Ascend",
    device_id=0,
)

# All output artifacts must be saved under this directory.
output_path = "./output"
21+
22+
def dataset_jsonl_transfer(origin_path, new_path):
    """Convert a raw classification JSONL file into instruction-style records.

    Every non-blank line of ``origin_path`` must be a JSON object with the
    keys ``text``, ``category`` and ``output``. For each one, a JSON object
    with the keys ``instruction`` (a fixed prompt), ``input`` (the text plus
    the candidate categories) and ``output`` (the label) is written as one
    line of ``new_path``.

    Args:
        origin_path: Path of the source JSONL file.
        new_path: Path of the converted JSONL file; overwritten if present.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``text``, ``category`` or ``output``.
    """
    messages = []

    # Read explicitly as UTF-8 (the output below is written as UTF-8 too)
    # instead of relying on the platform's default encoding.
    with open(origin_path, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                # Tolerate blank lines such as a trailing newline at EOF.
                continue
            data = json.loads(line)
            context_data = data["text"]
            category = data["category"]
            label = data["output"]
            messages.append(
                {
                    "instruction": "你是一个文本分类领域的专家,你会接收到一段文本和几个潜在的分类选项,请输出文本内容的正确类型",
                    "input": f"文本:{context_data},类型选型:{category}",
                    "output": label,
                }
            )

    # The output is opened only after the whole input has been parsed, so a
    # malformed input never leaves a partially written file behind.
    with open(new_path, "w", encoding="utf-8") as file:
        file.writelines(
            json.dumps(message, ensure_ascii=False) + "\n" for message in messages
        )
43+
44+
def process_func(example, tokenizer, max_length=384):
    """Tokenize one record into causal-LM training features.

    The prompt (fixed system text plus ``example['input']``) and the answer
    (``example['output']``) are tokenized separately, concatenated, and
    followed by ``tokenizer.pad_token_id``. Prompt positions in ``labels``
    are set to -100; answer positions and the trailing token keep their ids.

    Args:
        example: Mapping with at least the keys ``input`` and ``output``.
        tokenizer: Callable returning a mapping with ``input_ids`` and
            ``attention_mask`` lists, and exposing ``pad_token_id``.
        max_length: Maximum number of tokens kept; longer sequences are cut
            from the right. Defaults to 384, the previously hard-coded value.

    Returns:
        dict with equally long lists ``input_ids``, ``attention_mask`` and
        ``labels``.
    """
    prompt = tokenizer(
        f"<|system|>\n你是一个文本分类领域的专家,你会接收到一段文本和几个潜在的分类选项,请输出文本内容的正确类型<|endoftext|>\n<|user|>\n{example['input']}<|endoftext|>\n<|assistant|>\n",
        add_special_tokens=False,
    )
    answer = tokenizer(f"{example['output']}", add_special_tokens=False)

    # The pad token is appended after the answer and is attended to (mask 1).
    # NOTE(review): presumably it acts as the end-of-answer marker - confirm.
    tail = [tokenizer.pad_token_id]
    input_ids = prompt["input_ids"] + answer["input_ids"] + tail
    attention_mask = prompt["attention_mask"] + answer["attention_mask"] + [1]
    # NOTE(review): -100 is presumably the loss's ignore index - confirm.
    labels = [-100] * len(prompt["input_ids"]) + answer["input_ids"] + tail

    # Slicing is a no-op for sequences that already fit.
    # NOTE(review): a prompt longer than max_length drops the whole answer,
    # leaving every label at -100.
    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length],
    }
62+
63+
# Load the tokenizer and the model (float16 weights) via the ModelScope mirror.
model_name = "ZhipuAI/glm-4-9b-chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, mirror='modelscope')
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    mirror='modelscope',
    ms_dtype=mindspore.float16
)

# Raw dataset files and the converted files derived from them.
train_dataset_path = "train.jsonl"
test_dataset_path = "test.jsonl"

train_jsonl_new_path = "new_train.jsonl"
test_jsonl_new_path = "new_test.jsonl"

# Convert each raw file once; an existing converted file is reused as-is.
if not os.path.exists(train_jsonl_new_path):
    dataset_jsonl_transfer(train_dataset_path, train_jsonl_new_path)
if not os.path.exists(test_jsonl_new_path):
    # Fixed: this previously called the undefined name
    # `dataset_jsonl_new_path`, which raised NameError.
    dataset_jsonl_transfer(test_dataset_path, test_jsonl_new_path)
81+
82+
# Load the converted training set and keep only the first 1000 records.
train_df = pd.read_json(train_jsonl_new_path, lines=True)
train_df = train_df[:1000]

# Preprocessing: tokenize every record into model features.
train_data = train_df.to_dict(orient='records')
processed_train_data = [process_func(record, tokenizer) for record in train_data]
92+
93+
# Convert the preprocessed records into a format suitable for MindSpore.
def data_generator(dataset):
    """Yield one ``(input_ids, attention_mask, labels)`` tuple per record.

    Args:
        dataset: Iterable of mappings holding the keys ``input_ids``,
            ``attention_mask`` and ``labels``.

    Yields:
        A 3-tuple of ``numpy`` arrays with dtype ``int32``, in that key order.
    """
    columns = ("input_ids", "attention_mask", "labels")
    for record in dataset:
        yield tuple(np.array(record[name], dtype=np.int32) for name in columns)
101+
102+
# The lambda hands GeneratorDataset a fresh generator each time it is called.
# NOTE(review): samples are not batched here - confirm the network accepts
# un-batched 1-D inputs.
train_dataset = GeneratorDataset(
    source=lambda: data_generator(processed_train_data),
    column_names=["input_ids", "attention_mask", "labels"],
)
104+
105+
def forward_fn(input_ids, attention_mask, labels):
    """Run the module-level ``model`` on the given inputs and return its loss.

    All three arguments are forwarded unchanged as keyword arguments; the
    ``loss`` attribute of the model output is returned.
    """
    return model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels,
    ).loss
109+
110+
# Collect the parameters that are flagged as trainable.
params_dict = model.parameters_dict()
trainable_params = [param for param in params_dict.values() if param.requires_grad]

# Fixed: the optimizer was previously given the undefined name
# `unique_params` (NameError); the list built above is the one to optimize.
optimizer = AdamWeightDecay(trainable_params, learning_rate=3e-4)

# Single-step training cell: network + cross-entropy loss + optimizer.
train_net = mindspore.nn.TrainOneStepCell(nn.WithLossCell(model, nn.CrossEntropyLoss()), optimizer)
116+
117+
class EvalCallback(Callback):
    """Training callback that evaluates the model at the end of each epoch.

    Stores the model, the evaluation dataset and the tokenizer it is given;
    ``epoch_end`` runs ``model.eval`` on the dataset and prints the result.
    """

    def __init__(self, model, dataset, tokenizer):
        super().__init__()
        self.model, self.dataset, self.tokenizer = model, dataset, tokenizer

    def epoch_end(self, run_context):
        """Evaluate on the stored dataset and print the metrics with the epoch number."""
        params = run_context.original_args()
        eval_results = self.model.eval(self.dataset)
        print(f"Epoch {params.cur_epoch_num} eval results: {eval_results}")
128+
129+
# Create the high-level training wrapper.
# NOTE: this rebinds the module-level name `model` from the raw network to
# the Model wrapper; code that looks up `model` afterwards sees the wrapper.
# NOTE(review): the dataset yields three columns while a plain
# CrossEntropyLoss is attached here - confirm this pairing is intended.
model = Model(
    model,
    loss_fn=nn.CrossEntropyLoss(),
    optimizer=optimizer,
    metrics={'loss'},
)

# Save a checkpoint every 50 steps and keep at most 5 of them in output_path.
config_ck = CheckpointConfig(save_checkpoint_steps=50, keep_checkpoint_max=5)
ckpoint_cb = ModelCheckpoint(prefix="glm4-9b", directory=output_path, config=config_ck)

# Fixed: `Model.train` has no `epoch_size` keyword (its first parameter is
# `epoch`), so the epoch count is passed positionally.
model.train(
    1,
    train_dataset,
    callbacks=[TimeMonitor(), LossMonitor(), ckpoint_cb],
)
137+
138+
# Load the converted test set and keep only the first 10 records.
test_df = pd.read_json(test_jsonl_new_path, lines=True)
test_df = test_df[:10]
test_data = test_df.to_dict(orient='records')

# Tokenize the test records the same way as the training records.
processed_test_data = [process_func(record, tokenizer) for record in test_data]

test_dataset = GeneratorDataset(
    source=lambda: data_generator(processed_test_data),
    column_names=["input_ids", "attention_mask", "labels"],
)

# Evaluate on the test set and report the metrics.
eval_results = model.eval(test_dataset)
print(f"评估结果: {eval_results}")

0 commit comments

Comments
 (0)