import os
import json
import pandas as pd
import numpy as np
import mindspore
from mindspore import nn
from mindspore.train import Model, LossMonitor, TimeMonitor, Callback
from mindspore.train.callback import CheckpointConfig, ModelCheckpoint
from mindspore.nn import AdamWeightDecay
from mindspore.dataset import GeneratorDataset
from mindnlp.transformers import AutoTokenizer, AutoModelForCausalLM
from mindspore import context
from mindnlp.core import no_grad

# Set the execution context to use an Ascend device
context.set_context(device_target="Ascend", device_id=0)

# All outputs must be saved under this directory
output_path = "./output"

def dataset_jsonl_transfer(origin_path, new_path):
    """Convert the raw JSONL dataset into instruction-tuning records."""
    messages = []

    # Read the original JSONL file
    with open(origin_path, "r") as file:
        for line in file:
            # Parse the JSON object on each line
            data = json.loads(line)
            context_data = data["text"]
            category = data["category"]
            label = data["output"]
            # The instruction and input are kept in Chinese: they are the
            # literal prompt text the model is fine-tuned on
            message = {
                "instruction": "你是一个文本分类领域的专家,你会接收到一段文本和几个潜在的分类选项,请输出文本内容的正确类型",
                "input": f"文本:{context_data},类型选型:{category}",
                "output": label,
            }
            messages.append(message)

    with open(new_path, "w", encoding="utf-8") as file:
        for message in messages:
            file.write(json.dumps(message, ensure_ascii=False) + "\n")

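# Illustrative record shapes (assumed from the field names above; the actual
# dataset values may differ): an input line like
#   {"text": "...", "category": "[...]", "output": "..."}
# becomes an instruction-tuning record
#   {"instruction": "...", "input": "文本:...,类型选型:[...]", "output": "..."}
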
def process_func(example, tokenizer):
    """Tokenize one example into input_ids, attention_mask and labels."""
    MAX_LENGTH = 384
    instruction = tokenizer(
        f"<|system|>\n你是一个文本分类领域的专家,你会接收到一段文本和几个潜在的分类选项,请输出文本内容的正确类型<|endoftext|>\n<|user|>\n{example['input']}<|endoftext|>\n<|assistant|>\n",
        add_special_tokens=False,
    )
    response = tokenizer(f"{example['output']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = (
        instruction["attention_mask"] + response["attention_mask"] + [1]
    )
    # Mask the prompt tokens with -100 so only the response contributes to the loss
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
    if len(input_ids) > MAX_LENGTH:  # truncate to the maximum length
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

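# Quick sanity check (illustrative only; the sample record below is made up
# and needs real text filled in before running):
# sample = {"input": "文本:..., 类型选型:...", "output": "..."}
# encoded = process_func(sample, tokenizer)
# assert len(encoded["input_ids"]) == len(encoded["labels"]) <= 384
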
# Load the model and tokenizer from the ModelScope mirror
model_name = "ZhipuAI/glm-4-9b-chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, mirror='modelscope')
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    mirror='modelscope',
    ms_dtype=mindspore.float16
)
train_dataset_path = "train.jsonl"
test_dataset_path = "test.jsonl"

train_jsonl_new_path = "new_train.jsonl"
test_jsonl_new_path = "new_test.jsonl"

if not os.path.exists(train_jsonl_new_path):
    dataset_jsonl_transfer(train_dataset_path, train_jsonl_new_path)
if not os.path.exists(test_jsonl_new_path):
    dataset_jsonl_transfer(test_dataset_path, test_jsonl_new_path)

# Build the training set (capped at 1000 samples)
train_df = pd.read_json(train_jsonl_new_path, lines=True)
train_df = train_df[:1000]

# Preprocess the data
train_data = train_df.to_dict(orient='records')
processed_train_data = []
for example in train_data:
    processed_example = process_func(example, tokenizer)
    processed_train_data.append(processed_example)

# Convert to a format suitable for MindSpore
def data_generator(dataset):
    for item in dataset:
        yield (
            np.array(item["input_ids"], dtype=np.int32),
            np.array(item["attention_mask"], dtype=np.int32),
            np.array(item["labels"], dtype=np.int32)
        )

train_dataset = GeneratorDataset(source=lambda: data_generator(processed_train_data),
                                 column_names=["input_ids", "attention_mask", "labels"])

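# The generator yields variable-length, unbatched samples. To train with a
# batch size > 1 they would need padding; a sketch, assuming a MindSpore 2.x
# padded_batch() that takes a column -> (pad_shape, pad_value) mapping and
# pads to the longest sample in the batch when pad_shape is None:
# train_dataset = train_dataset.padded_batch(
#     4,
#     pad_info={
#         "input_ids": (None, tokenizer.pad_token_id),
#         "attention_mask": (None, 0),
#         "labels": (None, -100),
#     },
# )
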
# Loss computed by the causal LM itself (used by the optional functional-style
# training step sketched below)
def forward_fn(input_ids, attention_mask, labels):
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    return loss

# Collect only the trainable parameters for the optimizer
params_dict = model.parameters_dict()
trainable_params = [param for param in params_dict.values() if param.requires_grad]

optimizer = AdamWeightDecay(trainable_params, learning_rate=3e-4)

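# Optional functional-style training step (a minimal sketch, assuming the
# MindSpore 2.x mindspore.value_and_grad API; not invoked here because the
# high-level Model API below drives training):
grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters)

def train_step(input_ids, attention_mask, labels):
    loss, grads = grad_fn(input_ids, attention_mask, labels)
    optimizer(grads)
    return loss
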
class EvalCallback(Callback):
    """Run evaluation on a held-out dataset at the end of each epoch."""

    def __init__(self, model, dataset, tokenizer):
        super(EvalCallback, self).__init__()
        self.model = model
        self.dataset = dataset
        self.tokenizer = tokenizer

    def epoch_end(self, run_context):
        cb_params = run_context.original_args()
        eval_results = self.model.eval(self.dataset)
        print(f"Epoch {cb_params.cur_epoch_num} eval results: {eval_results}")

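# Note: EvalCallback is not wired into training below because the eval dataset
# is only built after training; once it exists, the callback could be added as
# EvalCallback(trainer, test_dataset, tokenizer) in the callbacks list.
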
# Wrap the network in the high-level Model API for training and evaluation;
# a separate name keeps the underlying network `model` from being shadowed
trainer = Model(model, loss_fn=nn.CrossEntropyLoss(), optimizer=optimizer, metrics={'loss'})

config_ck = CheckpointConfig(save_checkpoint_steps=50, keep_checkpoint_max=5)
ckpoint_cb = ModelCheckpoint(prefix="glm4-9b", directory=output_path, config=config_ck)

trainer.train(epoch_size=1, train_dataset=train_dataset, callbacks=[TimeMonitor(), LossMonitor(), ckpoint_cb])

# Build a small test set and evaluate the fine-tuned model
test_df = pd.read_json(test_jsonl_new_path, lines=True)[:10]
test_data = test_df.to_dict(orient='records')

processed_test_data = []
for example in test_data:
    processed_example = process_func(example, tokenizer)
    processed_test_data.append(processed_example)

test_dataset = GeneratorDataset(source=lambda: data_generator(processed_test_data),
                                column_names=["input_ids", "attention_mask", "labels"])

eval_results = trainer.eval(test_dataset)
print(f"Evaluation results: {eval_results}")
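
# Optional spot-check of a single prediction after fine-tuning (a minimal
# sketch; assumes mindnlp's generate() mirrors the HuggingFace transformers
# API, and the "..." placeholders must be replaced with real text):
# prompt = "<|system|>\n你是一个文本分类领域的专家,你会接收到一段文本和几个潜在的分类选项,请输出文本内容的正确类型<|endoftext|>\n<|user|>\n文本:...,类型选型:...<|endoftext|>\n<|assistant|>\n"
# inputs = tokenizer(prompt, return_tensors="ms")
# with no_grad():
#     generated = model.generate(**inputs, max_new_tokens=16)
# print(tokenizer.decode(generated[0], skip_special_tokens=True))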