Commit f99d082

use new core to adapt huggingface library (#2056)

1 parent 30b1947 commit f99d082

439 files changed: +103053 -383 lines

Some content is hidden: large commits have some files hidden by default.

llm/peft/lora/lora_seq2seq.ipynb

Lines changed: 32 additions & 305 deletions
Large diffs are not rendered by default.

llm/peft/lora/lora_seq2seq.py

Lines changed: 254 additions & 0 deletions
@@ -0,0 +1,254 @@
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import mindspore
from mindnlp.transformers import AutoModelForSeq2SeqLM
from mindnlp.peft import get_peft_model, LoraConfig, TaskType
from mindnlp.core import ops

from mindnlp.transformers import AutoTokenizer
from mindnlp.transformers.optimization import get_linear_schedule_with_warmup
from tqdm import tqdm
from datasets import load_dataset

model_name_or_path = "bigscience/mt0-large"
tokenizer_name_or_path = "bigscience/mt0-large"

checkpoint_name = "financial_sentiment_analysis_lora_v1.ckpt"
max_length = 128
lr = 1e-3
num_epochs = 3
batch_size = 8


# In[ ]:


# creating model
peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
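# Note: LoRA freezes the base mt0-large weights and trains only the injected
# low-rank adapter matrices (rank r=8, scaled by lora_alpha=32), so
# print_trainable_parameters() should report only a small fraction of the
# parameters as trainable.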


# In[ ]:


# loading dataset
dataset = load_dataset("financial_phrasebank", "sentences_allagree")
dataset = dataset["train"].train_test_split(test_size=0.1)
dataset["validation"] = dataset["test"]
del dataset["test"]

classes = dataset["train"].features["label"].names
dataset = dataset.map(
    lambda x: {"text_label": [classes[label] for label in x["label"]]},
    batched=True,
    num_proc=1,
)

dataset["train"][0]


# In[ ]:


print(dataset.source.ds)
classes = dataset.source.ds.features["label"].names
classes


# In[ ]:


train_dataset, validation_dataset = dataset.shuffle(64).split([0.9, 0.1])


# In[ ]:


def add_text_label(sentence, label):
    return sentence, label, classes[label.item()]

train_dataset = train_dataset.map(add_text_label, ['sentence', 'label'], ['sentence', 'label', 'text_label'])
validation_dataset = validation_dataset.map(add_text_label, ['sentence', 'label'], ['sentence', 'label', 'text_label'])


# In[ ]:


next(train_dataset.create_dict_iterator())


# In[ ]:


tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)


# In[ ]:


import numpy as np
from mindnlp.dataset import BaseMapFunction
from threading import Lock
lock = Lock()

class MapFunc(BaseMapFunction):
    def __call__(self, sentence, label, text_label):
        lock.acquire()
        model_inputs = tokenizer(sentence, max_length=max_length, padding="max_length", truncation=True)
        labels = tokenizer(text_label, max_length=3, padding="max_length", truncation=True)
        lock.release()
        labels = labels['input_ids']
        labels = np.where(np.equal(labels, tokenizer.pad_token_id), -100, labels)
        return model_inputs['input_ids'], model_inputs['attention_mask'], labels


def get_dataset(dataset, tokenizer, shuffle=True):
    input_columns = ['sentence', 'label', 'text_label']
    output_columns = ['input_ids', 'attention_mask', 'labels']
    dataset = dataset.map(MapFunc(input_columns, output_columns),
                          input_columns, output_columns)
    if shuffle:
        dataset = dataset.shuffle(64)
    dataset = dataset.batch(batch_size)
    return dataset

train_dataset = get_dataset(train_dataset, tokenizer)
eval_dataset = get_dataset(validation_dataset, tokenizer, shuffle=False)
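# Note: pad positions in the label ids are replaced with -100 above because that is
# the ignore index of the cross-entropy loss in Hugging Face-style seq2seq models,
# so padding does not contribute to the loss. The Lock around the tokenizer guards it
# in case the dataset map runs MapFunc from multiple threads.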


# In[ ]:


next(train_dataset.create_dict_iterator())


# In[ ]:


from mindnlp.core import optim
# optimizer and lr scheduler
optimizer = optim.AdamW(model.trainable_params(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataset) * num_epochs),
)


# In[ ]:


from mindnlp.core import value_and_grad
# training and evaluation
def forward_fn(**batch):
    outputs = model(**batch)
    loss = outputs.loss
    return loss

grad_fn = value_and_grad(forward_fn, model.trainable_params())

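# Note: grad_fn returns the loss and produces gradients for model.trainable_params(),
# following the torch-like pattern of mindnlp.core: zero_grad(), then grad_fn(**batch),
# then optimizer.step() to apply the accumulated gradients.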
for epoch in range(num_epochs):
156+
model.set_train()
157+
total_loss = 0
158+
train_total_size = train_dataset.get_dataset_size()
159+
for step, batch in enumerate(tqdm(train_dataset.create_dict_iterator(), total=train_total_size)):
160+
optimizer.zero_grad()
161+
loss = grad_fn(**batch)
162+
optimizer.step()
163+
total_loss += loss.float()
164+
lr_scheduler.step()
165+
166+
model.set_train(False)
167+
eval_loss = 0
168+
eval_preds = []
169+
eval_total_size = eval_dataset.get_dataset_size()
170+
for step, batch in enumerate(tqdm(eval_dataset.create_dict_iterator(), total=eval_total_size)):
171+
with mindspore._no_grad():
172+
outputs = model(**batch)
173+
loss = outputs.loss
174+
eval_loss += loss.float()
175+
eval_preds.extend(
176+
tokenizer.batch_decode(ops.argmax(outputs.logits, -1).asnumpy(), skip_special_tokens=True)
177+
)
178+
179+
eval_epoch_loss = eval_loss / len(eval_dataset)
180+
eval_ppl = ops.exp(eval_epoch_loss)
181+
train_epoch_loss = total_loss / len(train_dataset)
182+
train_ppl = ops.exp(train_epoch_loss)
183+
print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
184+
185+
186+
# In[ ]:
187+
188+
189+
# print accuracy
190+
correct = 0
191+
total = 0
192+
193+
ground_truth = []
194+
195+
for pred, data in zip(eval_preds, validation_dataset.create_dict_iterator(output_numpy=True)):
196+
true = str(data['text_label'])
197+
ground_truth.append(true)
198+
if pred.strip() == true.strip():
199+
correct += 1
200+
total += 1
201+
accuracy = correct / total * 100
202+
print(f"{accuracy=} % on the evaluation dataset")
203+
print(f"{eval_preds[:10]=}")
204+
print(f"{ground_truth[:10]=}")
205+
206+
207+
# In[ ]:
208+
209+
210+
next(eval_dataset.create_tuple_iterator())
211+
212+
213+
# In[ ]:
214+
215+
216+
# saving model
217+
peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"
218+
model.save_pretrained(peft_model_id)
219+
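# Note: save_pretrained on a PEFT model writes only the LoRA adapter weights
# (adapter_model.ckpt), a small fraction of the full model size; the base model is
# reloaded separately and re-wrapped with PeftModel.from_pretrained below.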


# In[ ]:


ckpt = f"{peft_model_id}/adapter_model.ckpt"
get_ipython().system('du -h $ckpt')


# In[ ]:


from mindnlp.peft import PeftModel, PeftConfig

peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"

config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, peft_model_id)


# In[ ]:


model.set_train(False)
example = next(validation_dataset.create_dict_iterator(output_numpy=True))

print(example['text_label'])
inputs = tokenizer(example['text_label'], return_tensors="ms")
print(inputs)

with mindspore._no_grad():
    outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)
print(outputs)
print(tokenizer.batch_decode(outputs.asnumpy(), skip_special_tokens=True))

llm/peft/lora/roberta_sequence_classification.ipynb

Lines changed: 0 additions & 18 deletions
@@ -1,23 +1,5 @@
 {
  "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "7228a58b-4f81-4f5d-ac6c-d9439b3f4447",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "env: HF_ENDPOINT=https://hf-mirror.com\n"
-     ]
-    }
-   ],
-   "source": [
-    "%env HF_ENDPOINT=https://hf-mirror.com"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": 2,
(file path hidden in the commit view)

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
import argparse
import os

import mindtorch
from mindtorch.optim import AdamW
from mindtorch.utils.data import DataLoader
# import mindnlp

from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    LoraConfig,
    PeftType,
    PrefixTuningConfig,
    PromptEncoderConfig,
)

import evaluate
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
from tqdm import tqdm

batch_size = 32
model_name_or_path = "roberta-large"
task = "mrpc"
peft_type = PeftType.LORA
device = "npu"  # "cuda"
num_epochs = 20

peft_config = LoraConfig(task_type="SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)
lr = 3e-4

if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
    padding_side = "left"
else:
    padding_side = "right"
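# Note: decoder-only models (GPT, OPT, BLOOM) are normally padded on the left so that
# the real tokens sit at the end of each sequence; encoder models such as RoBERTa keep
# the default right padding.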

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
if getattr(tokenizer, "pad_token_id") is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

datasets = load_dataset("glue", task)
metric = evaluate.load("glue", task)


def tokenize_function(examples):
    # max_length=None => use the model max length (it's actually the default)
    outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
    return outputs


tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["idx", "sentence1", "sentence2"],
)

# We also rename the 'label' column to 'labels', which is the column name expected by
# models of the transformers library.
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")


def collate_fn(examples):
    return tokenizer.pad(examples, padding="longest", return_tensors="pt")


# Instantiate dataloaders.
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size
)
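# Note: padding="longest" in collate_fn pads each batch only to the longest sequence
# in that batch (dynamic padding), rather than to a fixed max_length.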

model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
model
