Skip to content

Commit d37233e

Browse files
[QEff. Finetuning]: Enhance test cases to match intermediate step level loss/metrics (#531)
Enable test cases for Intermediate step level loss/metric matching in single and DDP set up. Nested dictionary structure for mapping the reference losses at different test scenarios. The test scenarios with the ref values are listed in a separate reference file. The test scenarios at present include single device testing for below models: Llama, Bert on Alpaca and GSM8k dataset. **REFERENCE DATA based on SDK - 1.21.0.23** --------- Signed-off-by: Ann Kuruvilla <[email protected]> Signed-off-by: Ann Kuruvilla <[email protected]>
1 parent ccc1923 commit d37233e

File tree

4 files changed

+367
-26
lines changed

4 files changed

+367
-26
lines changed

QEfficient/finetune/utils/train_utils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,6 @@ def train(
355355
logger.log_rank_zero(
356356
f"Epoch {epoch + 1}: Train epoch loss: {train_epoch_loss:.4f}, Train metric: {train_epoch_metric:.4f}, Epoch time {epoch_end_time:.2f} sec"
357357
)
358-
359358
# Saving the results every epoch to plot later
360359
if train_config.save_metrics:
361360
save_to_json(
@@ -374,9 +373,14 @@ def train(
374373

375374
results["last_epoch_train_loss"] = train_epoch_loss.cpu()
376375
results["last_epoch_train_metric"] = train_epoch_metric.cpu()
376+
results["train_step_loss"] = train_step_loss
377+
results["train_step_metric"] = train_step_metric
378+
377379
if train_config.run_validation:
378380
results["last_epoch_eval_loss"] = eval_epoch_loss.cpu()
379381
results["last_epoch_eval_metric"] = eval_epoch_metric.cpu()
382+
results["eval_step_loss"] = eval_step_loss
383+
results["eval_step_metric"] = eval_step_metric
380384
results["avg_epoch_time"] = avg_epoch_time
381385
results["avg_checkpoint_time"] = avg_checkpoint_time
382386
if train_config.save_metrics:

tests/finetune/constants.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# -----------------------------------------------------------------------------
2+
#
3+
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4+
# SPDX-License-Identifier: BSD-3-Clause
5+
#
6+
# -----------------------------------------------------------------------------
7+
8+
# Finetuning Test Constants
9+
LOSS_ATOL = 1e-3
10+
METRIC_ATOL = 1e-3

tests/finetune/reference_data.py

Lines changed: 236 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,236 @@
1+
# -----------------------------------------------------------------------------
2+
#
3+
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4+
# SPDX-License-Identifier: BSD-3-Clause
5+
#
6+
# -----------------------------------------------------------------------------
7+
8+
"""Reference data for the finetune tests from SDK version - 1.21.0.23"""
9+
10+
# A dictionary to hold all reference data for all test sets.
11+
REFERENCE_DATA = {
12+
# Scenario 1: Single-device llama 3.2-1B training on Alpaca dataset.
13+
"llama_3.2_1B_config_alpaca_single_device": {
14+
"description": "Baseline for Llama on Alpaca single-device",
15+
"train_step_losses": [
16+
1.5112206935882568,
17+
1.2211230993270874,
18+
1.9942185878753662,
19+
2.093623161315918,
20+
0.9168124198913574,
21+
1.2125635147094727,
22+
0.3648962676525116,
23+
1.6231939792633057,
24+
0.8259601593017578,
25+
0.7741442918777466,
26+
1.7359141111373901,
27+
2.118462085723877,
28+
2.061161994934082,
29+
0.8256913423538208,
30+
0.8088029623031616,
31+
1.761340618133545,
32+
1.6828027963638306,
33+
1.3538823127746582,
34+
2.0672550201416016,
35+
3.1532647609710693,
36+
],
37+
"eval_step_losses": [
38+
1.462059736251831,
39+
0.24527676403522491,
40+
1.046107292175293,
41+
1.6403586864471436,
42+
1.395291805267334,
43+
2.8664817810058594,
44+
1.035412311553955,
45+
1.8670039176940918,
46+
3.8079662322998047,
47+
0.6516809463500977,
48+
],
49+
"train_step_metrics": [
50+
4.532259941101074,
51+
3.390994071960449,
52+
7.34645938873291,
53+
8.114261627197266,
54+
2.5013046264648438,
55+
3.3620924949645996,
56+
1.4403645992279053,
57+
5.069255828857422,
58+
2.2840728759765625,
59+
2.1687355041503906,
60+
5.674112319946289,
61+
8.318334579467773,
62+
7.855090141296387,
63+
2.283458948135376,
64+
2.2452187538146973,
65+
5.820234775543213,
66+
5.380615711212158,
67+
3.872429847717285,
68+
7.903097629547119,
69+
23.412376403808594,
70+
],
71+
"eval_step_metrics": [ # steps 0-9
72+
4.31483793258667,
73+
1.2779749631881714,
74+
2.8465487957000732,
75+
5.157018661499023,
76+
4.036152362823486,
77+
17.575077056884766,
78+
2.816267251968384,
79+
6.468885898590088,
80+
45.05870819091797,
81+
1.9187631607055664,
82+
],
83+
},
84+
# Scenario 2: Single-device llama 3.2-1B training on GSM8k dataset.
85+
"llama_3.2_1B_config_gsm8k_single_device": {
86+
"description": "Baseline for Llama on GSM8k single-device",
87+
"train_step_losses": [
88+
2.250276803970337,
89+
2.3231687545776367,
90+
1.9379945993423462,
91+
1.5981022119522095,
92+
1.9867562055587769,
93+
1.4573354721069336,
94+
1.8969658613204956,
95+
1.2177824974060059,
96+
1.6489791870117188,
97+
1.5380687713623047,
98+
1.4025083780288696,
99+
1.5301083326339722,
100+
1.6858205795288086,
101+
1.383747935295105,
102+
1.7968919277191162,
103+
1.4075607061386108,
104+
1.6447738409042358,
105+
1.2807793617248535,
106+
0.8450672030448914,
107+
1.5795941352844238,
108+
],
109+
"eval_step_losses": [
110+
1.7081595659255981,
111+
1.719305157661438,
112+
1.153528094291687,
113+
2.0051634311676025,
114+
1.3372926712036133,
115+
1.3009852170944214,
116+
1.2207027673721313,
117+
1.3452664613723755,
118+
1.329830288887024,
119+
1.307450532913208,
120+
],
121+
"train_step_metrics": [
122+
9.490362167358398,
123+
10.207969665527344,
124+
6.944809913635254,
125+
4.943641662597656,
126+
7.291841506958008,
127+
4.294501304626465,
128+
6.6656389236450195,
129+
3.3796849250793457,
130+
5.201667308807373,
131+
4.655590534210205,
132+
4.065384864807129,
133+
4.618677139282227,
134+
5.396877765655518,
135+
3.989826202392578,
136+
6.030873775482178,
137+
4.0859761238098145,
138+
5.179838180541992,
139+
3.5994436740875244,
140+
2.328134298324585,
141+
4.852985858917236,
142+
],
143+
"eval_step_metrics": [ # steps 0-9
144+
5.518795013427734,
145+
5.580649375915527,
146+
3.1693549156188965,
147+
7.42730712890625,
148+
3.8087174892425537,
149+
3.672913074493408,
150+
3.38956880569458,
151+
3.8392088413238525,
152+
3.7804012298583984,
153+
3.6967368125915527,
154+
],
155+
},
156+
# Scenario 3: Single-device google-bert/bert-base-uncased training on IMDB dataset.
157+
"bert_base_uncased_config_imdb_single_device": {
158+
"description": "Baseline for google-bert/bert-base-uncased on IMDB single-device",
159+
"train_step_losses": [
160+
0.357421875,
161+
0.546875,
162+
0.98486328125,
163+
0.35302734375,
164+
1.23828125,
165+
0.60791015625,
166+
0.44384765625,
167+
0.791015625,
168+
0.7861328125,
169+
0.51318359375,
170+
0.50244140625,
171+
0.90087890625,
172+
0.8818359375,
173+
0.86279296875,
174+
0.6396484375,
175+
0.49267578125,
176+
0.97119140625,
177+
0.7451171875,
178+
0.798828125,
179+
0.7080078125,
180+
],
181+
"eval_step_losses": [
182+
0.634765625,
183+
0.8173828125,
184+
0.9072265625,
185+
0.7177734375,
186+
0.59423828125,
187+
0.69921875,
188+
0.7109375,
189+
0.7216796875,
190+
0.6064453125,
191+
0.7041015625,
192+
],
193+
"train_step_metrics": [
194+
1.0,
195+
1.0,
196+
0.5,
197+
0.5,
198+
0.5,
199+
0.5,
200+
0.5,
201+
0.5,
202+
0.5,
203+
0.5,
204+
0.5,
205+
0.5,
206+
0.5,
207+
0.5,
208+
0.5,
209+
0.5,
210+
0.5,
211+
0.5,
212+
0.449951171875,
213+
0.4091796875,
214+
],
215+
"eval_step_metrics": [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0],
216+
},
217+
# Scenario 4: Distributed google-bert/bert-base-uncased training (world_size=2)
218+
"bert_base_uncased_config_imdb_distributed_ws2": {
219+
"description": "Baseline for distributed training with 2 devices",
220+
"world_size": 2,
221+
"rank_data": {
222+
0: { # Data for Rank 0
223+
"train_step_losses": [],
224+
"eval_step_losses": [],
225+
"train_step_metrics": [],
226+
"eval_step_metrics": [],
227+
},
228+
1: { # Data for Rank 1
229+
"train_step_losses": [],
230+
"eval_step_losses": [],
231+
"train_step_metrics": [],
232+
"eval_step_metrics": [],
233+
},
234+
},
235+
},
236+
}

0 commit comments

Comments
 (0)