File tree Expand file tree Collapse file tree 2 files changed +3
-3
lines changed Expand file tree Collapse file tree 2 files changed +3
-3
lines changed Original file line number Diff line number Diff line change 9
9
N_NODE=4
10
10
N_GPU_PER_NODE=8
11
11
12
- # You need to export $RANK , $MASTER_ADDR, $MASTER_PORT automatically for each Node.
12
+ # You need to export $MACHINE_RANK, $MASTER_ADDR, and $MASTER_PORT on each node (ideally automatically).
13
13
14
14
# config path
15
15
CONFIG=" configs/xxx_train_config.json"
@@ -37,7 +37,7 @@ accelerate launch \
37
37
--mixed_precision ' bf16' \
38
38
--dynamo_backend ' no' \
39
39
--same_network \
40
- --machine_rank $RANK \
40
+ --machine_rank $MACHINE_RANK \
41
41
--main_process_ip $MASTER_ADDR \
42
42
--main_process_port $MASTER_PORT \
43
43
--rdzv_backend ' static' \
Original file line number Diff line number Diff line change @@ -431,7 +431,7 @@ def accelerate_train(self):
431
431
# Training Loop!
432
432
for epoch in range (starting_epoch , self .args .num_train_epochs ):
433
433
# set_epoch
434
- self .train_dataloader .set_epoch (epoch )
434
+ # self.train_dataloader.set_epoch(epoch)
435
435
436
436
# if we early stop by some ckpts not converging
437
437
if self .args .early_stopping and stall_num == self .args .early_stopping_stall_num :
You can’t perform that action at this time.
0 commit comments