-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathzero1.yaml
More file actions
55 lines (45 loc) · 1.63 KB
/
zero1.yaml
File metadata and controls
55 lines (45 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
zero_optimization:
  stage: 1 # ZeRO-1: partition optimizer states across data-parallel ranks
  reduce_bucket_size: auto # number of elements grouped per reduction bucket ("auto" lets DeepSpeed pick)
  reduce_scatter: true # use fused reduce + scatter collectives for gradient averaging
  contiguous_gradients: true # allocate gradients in one contiguous buffer (reduces fragmentation)
  overlap_comm: true # overlap gradient communication with backward computation
  offload_optimizer: # "none" disables offloading — optimizer states stay on GPU
    device: "none"
  offload_param: # parameter offload likewise disabled
    device: "none"
fp16:
  enabled: false # fp16 disabled; bf16 is used instead (see bf16 section below)
  auto_cast: false
  loss_scale: 0 # dynamic loss scaling (0 = automatic)
  loss_scale_window: 1000 # number of good steps before increasing the scale again
  hysteresis: 2 # number of overflows to wait before decreasing the loss scale
  consecutive_hysteresis: true # if true, hysteresis must be consecutive (i.e., N overflows in a row)
  min_loss_scale: 1 # minimum loss scale to use
bf16:
  enabled: true
checkpoint:
  load_universal: false # whether to load a universal checkpoint
tensorboard:
  enabled: true
  output_path: ${loggers.logdir}
  job_name: ${experiment_name}
csv_monitor:
  enabled: true
  output_path: ${loggers.logdir}
  job_name: ${experiment_name}
wandb:
  enabled: true
  project: ${wandb_project}
flops_profiler:
  enabled: true
  profile_step: 1 # profile on the first step only
  module_depth: -1 # -1 = profile modules at all depths
  top_modules: 1 # report only the top module by FLOPs
  detailed: true
  output_file: ${loggers.logdir}/${experiment_name}/flops_profiler.log
gradient_clipping: 0.5 # was ".5" — leading zero added for clarity (same value)
steps_per_print: 100
gradient_accumulation_steps: 1 # number of steps to accumulate gradients before an optimizer update
train_batch_size: ${clusters.batch_size} # total batch size across all gpus
zero_allow_untested_optimizer: true # allow optimizers DeepSpeed has not validated with ZeRO