-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathzero1.yaml
More file actions
55 lines (45 loc) · 1.63 KB
/
zero1.yaml
File metadata and controls
55 lines (45 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
zero_optimization:
  stage: 1 # ZeRO-1: partition optimizer states across data-parallel ranks
  reduce_bucket_size: auto # number of elements grouped per reduction bucket ("auto" lets DeepSpeed pick)
  reduce_scatter: true # use fused reduce + scatter collectives for gradient averaging
  contiguous_gradients: true # allocate gradients in one contiguous buffer (reduces fragmentation)
  overlap_comm: true # overlap gradient communication with backward computation
  offload_optimizer: # "none" disables offloading — optimizer states stay on GPU
    device: "none"
  offload_param: # parameter offload likewise disabled
    device: "none"
fp16:
  enabled: false # fp16 disabled; bf16 is used instead (see bf16 section below)
  auto_cast: false
  loss_scale: 0 # dynamic loss scaling (0 = automatic)
  loss_scale_window: 1000 # number of good steps before increasing the scale again
  hysteresis: 2 # number of overflows to wait before decreasing the loss scale
  consecutive_hysteresis: true # if true, hysteresis must be consecutive (i.e., N overflows in a row)
  min_loss_scale: 1 # minimum loss scale to use
bf16:
  enabled: true
checkpoint:
  load_universal: false # whether to load a universal checkpoint
tensorboard:
  enabled: true
  output_path: ${loggers.logdir}
  job_name: ${experiment_name}
csv_monitor:
  enabled: true
  output_path: ${loggers.logdir}
  job_name: ${experiment_name}
wandb:
  enabled: true
  project: ${wandb_project}
flops_profiler:
  enabled: true
  profile_step: 1 # profile on the first step only
  module_depth: -1 # -1 = profile modules at all depths
  top_modules: 1 # report only the top module by FLOPs
  detailed: true
  output_file: ${loggers.logdir}/${experiment_name}/flops_profiler.log
gradient_clipping: 0.5 # was ".5" — leading zero added for clarity (same value)
steps_per_print: 100
gradient_accumulation_steps: 1 # number of steps to accumulate gradients before an optimizer update
train_batch_size: ${clusters.batch_size} # total batch size across all gpus
zero_allow_untested_optimizer: true # allow optimizers DeepSpeed has not validated with ZeRO