
Commit d2ff7e8

Merge remote-tracking branch 'origin' into omosafi/add-esm2-TE-to-HF

2 parents 3f8bcf6 + b877715

14 files changed: +195 -80 lines changed

.github/workflows/convergence-tests.yml

Lines changed: 26 additions & 21 deletions
@@ -11,29 +11,30 @@ on:
         options:
           - h100-sxm
           - h200
-      branch:
-        description: "Branch to use (ignored if commit SHA is provided)"
-        required: true
-        default: "main"
-        type: string
-      commit_sha:
-        description: "Commit SHA (optional - overrides branch if provided)"
-        required: false
-        type: string
       model_config:
         description: "Model configuration to use"
         required: true
-        default: "esm2_accelerate"
+        default: "esm2_native_te"
         type: choice
         options:
           - esm2_accelerate_te.yaml
           - esm2_native_te
-          - esm2_native_te_mfsdp_thd
           - geneformer_native_te_mfsdp_fp8
       config_override:
         description: "Optional: run only these product configs (CSV). Examples: 10m or 10m,4b. Leave blank to run all."
         required: false
         type: string
+      branch:
+        description: "Branch to use (ignored if commit SHA is provided)"
+        required: true
+        default: "main"
+        type: string
+      commit_sha:
+        description: "Commit SHA (optional - overrides branch if provided)"
+        required: false
+        type: string
+  schedule:
+    - cron: "0 12 * * *" # everyday at 4am PST
 
 jobs:
   submit-lepton-jobs:
@@ -58,30 +59,34 @@ jobs:
       - name: Submit Lepton Jobs
         env:
           LEP_LOGIN_CREDENTIALS: ${{ secrets.LEP_LOGIN_CREDENTIALS }}
+          GPU_TYPE: ${{ github.event.inputs.gpu_type || 'h100-sxm' }}
+          BRANCH: ${{ github.event.inputs.branch || 'main' }}
+          COMMIT_SHA: ${{ github.event.inputs.commit_sha || '' }}
+          MODEL_CONFIG: ${{ github.event.inputs.model_config || 'esm2_native_te' }}
+          CONFIG_OVERRIDE: ${{ github.event.inputs.config_override || '' }}
         run: |
           set -euo pipefail
           lep login -c "$LEP_LOGIN_CREDENTIALS" || true
 
           # Map GPU type to node group
-          if [ "${{ inputs.gpu_type }}" = "h200" ]; then
+          if [ "$GPU_TYPE" = "h200" ]; then
            NODE_GROUP="nv-int-multiteam-nebius-h200-01"
-          elif [ "${{ inputs.gpu_type }}" = "h100-sxm" ]; then
+          elif [ "$GPU_TYPE" = "h100-sxm" ]; then
            NODE_GROUP="yo-bom-lepton-001"
           else
-            echo "Error: Unknown GPU type: ${{ inputs.gpu_type }}"
+            echo "Error: Unknown GPU type: $GPU_TYPE"
            exit 1
           fi
 
          RUN_ONLY_ARGS=""
-          if [ -n "${{ inputs.config_override }}" ]; then
-            # Users can type: 10m or 10m,4b
-            RUN_ONLY_ARGS="+run_only=${{ inputs.config_override }}"
+          if [ -n "$CONFIG_OVERRIDE" ]; then
+            RUN_ONLY_ARGS="+run_only=$CONFIG_OVERRIDE"
           fi
 
           python ci/lepton/model_convergence/scripts/launch_job.py \
-            --config-name recipes/${{ inputs.model_config }} \
+            --config-name "recipes/$MODEL_CONFIG" \
             $RUN_ONLY_ARGS \
-            branch=${{ inputs.branch }} \
-            commit_sha=${{ inputs.commit_sha }} \
+            branch="$BRANCH" \
+            commit_sha=$COMMIT_SHA \
             node_group=$NODE_GROUP \
-            gpu_type=${{ inputs.gpu_type }}
+            gpu_type="$GPU_TYPE"

CODE-REVIEW.md

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-docs/docs/user-guide/contributing/code-review.md
+docs/docs/main/contributing/code-review.md

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-docs/docs/user-guide/contributing/contributing.md
+docs/docs/main/contributing/contributing.md

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2.7rc1
+2.7

ci/benchmarks/partial-conv/amplify_pretrain.yaml

Lines changed: 5 additions & 0 deletions
@@ -1,5 +1,10 @@
 scope: partial-conv
 time_limit: 14400
+# enable this when the data issue problem is fixed
+# artifacts:
+#   # Artifact data mount paths for script execution, specified as mount_path: artifact_tag pairs.
+#   # See Confluence Onboarding Guide section 5.4 for more details on locating this data.
+#   /root/.cache/huggingface/datasets/chandar-lab___ur100_p: text/ur100p/processed/25_04
 key_segments:
   # Modify keys to be renamed (str) or excluded (False) from run identifier. By default, all args under script_args are included.
   num_layers: False

ci/benchmarks/partial-conv/esm2_pretrain.yaml

Lines changed: 5 additions & 1 deletion
@@ -1,5 +1,9 @@
 scope: partial-conv
 time_limit: 14400
+artifacts:
+  # Artifact data mount paths for script execution, specified as mount_path: artifact_tag pairs.
+  # See Confluence Onboarding Guide section 5.4 for more details on locating this data.
+  /data-jetart/20240809_uniref_2024_03/: text/uniprot/2024_03/processed-esm2-2024-10-uncompressed
 key_segments:
   # Modify keys to be renamed (str) or excluded (False) from run identifier. By default, all args under script_args are included.
   data_path: False
@@ -9,7 +13,7 @@ script_args:
   # Arguments not referenced in the script string must have the 'arg' field specified.
   # See jet/core/configs.py for the specification of the configuration class
   workspace: /workspace/bionemo2
-  data_path: /data/20240809_uniref_2024_03/data
+  data_path: /data-jetart/20240809_uniref_2024_03/
   model: esm2
   variant: train
   config_name: 650M

ci/benchmarks/partial-conv/evo2_pretrain.yaml

Lines changed: 5 additions & 0 deletions
@@ -1,5 +1,10 @@
 scope: partial-conv
 time_limit: 14400
+# artifacts:
+#   # Artifact data mount paths for script execution, specified as mount_path: artifact_tag pairs.
+#   # See Confluence Onboarding Guide section 5.4 for more details on locating this data.
+#   # Needs update of script_args.data_path: /data-jetart/evo2. Cannot be enabled since Evo2 does not work with read-only folders as data mount.
+#   /data-jetart/evo2/data : text/opengenome2/processed/2025-01
 key_segments:
   # Modify keys to be renamed (str) or excluded (False) from run identifier. By default, all args under script_args are included.
   data_path: False

ci/benchmarks/partial-conv/geneformer_pretrain.yaml

Lines changed: 4 additions & 2 deletions
@@ -1,5 +1,9 @@
 scope: partial-conv
 time_limit: 14400
+# artifacts:
+#   # Artifact data mount paths for script execution, specified as mount_path: artifact_tag pairs.
+#   # See Confluence Onboarding Guide section 5.4 for more details on locating this data.
+#   /data-jetart/cellxgene_scdl : tabular/cellxgene/processed/scdl-2025-06-20
 key_segments:
   # Modify keys to be renamed (str) or excluded (False) from run identifier. By default, all args under script_args are included.
   data_path: False
@@ -28,8 +32,6 @@ script: |-
   # Copying data to shared memory filesystem in Linux that provides a RAM-based temporary storage location
   COPY_FLAG="/tmp/copy_done_${{SLURMD_NODENAME}}";
   NEW_DATA_PATH="/dev/shm/data_path_${{SLURMD_NODENAME}}";
-  COPY_FLAG="/tmp/copy_done_${{SLURMD_NODENAME}}";
-  NEW_DATA_PATH="/dev/shm/data_path_${{SLURMD_NODENAME}}";
   if [ "$SLURM_LOCALID" = "0" ]; then
     df -h /dev/shm
     echo $NEW_DATA_PATH;
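The two deleted lines were verbatim duplicates of the `COPY_FLAG`/`NEW_DATA_PATH` assignments immediately before them. The surrounding script implements a stage-once-per-node pattern: local rank 0 copies the dataset into the RAM-backed `/dev/shm` and touches a flag file, and the remaining ranks on that node wait for the flag. A minimal sketch of the pattern in plain shell (single-brace `${VAR}`, since the doubled braces in the YAML are template escaping; `DATA_SRC` and the copy command are assumptions, not taken from this hunk):

    # Stage a dataset into node-local /dev/shm exactly once per node.
    DATA_SRC="/data/cellxgene_scdl"                  # hypothetical source path
    COPY_FLAG="/tmp/copy_done_${SLURMD_NODENAME}"
    NEW_DATA_PATH="/dev/shm/data_path_${SLURMD_NODENAME}"

    if [ "$SLURM_LOCALID" = "0" ]; then
      mkdir -p "$NEW_DATA_PATH"
      cp -r "$DATA_SRC"/. "$NEW_DATA_PATH"           # tmpfs copy counts against node RAM
      touch "$COPY_FLAG"                             # signal the other local ranks
    else
      until [ -f "$COPY_FLAG" ]; do sleep 5; done    # wait for rank 0 to finish
    fi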

ci/lepton/model_convergence/configs/base.yaml

Lines changed: 0 additions & 2 deletions
@@ -1,5 +1,3 @@
-node_group_name: nv-int-multiteam-nebius-h200-01
-
 container:
   image: nvcr.io/nvidia/pytorch:25.06-py3
   registry_auth: lepton-nvidia

ci/lepton/model_convergence/configs/recipes/esm2_accelerate_te.yaml

Lines changed: 55 additions & 14 deletions
@@ -3,47 +3,88 @@ defaults:
   - /base
   - _self_
 
-# lepton info
+branch: jwilber/add-accelerate-l1-3b-config
+
+############################################################
+# lepton job info
+############################################################
 node_group: yo-bom-lepton-001
 num_nodes: 2
 device_type: gpu
-num_devices: 2
+num_devices: 8
 gpu_type: h100-sxm
-total_gpus: ${multiply:${num_devices},${num_nodes}}
 resource_shape: "${device_type}.${num_devices}x${gpu_type}"
 
+############################################################
 # recipe identifiers
+# mostly used for logging and observability
+############################################################
 recipe_subdir: esm2_accelerate_te
 model_type: esm2
+variant: train # train, finetune
+
+# Core identifiers for filtering
+framework: native # native, accelerate
+parallelism_strategy: fsdp2 # ddp, fsdp2, mfsdp
+precision: fp8 # likely bf16 or fp8
+te_enabled: true
+fp8_enabled: true
+
+# Catchall for additional features/configs
+extras: [] # e.g. [thd]
+
+############################################################
+# wandb info (total_gpus used for group name)
+############################################################
+# `total_gpus` calculated from lepton job info above
+total_gpus: ${multiply:${num_devices},${num_nodes}}
 
-# wandb
 wandb_init_args:
   project: "test_convergence__recipes__${sanitize:${branch}}"
-  group: "${model_type}__${task_cmd}__${total_gpus}__${sanitize:${gpu_type}}"
+  group: "${model_type}__${task_cmd}__${total_gpus}gpus__${sanitize:${gpu_type}}"
   job_type: "${recipe_subdir}"
   name: null
 
+############################################################
+# task commands
+# shared across all products (if not explicitly overridden)
+############################################################
+# task_cmd: train_fsdp2 # mfsdp
+task_cmd: train
+
+# script overrides
+# these should match the keys in the recipe's config file
+# model_tag: nvidia/esm2_t36_3B_UR50D
+
+micro_batch_size: 4
+# num_warmup_steps: 20_000
 # config overrides
 trainer:
   report_to: "wandb"
 
-# train specific commands
-task_cmd: train
-stop_after_n_steps: 10
+stop_after_n_steps: 100
 
-# configs to run
+############################################################
+# Each product is a different config to run, alongside
+# config-specific arguments. Must have a `wandb_name`.
+############################################################
 products:
-  - config: L0_sanity
+  - config: L1_3B
+    acc_config: default
     wandb_name: "${config}__${now:%Y%m%d-%H%M%S}__${gitsha:}"
 
-# training script to run
+############################################################
+# run script
+# This gets called right after `checkout_script` in the base config.
+############################################################
 run_script: |
-  accelerate launch --config_file accelerate_config/default.yaml \
+  accelerate launch --config_file accelerate_config/${acc_config}.yaml \
     ${task_cmd}.py \
     --config-name=${config} \
     stop_after_n_steps=${stop_after_n_steps} \
-    wandb_init_args.mode=${wandb_init_args.mode} \
+    +wandb_init_args.mode=${wandb_init_args.mode} \
     +wandb_init_args.project=${wandb_init_args.project} \
     +wandb_init_args.group=${wandb_init_args.group} \
     +wandb_init_args.job_type=${wandb_init_args.job_type} \
-    wandb_init_args.name=${wandb_name}
+    wandb_init_args.name=${wandb_name} \
+    trainer.per_device_train_batch_size=${micro_batch_size}
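The recipe relies on the interpolation resolvers `${multiply:...}` and `${sanitize:...}`, which are not OmegaConf built-ins, so the launcher must register them before composing the config. A sketch of plausible registrations (the resolver names and call sites come from the recipe; the bodies are assumptions, not the launcher's actual implementation):

    import re

    from omegaconf import OmegaConf

    # Assumed implementations matching the recipe's usage:
    #   total_gpus: ${multiply:${num_devices},${num_nodes}}
    #   project: "test_convergence__recipes__${sanitize:${branch}}"
    OmegaConf.register_new_resolver("multiply", lambda a, b: int(a) * int(b))
    OmegaConf.register_new_resolver(
        "sanitize", lambda s: re.sub(r"[^A-Za-z0-9_]+", "-", str(s))
    )

    cfg = OmegaConf.create({
        "num_devices": 8,
        "num_nodes": 2,
        "total_gpus": "${multiply:${num_devices},${num_nodes}}",
        "branch": "jwilber/add-accelerate-l1-3b-config",
        "project": "test_convergence__recipes__${sanitize:${branch}}",
    })
    print(cfg.total_gpus)  # 16
    print(cfg.project)     # test_convergence__recipes__jwilber-add-accelerate-l1-3b-config

On the `run_script` side, the overrides look like Hydra command-line syntax: a bare `key=value` overrides a key the target config already defines, while `+key=value` appends one it does not, which is presumably why `wandb_init_args.mode` gained a `+` in this commit.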
