
Commit d2ff7e8

Merge remote-tracking branch 'origin' into omosafi/add-esm2-TE-to-HF

2 parents 3f8bcf6 + b877715

14 files changed: +195 -80 lines changed

.github/workflows/convergence-tests.yml

Lines changed: 26 additions & 21 deletions
@@ -11,29 +11,30 @@ on:
         options:
           - h100-sxm
           - h200
-      branch:
-        description: "Branch to use (ignored if commit SHA is provided)"
-        required: true
-        default: "main"
-        type: string
-      commit_sha:
-        description: "Commit SHA (optional - overrides branch if provided)"
-        required: false
-        type: string
       model_config:
         description: "Model configuration to use"
         required: true
-        default: "esm2_accelerate"
+        default: "esm2_native_te"
         type: choice
         options:
           - esm2_accelerate_te.yaml
           - esm2_native_te
-          - esm2_native_te_mfsdp_thd
           - geneformer_native_te_mfsdp_fp8
       config_override:
         description: "Optional: run only these product configs (CSV). Examples: 10m or 10m,4b. Leave blank to run all."
         required: false
         type: string
+      branch:
+        description: "Branch to use (ignored if commit SHA is provided)"
+        required: true
+        default: "main"
+        type: string
+      commit_sha:
+        description: "Commit SHA (optional - overrides branch if provided)"
+        required: false
+        type: string
+  schedule:
+    - cron: "0 12 * * *" # everyday at 4am PST
 
 jobs:
   submit-lepton-jobs:
@@ -58,30 +59,34 @@ jobs:
       - name: Submit Lepton Jobs
         env:
           LEP_LOGIN_CREDENTIALS: ${{ secrets.LEP_LOGIN_CREDENTIALS }}
+          GPU_TYPE: ${{ github.event.inputs.gpu_type || 'h100-sxm' }}
+          BRANCH: ${{ github.event.inputs.branch || 'main' }}
+          COMMIT_SHA: ${{ github.event.inputs.commit_sha || '' }}
+          MODEL_CONFIG: ${{ github.event.inputs.model_config || 'esm2_native_te' }}
+          CONFIG_OVERRIDE: ${{ github.event.inputs.config_override || '' }}
         run: |
           set -euo pipefail
           lep login -c "$LEP_LOGIN_CREDENTIALS" || true
 
           # Map GPU type to node group
-          if [ "${{ inputs.gpu_type }}" = "h200" ]; then
+          if [ "$GPU_TYPE" = "h200" ]; then
            NODE_GROUP="nv-int-multiteam-nebius-h200-01"
-          elif [ "${{ inputs.gpu_type }}" = "h100-sxm" ]; then
+          elif [ "$GPU_TYPE" = "h100-sxm" ]; then
            NODE_GROUP="yo-bom-lepton-001"
           else
-            echo "Error: Unknown GPU type: ${{ inputs.gpu_type }}"
+            echo "Error: Unknown GPU type: $GPU_TYPE"
            exit 1
           fi
 
          RUN_ONLY_ARGS=""
-          if [ -n "${{ inputs.config_override }}" ]; then
-            # Users can type: 10m or 10m,4b
-            RUN_ONLY_ARGS="+run_only=${{ inputs.config_override }}"
+          if [ -n "$CONFIG_OVERRIDE" ]; then
+            RUN_ONLY_ARGS="+run_only=$CONFIG_OVERRIDE"
           fi
 
           python ci/lepton/model_convergence/scripts/launch_job.py \
-            --config-name recipes/${{ inputs.model_config }} \
+            --config-name "recipes/$MODEL_CONFIG" \
             $RUN_ONLY_ARGS \
-            branch=${{ inputs.branch }} \
-            commit_sha=${{ inputs.commit_sha }} \
+            branch="$BRANCH" \
+            commit_sha=$COMMIT_SHA \
             node_group=$NODE_GROUP \
-            gpu_type=${{ inputs.gpu_type }}
+            gpu_type="$GPU_TYPE"

CODE-REVIEW.md

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-docs/docs/user-guide/contributing/code-review.md
+docs/docs/main/contributing/code-review.md

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-docs/docs/user-guide/contributing/contributing.md
+docs/docs/main/contributing/contributing.md

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2.7rc1
+2.7

ci/benchmarks/partial-conv/amplify_pretrain.yaml

Lines changed: 5 additions & 0 deletions
@@ -1,5 +1,10 @@
 scope: partial-conv
 time_limit: 14400
+# enable this when the data issue problem is fixed
+# artifacts:
+#   # Artifact data mount paths for script execution, specified as mount_path: artifact_tag pairs.
+#   # See Confluence Onboarding Guide section 5.4 for more details on locating this data.
+#   /root/.cache/huggingface/datasets/chandar-lab___ur100_p: text/ur100p/processed/25_04
 key_segments:
   # Modify keys to be renamed (str) or excluded (False) from run identifier. By default, all args under script_args are included.
   num_layers: False

ci/benchmarks/partial-conv/esm2_pretrain.yaml

Lines changed: 5 additions & 1 deletion
@@ -1,5 +1,9 @@
 scope: partial-conv
 time_limit: 14400
+artifacts:
+  # Artifact data mount paths for script execution, specified as mount_path: artifact_tag pairs.
+  # See Confluence Onboarding Guide section 5.4 for more details on locating this data.
+  /data-jetart/20240809_uniref_2024_03/: text/uniprot/2024_03/processed-esm2-2024-10-uncompressed
 key_segments:
   # Modify keys to be renamed (str) or excluded (False) from run identifier. By default, all args under script_args are included.
   data_path: False
@@ -9,7 +13,7 @@ script_args:
   # Arguments not referenced in the script string must have the 'arg' field specified.
   # See jet/core/configs.py for the specification of the configuration class
   workspace: /workspace/bionemo2
-  data_path: /data/20240809_uniref_2024_03/data
+  data_path: /data-jetart/20240809_uniref_2024_03/
   model: esm2
   variant: train
   config_name: 650M

ci/benchmarks/partial-conv/evo2_pretrain.yaml

Lines changed: 5 additions & 0 deletions
@@ -1,5 +1,10 @@
 scope: partial-conv
 time_limit: 14400
+# artifacts:
+#   # Artifact data mount paths for script execution, specified as mount_path: artifact_tag pairs.
+#   # See Confluence Onboarding Guide section 5.4 for more details on locating this data.
+#   # Needs update of script_args.data_path: /data-jetart/evo2. Cannot be enabled since Evo2 does not work with read-only folders as data mount.
+#   /data-jetart/evo2/data : text/opengenome2/processed/2025-01
 key_segments:
   # Modify keys to be renamed (str) or excluded (False) from run identifier. By default, all args under script_args are included.
   data_path: False

ci/benchmarks/partial-conv/geneformer_pretrain.yaml

Lines changed: 4 additions & 2 deletions
@@ -1,5 +1,9 @@
 scope: partial-conv
 time_limit: 14400
+# artifacts:
+#   # Artifact data mount paths for script execution, specified as mount_path: artifact_tag pairs.
+#   # See Confluence Onboarding Guide section 5.4 for more details on locating this data.
+#   /data-jetart/cellxgene_scdl : tabular/cellxgene/processed/scdl-2025-06-20
 key_segments:
   # Modify keys to be renamed (str) or excluded (False) from run identifier. By default, all args under script_args are included.
   data_path: False
@@ -28,8 +32,6 @@ script: |-
   # Copying data to shared memory filesystem in Linux that provides a RAM-based temporary storage location
   COPY_FLAG="/tmp/copy_done_${{SLURMD_NODENAME}}";
   NEW_DATA_PATH="/dev/shm/data_path_${{SLURMD_NODENAME}}";
-  COPY_FLAG="/tmp/copy_done_${{SLURMD_NODENAME}}";
-  NEW_DATA_PATH="/dev/shm/data_path_${{SLURMD_NODENAME}}";
   if [ "$SLURM_LOCALID" = "0" ]; then
     df -h /dev/shm
     echo $NEW_DATA_PATH;
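The two deleted lines were verbatim duplicates of the `COPY_FLAG`/`NEW_DATA_PATH` assignments immediately before them. The surrounding script implements a stage-once-per-node pattern: local rank 0 copies the dataset into the RAM-backed `/dev/shm` and touches a flag file, and the remaining ranks on that node wait for the flag. A minimal sketch of the pattern in plain shell (single-brace `${VAR}`, since the doubled braces in the YAML are template escaping; `DATA_SRC` and the copy command are assumptions, not taken from this hunk):

    # Stage a dataset into node-local /dev/shm exactly once per node.
    DATA_SRC="/data/cellxgene_scdl"                  # hypothetical source path
    COPY_FLAG="/tmp/copy_done_${SLURMD_NODENAME}"
    NEW_DATA_PATH="/dev/shm/data_path_${SLURMD_NODENAME}"

    if [ "$SLURM_LOCALID" = "0" ]; then
      mkdir -p "$NEW_DATA_PATH"
      cp -r "$DATA_SRC"/. "$NEW_DATA_PATH"           # tmpfs copy counts against node RAM
      touch "$COPY_FLAG"                             # signal the other local ranks
    else
      until [ -f "$COPY_FLAG" ]; do sleep 5; done    # wait for rank 0 to finish
    fi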

ci/lepton/model_convergence/configs/base.yaml

Lines changed: 0 additions & 2 deletions
@@ -1,5 +1,3 @@
-node_group_name: nv-int-multiteam-nebius-h200-01
-
 container:
   image: nvcr.io/nvidia/pytorch:25.06-py3
   registry_auth: lepton-nvidia

ci/lepton/model_convergence/configs/recipes/esm2_accelerate_te.yaml

Lines changed: 55 additions & 14 deletions
@@ -3,47 +3,88 @@ defaults:
   - /base
   - _self_
 
-# lepton info
+branch: jwilber/add-accelerate-l1-3b-config
+
+############################################################
+# lepton job info
+############################################################
 node_group: yo-bom-lepton-001
 num_nodes: 2
 device_type: gpu
-num_devices: 2
+num_devices: 8
 gpu_type: h100-sxm
-total_gpus: ${multiply:${num_devices},${num_nodes}}
 resource_shape: "${device_type}.${num_devices}x${gpu_type}"
 
+############################################################
 # recipe identifiers
+# mostly used for logging and observability
+############################################################
 recipe_subdir: esm2_accelerate_te
 model_type: esm2
+variant: train # train, finetune
+
+# Core identifiers for filtering
+framework: native # native, accelerate
+parallelism_strategy: fsdp2 # ddp, fsdp2, mfsdp
+precision: fp8 # likely bf16 or fp8
+te_enabled: true
+fp8_enabled: true
+
+# Catchall for additional features/configs
+extras: [] # e.g. [thd]
+
+############################################################
+# wandb info (total_gpus used for group name)
+############################################################
+# `total_gpus` calculated from lepton job info above
+total_gpus: ${multiply:${num_devices},${num_nodes}}
 
-# wandb
 wandb_init_args:
   project: "test_convergence__recipes__${sanitize:${branch}}"
-  group: "${model_type}__${task_cmd}__${total_gpus}__${sanitize:${gpu_type}}"
+  group: "${model_type}__${task_cmd}__${total_gpus}gpus__${sanitize:${gpu_type}}"
   job_type: "${recipe_subdir}"
   name: null
 
+############################################################
+# task commands
+# shared across all products (if not explicitly overridden)
+############################################################
+# task_cmd: train_fsdp2 # mfsdp
+task_cmd: train
+
+# script overrides
+# these should match the keys in the recipe's config file
+# model_tag: nvidia/esm2_t36_3B_UR50D
+
+micro_batch_size: 4
+# num_warmup_steps: 20_000
 # config overrides
 trainer:
   report_to: "wandb"
 
-# train specific commands
-task_cmd: train
-stop_after_n_steps: 10
+stop_after_n_steps: 100
 
-# configs to run
+############################################################
+# Each product is a different config to run, alongside
+# config-specific arguments. Must have a `wandb_name`.
+############################################################
 products:
-  - config: L0_sanity
+  - config: L1_3B
+    acc_config: default
     wandb_name: "${config}__${now:%Y%m%d-%H%M%S}__${gitsha:}"
 
-# training script to run
+############################################################
+# run script
+# This gets called right after `checkout_script` in the base config.
+############################################################
 run_script: |
-  accelerate launch --config_file accelerate_config/default.yaml \
+  accelerate launch --config_file accelerate_config/${acc_config}.yaml \
     ${task_cmd}.py \
     --config-name=${config} \
     stop_after_n_steps=${stop_after_n_steps} \
-    wandb_init_args.mode=${wandb_init_args.mode} \
+    +wandb_init_args.mode=${wandb_init_args.mode} \
     +wandb_init_args.project=${wandb_init_args.project} \
     +wandb_init_args.group=${wandb_init_args.group} \
     +wandb_init_args.job_type=${wandb_init_args.job_type} \
-    wandb_init_args.name=${wandb_name}
+    wandb_init_args.name=${wandb_name} \
+    trainer.per_device_train_batch_size=${micro_batch_size}
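The recipe relies on the interpolation resolvers `${multiply:...}` and `${sanitize:...}`, which are not OmegaConf built-ins, so the launcher must register them before composing the config. A sketch of plausible registrations (the resolver names and call sites come from the recipe; the bodies are assumptions, not the launcher's actual implementation):

    import re

    from omegaconf import OmegaConf

    # Assumed implementations matching the recipe's usage:
    #   total_gpus: ${multiply:${num_devices},${num_nodes}}
    #   project: "test_convergence__recipes__${sanitize:${branch}}"
    OmegaConf.register_new_resolver("multiply", lambda a, b: int(a) * int(b))
    OmegaConf.register_new_resolver(
        "sanitize", lambda s: re.sub(r"[^A-Za-z0-9_]+", "-", str(s))
    )

    cfg = OmegaConf.create({
        "num_devices": 8,
        "num_nodes": 2,
        "total_gpus": "${multiply:${num_devices},${num_nodes}}",
        "branch": "jwilber/add-accelerate-l1-3b-config",
        "project": "test_convergence__recipes__${sanitize:${branch}}",
    })
    print(cfg.total_gpus)  # 16
    print(cfg.project)     # test_convergence__recipes__jwilber-add-accelerate-l1-3b-config

On the `run_script` side, the overrides look like Hydra command-line syntax: a bare `key=value` overrides a key the target config already defines, while `+key=value` appends one it does not, which is presumably why `wandb_init_args.mode` gained a `+` in this commit.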
