Changes from all commits (30 commits)
95f9b6b
support Megatron-FSDP
conver334 Feb 27, 2026
1c66800
Merge remote-tracking branch 'origin/main' into megatron-fsdp
conver334 Feb 27, 2026
66acb73
add SFT support and CI test
conver334 Mar 4, 2026
f250312
adjust override
conver334 Mar 4, 2026
b360f94
support mfsdp in Megatron-Bridge
conver334 Apr 8, 2026
b3d0def
add user guide for mfsdp
conver334 Apr 9, 2026
aa4c097
update SFT in doc
conver334 Apr 9, 2026
f636701
fix checkpoint and adapted to the latest mcore interface
conver334 Apr 10, 2026
be9b783
fix: restore PEFT adapter loading condition and fix FSDP optimizer l…
yxs Apr 10, 2026
2d31759
Merge upstream/main into megatron-fsdp
yxs Apr 10, 2026
c850595
Merge remote-tracking branch 'origin/main' into megatron-fsdp
conver334 Apr 23, 2026
1503690
remove mbridge mfsdp
conver334 Apr 23, 2026
01fa0c5
[ci] fix: fix CI test for Megatron-FSDP
conver334 Apr 24, 2026
7583f68
[ci] fix: pin Megatron-LM and Megatron-Bridge commits
conver334 Apr 24, 2026
63ea0d6
[ci,docs] chore: bump Megatron-Bridge pin to PR3512
conver334 Apr 27, 2026
fc679e1
[ci] fix: install modelopt and disable ALL_OFFLOAD for Megatron-FSDP
conver334 Apr 27, 2026
c1fa8cd
[ci] fix: drop pkill cleanup before Megatron-FSDP GRPO step
conver334 Apr 27, 2026
8b039f3
[megatron, ckpt] fix: state_dict module. prefix mismatch on non-FSDP …
conver334 Apr 27, 2026
0963dc4
Refine Megatron-FSDP integration
conver334 Apr 28, 2026
5526a42
update doc
conver334 Apr 28, 2026
8b1d2a7
change buffer
conver334 Apr 28, 2026
2832ded
update example
conver334 Apr 28, 2026
9178a21
Merge remote-tracking branch 'origin/main' into megatron-fsdp
conver334 Apr 29, 2026
d19082a
docs: update Megatron-FSDP setup
conver334 Apr 29, 2026
51d9eca
test: update Megatron-FSDP coverage
conver334 Apr 29, 2026
e11c462
Fix Megatron-FSDP HF checkpoint save
conver334 Apr 29, 2026
c4aaa82
revert wrong checkpoint implementation
conver334 Apr 30, 2026
32384be
feat: add Megatron-FSDP DTensor checkpoints
conver334 Apr 30, 2026
d1f9b83
fix: make Megatron-FSDP examples runnable
conver334 Apr 30, 2026
6f18193
style: apply pre-commit formatting
conver334 May 1, 2026
15 changes: 15 additions & 0 deletions .github/workflows/e2e_ppo_trainer_megatron_vllm.yml
@@ -194,6 +194,21 @@ jobs:
ray stop --force
export VLLM_USE_V1=1
ROLLOUT_QUANTIZATION=fp8 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: Install Megatron-LM and Megatron-Bridge for Megatron-FSDP
run: |
pip3 install --no-deps --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@d4cacef87
pip3 install --no-deps --no-build-isolation git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@6fea5bb
pip3 install "nvidia-modelopt[torch]>=0.37.0"
- name: Running GSM8K E2E PPO training tests on 8 L20 GPUs with Megatron-FSDP (Qwen3)
run: |
ray stop --force
ALL_OFFLOAD=False USE_MBRIDGE=True VANILLA_MBRIDGE=False USE_MEGATRON_FSDP=True \
TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B \
COMMON_PP=1 COMMON_VPP=null COMMON_CP=1 COMMON_TP=1 INFER_TP=1 \
bash tests/special_e2e/run_ppo_trainer_megatron.sh \
++actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=False \
++actor_rollout_ref.ref.megatron.override_transformer_config.gradient_accumulation_fusion=False \
++critic.megatron.override_transformer_config.gradient_accumulation_fusion=False
- name: clean up
run: |
rm -rf checkpoints
73 changes: 73 additions & 0 deletions docs/examples/megatron_fsdp_example.rst
@@ -0,0 +1,73 @@
Megatron-FSDP Example
========================

Last updated: 04/29/2026.

Introduction
------------

In this example, we run SFT and RL training with Megatron-FSDP:

- Runtime image: ``verlai/verl:vllm011.dev7``

Step 1: Prepare
--------------------

Download ``Megatron-LM`` and ``Megatron-Bridge``. The required Megatron-FSDP support has already been merged into
``Megatron-LM`` main
(`<https://github.com/NVIDIA/Megatron-LM/pull/3191>`_) and
``Megatron-Bridge`` main
(`<https://github.com/NVIDIA-NeMo/Megatron-Bridge/pull/3512>`_).

.. code:: bash

git clone https://github.com/NVIDIA/Megatron-LM.git
git clone https://github.com/NVIDIA-NeMo/Megatron-Bridge.git
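
If you prefer to install both repositories into the runtime environment instead of (or in addition to) cloning them,
a minimal sketch mirroring the commits pinned in the CI workflow is shown below; the commit hashes simply reflect what
CI currently pins and may need updating for your setup.

.. code:: bash

   pip3 install --no-deps --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@d4cacef87
   pip3 install --no-deps --no-build-isolation git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@6fea5bb
   pip3 install "nvidia-modelopt[torch]>=0.37.0"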

Step 2: Run Megatron-FSDP SFT
------------------------------

Before launching, check and update the key fields ``MODEL_PATH`` and ``SAVE_PATH`` in the script.

.. code:: bash

bash examples/sft/gsm8k/run_qwen_megatron_fsdp.sh
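
The script reads ``MODEL_PATH``, ``SAVE_PATH``, ``TRAIN_FILES``, and ``VAL_FILES`` from environment variables with
defaults, so they can also be overridden at launch time. A minimal sketch with illustrative paths, assuming the GSM8K
SFT parquet files have already been prepared:

.. code:: bash

   MODEL_PATH=Qwen/Qwen2.5-Math-7B \
   SAVE_PATH=$HOME/checkpoints/qwen2.5-math-7b-sft \
   TRAIN_FILES=$HOME/data/gsm8k_sft/train.parquet \
   VAL_FILES=$HOME/data/gsm8k_sft/test.parquet \
   bash examples/sft/gsm8k/run_qwen_megatron_fsdp.sh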

Step 3: Run Megatron-FSDP RL
----------------------------

Before launching, check and update the key fields in
``examples/grpo_trainer/run_qwen2-7b_math_megatron_fsdp.sh``:

- ``actor_rollout_ref.model.path``: model name or local model path.
- ``train_files`` / ``test_files``: parquet paths for GSM8K and MATH.
- ``trainer.n_gpus_per_node`` and ``trainer.nnodes``: hardware topology.
- ``trainer.project_name`` and ``trainer.experiment_name``: experiment identifiers.

Then run:

.. code:: bash

bash examples/grpo_trainer/run_qwen2-7b_math_megatron_fsdp.sh
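
The script reads ``HF_MODEL_PATH``, ``TP``/``GEN_TP``, and the GSM8K/MATH parquet paths from environment variables
with defaults, and forwards extra Hydra overrides via ``"$@"``. A sketch with illustrative values:

.. code:: bash

   HF_MODEL_PATH=Qwen/Qwen2.5-Math-7B \
   TP=2 GEN_TP=2 \
   gsm8k_train_path=$HOME/data/gsm8k/train.parquet \
   gsm8k_test_path=$HOME/data/gsm8k/test.parquet \
   math_train_path=$HOME/data/math/train.parquet \
   math_test_path=$HOME/data/math/test.parquet \
   bash examples/grpo_trainer/run_qwen2-7b_math_megatron_fsdp.sh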

The script launches RL training and enables Megatron-FSDP with:

- ``actor_rollout_ref.actor.megatron.use_mbridge=True``
- ``actor_rollout_ref.actor.megatron.vanilla_mbridge=False``
- ``actor_rollout_ref.actor.megatron.use_megatron_fsdp=True``

Checkpoint Notes
----------------

Megatron-FSDP checkpoints are saved as DTensor checkpoints under ``dist_ckpt``.
When ``checkpoint.save_contents`` includes ``model``, verl also saves the HuggingFace config and
tokenizer under ``huggingface``; HF weights can also be exported through Megatron-Bridge.
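
As a rough sketch of where the files land with the example GRPO script (assuming the default
``checkpoints/<project_name>/<experiment_name>`` output directory and ``trainer.save_freq=20``; the exact layout
depends on your trainer settings):

.. code:: bash

   ls checkpoints/verl_grpo_example_gsm8k_math/qwen2_7b_megatron_fsdp/global_step_20/actor/
   # dist_ckpt/     DTensor checkpoint shards
   # huggingface/   HF config and tokenizer, saved when save_contents includes "model"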

Current Megatron-FSDP checkpoint examples assume:

- ``use_distributed_optimizer=True``.
- ``CUDA_DEVICE_MAX_CONNECTIONS`` is unset or greater than ``1``.
- PEFT + Megatron-FSDP checkpoint save/load is not covered by this example yet.
- ``checkpoint.async_save=True`` is not covered for Megatron-FSDP DTensor checkpoints yet.
- Megatron-FSDP checkpoints cannot save the optimizer state on its own; include ``model`` whenever
  ``optimizer`` is listed in ``checkpoint.save_contents`` (see the sketch after this list).
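
For example, a sketch of saving both weights and optimizer state for the actor; the exact location of
``save_contents`` in the config tree is assumed here to live under the actor's ``checkpoint`` block and may differ
across verl versions:

.. code:: bash

   bash examples/grpo_trainer/run_qwen2-7b_math_megatron_fsdp.sh \
       actor_rollout_ref.actor.checkpoint.save_contents='["model","optimizer"]'
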
1 change: 1 addition & 0 deletions docs/index.rst
@@ -61,6 +61,7 @@ verl is fast with:

examples/ppo_code_architecture
examples/gsm8k_example
examples/megatron_fsdp_example
examples/multi_modal_example
examples/skypilot_examples

110 changes: 110 additions & 0 deletions examples/grpo_trainer/run_qwen2-7b_math_megatron_fsdp.sh
@@ -0,0 +1,110 @@
#!/usr/bin/env bash
set -xeuo pipefail

export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
unset ROCR_VISIBLE_DEVICES
export VLLM_USE_V1=1
export VLLM_ALLREDUCE_USE_SYMM_MEM=0

########################### Quick Config ###########################

TP=${TP:-4}
PP=${PP:-1}
GEN_TP=${GEN_TP:-4}

rollout_mode=${rollout_mode:-async}
return_raw_chat=${return_raw_chat:-True}
USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-False}

HF_MODEL_PATH=${HF_MODEL_PATH:-Qwen/Qwen2.5-Math-7B}
gsm8k_train_path=${gsm8k_train_path:-$HOME/data/gsm8k/train.parquet}
gsm8k_test_path=${gsm8k_test_path:-$HOME/data/gsm8k/test.parquet}
math_train_path=${math_train_path:-$HOME/data/math/train.parquet}
math_test_path=${math_test_path:-$HOME/data/math/test.parquet}

train_files=${train_files:-"['$gsm8k_train_path', '$math_train_path']"}
test_files=${test_files:-"['$gsm8k_test_path', '$math_test_path']"}

########################### Parameter Arrays ###########################

DATA=(
"data.train_files=${train_files}"
"data.val_files=${test_files}"
"data.return_raw_chat=${return_raw_chat}"
data.train_batch_size=32
data.max_prompt_length=512
data.max_response_length=512
data.filter_overlong_prompts=True
data.truncation='error'
)

MODEL=(
"actor_rollout_ref.model.path=${HF_MODEL_PATH}"
"actor_rollout_ref.model.use_fused_kernels=${USE_FUSED_KERNELS}"
)

ACTOR=(
actor_rollout_ref.actor.optim.lr=1e-6
actor_rollout_ref.actor.ppo_mini_batch_size=16
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2
actor_rollout_ref.actor.use_kl_loss=True
actor_rollout_ref.actor.kl_loss_coef=0.001
actor_rollout_ref.actor.kl_loss_type=low_var_kl
actor_rollout_ref.actor.entropy_coeff=0
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${PP}
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${TP}
actor_rollout_ref.actor.megatron.use_mbridge=True
actor_rollout_ref.actor.megatron.vanilla_mbridge=False
actor_rollout_ref.actor.megatron.use_megatron_fsdp=True
++actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=False
)

ROLLOUT=(
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2
actor_rollout_ref.rollout.tensor_model_parallel_size=${GEN_TP}
actor_rollout_ref.rollout.name=vllm
"actor_rollout_ref.rollout.mode=${rollout_mode}"
actor_rollout_ref.rollout.gpu_memory_utilization=0.4
actor_rollout_ref.rollout.n=2
)

REF=(
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${PP}
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${TP}
actor_rollout_ref.ref.megatron.use_mbridge=True
actor_rollout_ref.ref.megatron.vanilla_mbridge=False
actor_rollout_ref.ref.megatron.use_megatron_fsdp=True
++actor_rollout_ref.ref.megatron.override_transformer_config.gradient_accumulation_fusion=False
)

ALGORITHM=(
algorithm.adv_estimator=grpo
algorithm.use_kl_in_reward=False
)

TRAINER=(
trainer.critic_warmup=0
trainer.logger='["console","wandb"]'
trainer.project_name='verl_grpo_example_gsm8k_math'
trainer.experiment_name='qwen2_7b_megatron_fsdp'
trainer.n_gpus_per_node=8
trainer.nnodes=1
trainer.save_freq=20
trainer.test_freq=5
trainer.total_epochs=15
)

########################### Launch ###########################

python3 -m verl.trainer.main_ppo \
--config-path=config \
--config-name='ppo_megatron_trainer.yaml' \
"${DATA[@]}" \
"${ALGORITHM[@]}" \
"${MODEL[@]}" \
"${ROLLOUT[@]}" \
"${ACTOR[@]}" \
"${REF[@]}" \
"${TRAINER[@]}" \
"$@"
80 changes: 80 additions & 0 deletions examples/sft/gsm8k/run_qwen_megatron_fsdp.sh
@@ -0,0 +1,80 @@
#!/usr/bin/env bash
set -xeuo pipefail

########################### Quick Config ###########################

MODEL_PATH=${MODEL_PATH:-Qwen/Qwen2.5-Math-7B}
SAVE_PATH=${SAVE_PATH:-/root/checkpoints/Qwen2.5-Math-7B}
TRAIN_FILES=${TRAIN_FILES:-$HOME/data/gsm8k_sft/train.parquet}
VAL_FILES=${VAL_FILES:-$HOME/data/gsm8k_sft/test.parquet}

NPROC=${NPROC:-8}
TP=${TP:-4}
PP=${PP:-1}
EP=${EP:-1}

export CUDA_DEVICE_MAX_CONNECTIONS=1
export HYDRA_FULL_ERROR=1
unset ROCR_VISIBLE_DEVICES

########################### Parameter Arrays ###########################

DATA=(
"data.train_files=${TRAIN_FILES}"
"data.val_files=${VAL_FILES}"
data.messages_key=messages
data.train_batch_size=8
data.use_dynamic_bsz=True
data.max_token_len_per_gpu=1024
data.pad_mode=no_padding
data.truncation=error
)

MODEL=(
model=hf_model
"model.path=${MODEL_PATH}"
model.trust_remote_code=True
model.use_remove_padding=true
)

OPTIM=(
optim=megatron
optim.lr=1e-5
optim.lr_warmup_steps_ratio=0.2
optim.weight_decay=0.1
"optim.betas=[0.9,0.95]"
optim.clip_grad=1.0
optim.lr_warmup_init=0
optim.lr_decay_style=cosine
optim.min_lr=1e-6
)

ENGINE=(
engine=megatron
engine.tensor_model_parallel_size=${TP}
engine.pipeline_model_parallel_size=${PP}
engine.expert_model_parallel_size=${EP}
engine.use_mbridge=True
engine.vanilla_mbridge=False
engine.use_megatron_fsdp=True
+engine.override_transformer_config.gradient_accumulation_fusion=False
)

TRAINER=(
"trainer.default_local_dir=${SAVE_PATH}"
trainer.project_name=gsm8k-sft
trainer.experiment_name=SFT-qwen2.5-7b-mfsdp
trainer.logger='["console","wandb","file"]'
trainer.total_epochs=4
)

########################### Launch ###########################

torchrun --standalone --nnodes=1 --nproc_per_node=$NPROC \
-m verl.trainer.sft_trainer \
"${DATA[@]}" \
"${MODEL[@]}" \
"${OPTIM[@]}" \
"${ENGINE[@]}" \
"${TRAINER[@]}" \
"$@"
4 changes: 4 additions & 0 deletions tests/special_e2e/run_ppo_trainer_megatron.sh
@@ -107,6 +107,7 @@ RM_PARAM_OFFLOAD=${RM_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
USE_MBRIDGE=${USE_MBRIDGE:-True}
VANILLA_MBRIDGE=${VANILLA_MBRIDGE:-True}
VALUE_VANILLA_MBRIDGE=${VALUE_VANILLA_MBRIDGE:-$VANILLA_MBRIDGE}
USE_MEGATRON_FSDP=${USE_MEGATRON_FSDP:-False}
USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-False}

LR_WARMUP_STEPS=${LR_WARMUP_STEPS:-null}
@@ -187,6 +188,7 @@ python3 -m verl.trainer.main_ppo --config-path=config \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${ppo_max_token_len_per_gpu} \
actor_rollout_ref.actor.megatron.use_mbridge=${USE_MBRIDGE} \
actor_rollout_ref.actor.megatron.vanilla_mbridge=${VANILLA_MBRIDGE} \
actor_rollout_ref.actor.megatron.use_megatron_fsdp=${USE_MEGATRON_FSDP} \
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$ACTOR_PP \
actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=$ACTOR_VPP \
actor_rollout_ref.actor.megatron.context_parallel_size=$ACTOR_CP \
@@ -216,6 +218,7 @@ python3 -m verl.trainer.main_ppo --config-path=config \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.ref.megatron.use_mbridge=${USE_MBRIDGE} \
actor_rollout_ref.ref.megatron.vanilla_mbridge=${VANILLA_MBRIDGE} \
actor_rollout_ref.ref.megatron.use_megatron_fsdp=${USE_MEGATRON_FSDP} \
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$REF_PP \
actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=$REF_VPP \
actor_rollout_ref.ref.megatron.context_parallel_size=$REF_CP \
@@ -238,6 +241,7 @@ python3 -m verl.trainer.main_ppo --config-path=config \
critic.ppo_max_token_len_per_gpu=${forward_max_token_len_per_gpu} \
critic.megatron.use_mbridge=${USE_MBRIDGE} \
critic.megatron.vanilla_mbridge=${VALUE_VANILLA_MBRIDGE} \
critic.megatron.use_megatron_fsdp=${USE_MEGATRON_FSDP} \
critic.megatron.pipeline_model_parallel_size=$CRITIC_PP \
critic.megatron.virtual_pipeline_model_parallel_size=$CRITIC_VPP \
critic.megatron.context_parallel_size=$CRITIC_CP \
3 changes: 3 additions & 0 deletions verl/trainer/config/_generated_ppo_megatron_trainer.yaml
@@ -59,6 +59,7 @@ actor_rollout_ref:
use_mbridge: true
vanilla_mbridge: true
use_remove_padding: true
use_megatron_fsdp: false
forward_only: false
dtype: bfloat16
router_replay:
@@ -240,6 +241,7 @@ actor_rollout_ref:
use_mbridge: ${oc.select:actor_rollout_ref.actor.megatron.use_mbridge,False}
vanilla_mbridge: ${oc.select:actor_rollout_ref.actor.megatron.vanilla_mbridge,True}
use_remove_padding: ${oc.select:actor_rollout_ref.actor.megatron.use_remove_padding,True}
use_megatron_fsdp: false
forward_only: true
dtype: bfloat16
router_replay:
@@ -543,6 +545,7 @@ critic:
use_mbridge: true
vanilla_mbridge: true
use_remove_padding: true
use_megatron_fsdp: false
forward_only: false
dtype: bfloat16
router_replay:
3 changes: 3 additions & 0 deletions verl/trainer/config/engine/megatron.yaml
@@ -96,6 +96,9 @@ vanilla_mbridge: True
# whether to use thd format (sequence packing), if not, use bshd format, padding the input_ids to the longest sequence length
use_remove_padding: True

# Whether to use Megatron-FSDP (ZeRO-3 sharding)
use_megatron_fsdp: False

# whether to use forward only
forward_only: False
