2 changes: 1 addition & 1 deletion README.md
@@ -181,7 +181,7 @@ Latest featured examples:

| Task | Examples |
|----------|----------|
| Training | [Verl](https://docs.skypilot.co/en/latest/examples/training/verl.html), [Finetune Llama 4](https://docs.skypilot.co/en/latest/examples/training/llama-4-finetuning.html), [TorchTitan](https://docs.skypilot.co/en/latest/examples/training/torchtitan.html), [PyTorch](https://docs.skypilot.co/en/latest/getting-started/tutorial.html), [DeepSpeed](https://docs.skypilot.co/en/latest/examples/training/deepspeed.html), [NeMo](https://docs.skypilot.co/en/latest/examples/training/nemo.html), [Ray](https://docs.skypilot.co/en/latest/examples/training/ray.html), [Unsloth](https://docs.skypilot.co/en/latest/examples/training/unsloth.html), [Jax/TPU](https://docs.skypilot.co/en/latest/examples/training/tpu.html) |
| Training | [Verl](https://docs.skypilot.co/en/latest/examples/training/verl.html), [Finetune Llama 4](https://docs.skypilot.co/en/latest/examples/training/llama-4-finetuning.html), [TorchTitan](https://docs.skypilot.co/en/latest/examples/training/torchtitan.html), [PyTorch](https://docs.skypilot.co/en/latest/getting-started/tutorial.html), [DeepSpeed](https://docs.skypilot.co/en/latest/examples/training/deepspeed.html), [NeMo](https://docs.skypilot.co/en/latest/examples/training/nemo.html), [Ray](https://docs.skypilot.co/en/latest/examples/training/ray.html), [Unsloth](https://docs.skypilot.co/en/latest/examples/training/unsloth.html), [Jax/TPU](https://docs.skypilot.co/en/latest/examples/training/tpu.html), [OpenRLHF](https://docs.skypilot.co/en/latest/examples/training/openrlhf.html) |
| Serving | [vLLM](https://docs.skypilot.co/en/latest/examples/serving/vllm.html), [SGLang](https://docs.skypilot.co/en/latest/examples/serving/sglang.html), [Ollama](https://docs.skypilot.co/en/latest/examples/serving/ollama.html) |
| Models | [DeepSeek-R1](https://docs.skypilot.co/en/latest/examples/models/deepseek-r1.html), [Llama 4](https://docs.skypilot.co/en/latest/examples/models/llama-4.html), [Llama 3](https://docs.skypilot.co/en/latest/examples/models/llama-3.html), [CodeLlama](https://docs.skypilot.co/en/latest/examples/models/codellama.html), [Qwen](https://docs.skypilot.co/en/latest/examples/models/qwen.html), [Kimi-K2](https://docs.skypilot.co/en/latest/examples/models/kimi-k2.html), [Mixtral](https://docs.skypilot.co/en/latest/examples/models/mixtral.html) |
| AI apps | [RAG](https://docs.skypilot.co/en/latest/examples/applications/rag.html), [vector databases](https://docs.skypilot.co/en/latest/examples/applications/vector_database.html) (ChromaDB, CLIP) |
1 change: 1 addition & 0 deletions docs/source/examples/training/index.rst
@@ -15,6 +15,7 @@ Training
nanochat <nanochat.md>
NeMo <nemo.md>
NeMo RL <nemorl.md>
OpenRLHF <openrlhf.md>
Ray <ray.md>
TorchTitan <torchtitan.md>
Training on TPUs <tpu.md>
1 change: 1 addition & 0 deletions docs/source/examples/training/openrlhf.md
50 changes: 50 additions & 0 deletions llm/openrlhf/README.md
@@ -0,0 +1,50 @@
# OpenRLHF

## What is OpenRLHF?

Before diving into the implementation, it helps to understand what OpenRLHF actually is. As the name suggests, it is a library for performing Reinforcement Learning from Human Feedback (RLHF). OpenRLHF describes itself as an easy-to-use, high-performance RLHF framework built on Ray, vLLM, ZeRO-3, and HuggingFace Transformers. Despite the growing number of RLHF libraries, OpenRLHF remains a popular option because of its broad support for training algorithms and its highly optimized codebase.

The framework is still under active development, so the examples here pin the commit that was latest at the time of writing. Details may change as new features land, but the core concepts should remain stable.

The OpenRLHF architecture can appear complex at first glance, but it is built from a few intuitive components:

- Ray: Ray manages distributed execution. OpenRLHF uses it to schedule the various roles in the RLHF pipeline such as actors, critics, reward models, and other processes across multiple GPUs. It also supports a Hybrid Engine mode that allows multiple components to be colocated on the same GPU in order to improve utilization.
- vLLM and AutoTP: A significant portion of RLHF training time is spent generating new trajectories from the policy model. vLLM accelerates this step with optimized inference kernels and distributed features such as tensor parallelism, which make large-model inference fast and scalable.
- ZeRO: Training large language models requires substantial memory, and ZeRO addresses this by sharding parameters and optimizer states across GPUs. It also supports offloading to CPU memory. Through its different stages, it provides fine-grained control over resource usage and efficiency.

## Setting up a SkyPilot Environment

You can set up an environment using the following commands:

```bash
uv venv --seed --python 3.10
source .venv/bin/activate
uv pip install "skypilot[gcp]"
```

The examples below use GCP; however, you're free to use any infrastructure you prefer.
You can configure and install the appropriate libraries by following the instructions in the official [SkyPilot documentation](https://docs.skypilot.co/en/latest/getting-started/installation.html).
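
After installing, you can optionally confirm that SkyPilot can access your GCP credentials before launching anything:

```bash
# Sanity check: verify that SkyPilot has working GCP credentials.
sky check gcp
```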

## Running OpenRLHF Workloads

The examples are derived from the [DPO](https://openrlhf.readthedocs.io/en/latest/non_rl.html#direct-preference-optimization-dpo), [RM Training](https://openrlhf.readthedocs.io/en/latest/rl.html#reward-model-training), and [SFT](https://openrlhf.readthedocs.io/en/latest/rl.html#supervised-fine-tuning) tutorials in the OpenRLHF documentation.

They set up the environment within a Docker container, following the recommendations in the [Quick Start guide](https://openrlhf.readthedocs.io/en/latest/quick_start.html#installation). The model sizes, batch sizes, and related parameters are scaled down to run efficiently on an 8×A100-40GB cluster and to keep execution time manageable.

All examples use spot instances by default; to run on on-demand instances instead, set `use_spot: False` in the `resources` section of the YAML.

Supervised Fine-Tuning (SFT) example:
```bash
WANDB_TOKEN=xxx HF_TOKEN=xxx sky launch -c open-rlhf-sft openrlhf_sft.yaml --secret WANDB_TOKEN --secret HF_TOKEN
```

Direct Preference Optimization (DPO) example:
```bash
WANDB_TOKEN=xxx sky launch -c open-rlhf-dpo openrlhf_dpo.yaml --secret WANDB_TOKEN
```

Reward Model (RM) training example:
```bash
WANDB_TOKEN=xxx sky launch -c open-rlhf-rm openrlhf_rm_training.yaml --secret WANDB_TOKEN
```
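
Once a job is launched, the usual SkyPilot workflow applies. For example, with the SFT cluster from above:

```bash
# Check cluster state, stream the training logs, and tear the cluster down when finished.
sky status
sky logs open-rlhf-sft
sky down open-rlhf-sft
```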


57 changes: 57 additions & 0 deletions llm/openrlhf/openrlhf_dpo.yaml
@@ -0,0 +1,57 @@
# Adapted from https://openrlhf.readthedocs.io/en/latest/non_rl.html#direct-preference-optimization-dpo
# To run, use the following command in a SkyPilot environment (with GCP configured):
# WANDB_TOKEN=xxx sky launch -c open-rlhf-dpo openrlhf_dpo.yaml --secret WANDB_TOKEN

resources:
infra: gcp
accelerators: A100:8
use_spot: True
autostop:
idle_minutes: 2 # Auto-stop after 2 minutes of idleness.
image_id: docker:nvcr.io/nvidia/pytorch:25.02-py3

config:
docker:
run_options:
- -v $PWD:/openrlhf
- -v /checkpoints:/checkpoints

envs:
WANDB_TOKEN: null # Pass with `--secret WANDB_TOKEN` in CLI

file_mounts:
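# Persist checkpoints in a GCS bucket so they survive spot preemptions and restarts.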
/checkpoints:
name: openrlhf-dpo-checkpoints
store: gcs

setup: |
sudo apt update
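# Remove preinstalled packages from the NGC image that conflict with OpenRLHF's dependencies.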
pip uninstall xgboost transformer_engine flash_attn pynvml opencv-python-headless -y
pip install git+https://github.com/OpenRLHF/OpenRLHF.git@1bfcc334692f4d1e0ec0817465d3d9d495fb162f

run: |
deepspeed --module openrlhf.cli.train_dpo \
--save_path /checkpoints/llama3-8b-dpo \
--save_steps -1 \
--logging_steps 1 \
--eval_steps -1 \
--train_batch_size 256 \
--micro_train_batch_size 1 \
--pretrain OpenRLHF/Llama-3-8b-sft-mixture \
--bf16 \
--max_samples 50000 \
--max_epochs 1 \
--max_len 8192 \
--zero_stage 3 \
--learning_rate 5e-7 \
--beta 0.1 \
--dataset OpenRLHF/preference_dataset_mixture2_and_safe_pku \
--apply_chat_template \
--chosen_key chosen \
--rejected_key rejected \
--attn_implementation flash_attention_2 \
--gradient_checkpointing \
--use_wandb ${WANDB_TOKEN} \
--wandb_project openrlhf \
--wandb_run_name dpo-training
"
57 changes: 57 additions & 0 deletions llm/openrlhf/openrlhf_rm_training.yaml
@@ -0,0 +1,57 @@
# Adapted from https://openrlhf.readthedocs.io/en/latest/rl.html#reward-model-training
# To run, use the following command in a SkyPilot environment (with GCP configured):
# WANDB_TOKEN=xxx sky launch -c open-rlhf-rm openrlhf_rm_training.yaml --secret WANDB_TOKEN

resources:
infra: gcp
accelerators: A100:8
use_spot: True
autostop:
idle_minutes: 2 # Auto-stop after 2 minutes of idleness.
image_id: docker:nvcr.io/nvidia/pytorch:25.02-py3

config:
docker:
run_options:
- -v $PWD:/openrlhf
- -v /checkpoints:/checkpoints

envs:
WANDB_TOKEN: null # Pass with `--secret WANDB_TOKEN` in CLI

file_mounts:
/checkpoints:
name: openrlhf-rm-training-checkpoints
store: gcs

setup: |
sudo apt update
pip uninstall xgboost transformer_engine flash_attn pynvml opencv-python-headless -y
pip install git+https://github.com/OpenRLHF/OpenRLHF.git@1bfcc334692f4d1e0ec0817465d3d9d495fb162f

run: |
deepspeed --module openrlhf.cli.train_rm \
--save_path /checkpoints/llama3-8b-rm \
--save_steps -1 \
--logging_steps 1 \
--eval_steps -1 \
--train_batch_size 256 \
--micro_train_batch_size 4 \
--pretrain OpenRLHF/Llama-3-8b-sft-mixture \
--bf16 \
--max_samples 50000 \
--max_epochs 1 \
--max_len 8192 \
--zero_stage 3 \
--learning_rate 9e-6 \
--dataset OpenRLHF/preference_dataset_mixture2_and_safe_pku \
--apply_chat_template \
--chosen_key chosen \
--rejected_key rejected \
--attn_implementation flash_attention_2 \
--packing_samples \
--gradient_checkpointing \
--use_wandb ${WANDB_TOKEN} \
--wandb_project openrlhf \
--wandb_run_name rm-training
"
59 changes: 59 additions & 0 deletions llm/openrlhf/openrlhf_sft.yaml
@@ -0,0 +1,59 @@
# Adapted from https://openrlhf.readthedocs.io/en/latest/rl.html#supervised-fine-tuning
# To run, use the following command in a SkyPilot environment (with GCP configured):
# WANDB_TOKEN=xxx HF_TOKEN=xxx sky launch -c open-rlhf-sft openrlhf_sft.yaml --secret WANDB_TOKEN --secret HF_TOKEN

resources:
infra: gcp
accelerators: A100:8
use_spot: True
autostop:
idle_minutes: 2 # Auto-stop after 2 minutes of idleness.
image_id: docker:nvcr.io/nvidia/pytorch:25.02-py3

config:
docker:
run_options:
- -v $PWD:/openrlhf
- -v /checkpoints:/checkpoints

envs:
WANDB_TOKEN: null # Pass with `--secret WANDB_TOKEN` in CLI
HF_TOKEN: null # Pass with `--secret HF_TOKEN` in CLI

file_mounts:
/checkpoints:
name: openrlhf-sft-checkpoints
store: gcs

setup: |
sudo apt update
pip uninstall xgboost transformer_engine flash_attn pynvml opencv-python-headless -y
pip install git+https://github.com/OpenRLHF/OpenRLHF.git@1bfcc334692f4d1e0ec0817465d3d9d495fb162f

run: |
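# Llama 3.2 weights are gated on Hugging Face, so authenticate with HF_TOKEN before downloading.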
huggingface-cli login --token ${HF_TOKEN}
deepspeed --module openrlhf.cli.train_sft \
--max_len 2048 \
--dataset Open-Orca/OpenOrca \
--input_key question \
--output_key response \
--input_template $'User: {}\nAssistant: ' \
--train_batch_size 256 \
--micro_train_batch_size 8 \
--max_samples 500000 \
--pretrain meta-llama/Llama-3.2-1B \
--save_path /checkpoints/llama3_2-1b-sft \
--save_steps -1 \
--logging_steps 1 \
--eval_steps -1 \
--zero_stage 2 \
--max_epochs 1 \
--bf16 \
--attn_implementation flash_attention_2 \
--packing_samples \
--learning_rate 5e-6 \
--gradient_checkpointing \
--use_wandb ${WANDB_TOKEN} \
--wandb_project openrlhf \
--wandb_run_name sft-training
"