From 5b160880bec3984779777e4c78c1df6013082f59 Mon Sep 17 00:00:00 2001 From: Aflah Date: Mon, 24 Nov 2025 18:49:02 +0530 Subject: [PATCH 1/7] Add OpenRLHF Example --- README.md | 2 +- docs/source/examples/training/index.rst | 1 + docs/source/examples/training/openrlhf.md | 1 + llm/openrlhf/README.md | 5 ++ llm/openrlhf/openrlhf_dpo.yaml | 56 +++++++++++++++++++++ llm/openrlhf/openrlhf_rm_training.yaml | 56 +++++++++++++++++++++ llm/openrlhf/openrlhf_sft.yaml | 59 +++++++++++++++++++++++ 7 files changed, 179 insertions(+), 1 deletion(-) create mode 100644 docs/source/examples/training/openrlhf.md create mode 100644 llm/openrlhf/README.md create mode 100644 llm/openrlhf/openrlhf_dpo.yaml create mode 100644 llm/openrlhf/openrlhf_rm_training.yaml create mode 100644 llm/openrlhf/openrlhf_sft.yaml diff --git a/README.md b/README.md index 0e122d7eb1d..bff033cebfb 100644 --- a/README.md +++ b/README.md @@ -181,7 +181,7 @@ Latest featured examples: | Task | Examples | |----------|----------| -| Training | [Verl](https://docs.skypilot.co/en/latest/examples/training/verl.html), [Finetune Llama 4](https://docs.skypilot.co/en/latest/examples/training/llama-4-finetuning.html), [TorchTitan](https://docs.skypilot.co/en/latest/examples/training/torchtitan.html), [PyTorch](https://docs.skypilot.co/en/latest/getting-started/tutorial.html), [DeepSpeed](https://docs.skypilot.co/en/latest/examples/training/deepspeed.html), [NeMo](https://docs.skypilot.co/en/latest/examples/training/nemo.html), [Ray](https://docs.skypilot.co/en/latest/examples/training/ray.html), [Unsloth](https://docs.skypilot.co/en/latest/examples/training/unsloth.html), [Jax/TPU](https://docs.skypilot.co/en/latest/examples/training/tpu.html) | +| Training | [Verl](https://docs.skypilot.co/en/latest/examples/training/verl.html), [Finetune Llama 4](https://docs.skypilot.co/en/latest/examples/training/llama-4-finetuning.html), [TorchTitan](https://docs.skypilot.co/en/latest/examples/training/torchtitan.html), [PyTorch](https://docs.skypilot.co/en/latest/getting-started/tutorial.html), [DeepSpeed](https://docs.skypilot.co/en/latest/examples/training/deepspeed.html), [NeMo](https://docs.skypilot.co/en/latest/examples/training/nemo.html), [Ray](https://docs.skypilot.co/en/latest/examples/training/ray.html), [Unsloth](https://docs.skypilot.co/en/latest/examples/training/unsloth.html), [Jax/TPU](https://docs.skypilot.co/en/latest/examples/training/tpu.html), [OpenRLHF](https://docs.skypilot.co/en/latest/examples/training/openrlhf.html) | | Serving | [vLLM](https://docs.skypilot.co/en/latest/examples/serving/vllm.html), [SGLang](https://docs.skypilot.co/en/latest/examples/serving/sglang.html), [Ollama](https://docs.skypilot.co/en/latest/examples/serving/ollama.html) | | Models | [DeepSeek-R1](https://docs.skypilot.co/en/latest/examples/models/deepseek-r1.html), [Llama 4](https://docs.skypilot.co/en/latest/examples/models/llama-4.html), [Llama 3](https://docs.skypilot.co/en/latest/examples/models/llama-3.html), [CodeLlama](https://docs.skypilot.co/en/latest/examples/models/codellama.html), [Qwen](https://docs.skypilot.co/en/latest/examples/models/qwen.html), [Kimi-K2](https://docs.skypilot.co/en/latest/examples/models/kimi-k2.html), [Mixtral](https://docs.skypilot.co/en/latest/examples/models/mixtral.html) | | AI apps | [RAG](https://docs.skypilot.co/en/latest/examples/applications/rag.html), [vector databases](https://docs.skypilot.co/en/latest/examples/applications/vector_database.html) (ChromaDB, CLIP) | diff --git 
a/docs/source/examples/training/index.rst b/docs/source/examples/training/index.rst index 8a81e0d7dc2..5f0f1c4640e 100644 --- a/docs/source/examples/training/index.rst +++ b/docs/source/examples/training/index.rst @@ -15,6 +15,7 @@ Training nanochat NeMo NeMo RL + OpenRLHF Ray TorchTitan Training on TPUs diff --git a/docs/source/examples/training/openrlhf.md b/docs/source/examples/training/openrlhf.md new file mode 100644 index 00000000000..41b888abcc2 --- /dev/null +++ b/docs/source/examples/training/openrlhf.md @@ -0,0 +1 @@ +../../generated-examples/openrlhf.md \ No newline at end of file diff --git a/llm/openrlhf/README.md b/llm/openrlhf/README.md new file mode 100644 index 00000000000..008b9c73bd0 --- /dev/null +++ b/llm/openrlhf/README.md @@ -0,0 +1,5 @@ +# OpenRLHF + +These examples are derived from the DPO, RM Training, and SFT samples available at the following URLs — https://openrlhf.readthedocs.io/en/latest/non_rl.html & https://openrlhf.readthedocs.io/en/latest/rl.html#supervised-fine-tuning + +They set up the environment within a Docker container, following the recommendations in the Quick Start guide (https://openrlhf.readthedocs.io/en/latest/quick_start.html#installation). The model sizes, batch sizes, and related parameters are scaled down to run efficiently on an 8×A100-40GB cluster and to keep execution time manageable. \ No newline at end of file diff --git a/llm/openrlhf/openrlhf_dpo.yaml b/llm/openrlhf/openrlhf_dpo.yaml new file mode 100644 index 00000000000..f61ac093c6d --- /dev/null +++ b/llm/openrlhf/openrlhf_dpo.yaml @@ -0,0 +1,56 @@ +# Adapted from https://openrlhf.readthedocs.io/en/latest/non_rl.html#direct-preference-optimization-dpo +# sky launch -c open-rlhf-dpo open-rlhf-dpo.yaml +resources: + infra: gcp + accelerators: A100:8 + use_spot: True + autostop: + idle_minutes: 2 # Auto-stop after 2 minutes of idleness. 
+ +envs: + WANDB_TOKEN: null # Pass with `--secret WANDB_TOKEN` in CLI + +file_mounts: + /checkpoints: + name: openrlhf-dpo-checkpoints + store: gcs + +setup: | + sudo apt update + docker pull nvcr.io/nvidia/pytorch:25.02-py3 + docker run --name openrlhf_tmp --runtime=nvidia -v $PWD:/openrlhf nvcr.io/nvidia/pytorch:25.02-py3 bash -c " + pip uninstall xgboost transformer_engine flash_attn pynvml opencv-python-headless -y + + pip install git+https://github.com/OpenRLHF/OpenRLHF.git@1bfcc334692f4d1e0ec0817465d3d9d495fb162f + " + docker commit openrlhf_tmp openrlhf:custom + docker rm openrlhf_tmp + +run: | + docker run --runtime=nvidia -v $PWD:/openrlhf -v /checkpoints:/checkpoints openrlhf:custom bash -c " + + deepspeed --module openrlhf.cli.train_dpo \ + --save_path /checkpoints/llama3-8b-dpo \ + --save_steps -1 \ + --logging_steps 1 \ + --eval_steps -1 \ + --train_batch_size 256 \ + --micro_train_batch_size 1 \ + --pretrain OpenRLHF/Llama-3-8b-sft-mixture \ + --bf16 \ + --max_samples 50000 \ + --max_epochs 1 \ + --max_len 8192 \ + --zero_stage 3 \ + --learning_rate 5e-7 \ + --beta 0.1 \ + --dataset OpenRLHF/preference_dataset_mixture2_and_safe_pku \ + --apply_chat_template \ + --chosen_key chosen \ + --rejected_key rejected \ + --attn_implementation flash_attention_2 \ + --gradient_checkpointing \ + --use_wandb ${WANDB_TOKEN} \ + --wandb_project openrlhf \ + --wandb_run_name dpo-training + " \ No newline at end of file diff --git a/llm/openrlhf/openrlhf_rm_training.yaml b/llm/openrlhf/openrlhf_rm_training.yaml new file mode 100644 index 00000000000..df504cabb2b --- /dev/null +++ b/llm/openrlhf/openrlhf_rm_training.yaml @@ -0,0 +1,56 @@ +# Adapted from https://openrlhf.readthedocs.io/en/latest/rl.html#reward-model-training +# sky launch -c open-rlhf-rm open-rlhf-rm-training.yaml +resources: + infra: gcp + accelerators: A100:8 + use_spot: True + autostop: + idle_minutes: 2 # Auto-stop after 2 minutes of idleness. 
+ +envs: + WANDB_TOKEN: null # Pass with `--secret WANDB_TOKEN` in CLI + +file_mounts: + /checkpoints: + name: openrlhf-rm-training-checkpoints + store: gcs + +setup: | + sudo apt update + docker pull nvcr.io/nvidia/pytorch:25.02-py3 + docker run --name openrlhf_tmp --runtime=nvidia -v $PWD:/openrlhf nvcr.io/nvidia/pytorch:25.02-py3 bash -c " + pip uninstall xgboost transformer_engine flash_attn pynvml opencv-python-headless -y + + pip install git+https://github.com/OpenRLHF/OpenRLHF.git@1bfcc334692f4d1e0ec0817465d3d9d495fb162f + " + docker commit openrlhf_tmp openrlhf:custom + docker rm openrlhf_tmp + +run: | + docker run --runtime=nvidia -v $PWD:/openrlhf -v /checkpoints:/checkpoints openrlhf:custom bash -c " + + deepspeed --module openrlhf.cli.train_rm \ + --save_path /checkpoints/llama3-8b-rm \ + --save_steps -1 \ + --logging_steps 1 \ + --eval_steps -1 \ + --train_batch_size 256 \ + --micro_train_batch_size 4 \ + --pretrain OpenRLHF/Llama-3-8b-sft-mixture \ + --bf16 \ + --max_samples 50000 \ + --max_epochs 1 \ + --max_len 8192 \ + --zero_stage 3 \ + --learning_rate 9e-6 \ + --dataset OpenRLHF/preference_dataset_mixture2_and_safe_pku \ + --apply_chat_template \ + --chosen_key chosen \ + --rejected_key rejected \ + --attn_implementation flash_attention_2 \ + --packing_samples \ + --gradient_checkpointing \ + --use_wandb ${WANDB_TOKEN} \ + --wandb_project openrlhf \ + --wandb_run_name rm-training + " \ No newline at end of file diff --git a/llm/openrlhf/openrlhf_sft.yaml b/llm/openrlhf/openrlhf_sft.yaml new file mode 100644 index 00000000000..0b29cca831d --- /dev/null +++ b/llm/openrlhf/openrlhf_sft.yaml @@ -0,0 +1,59 @@ +# Adapted from https://openrlhf.readthedocs.io/en/latest/rl.html#supervised-fine-tuning + +resources: + infra: gcp + accelerators: A100:8 + use_spot: True + autostop: + idle_minutes: 2 # Auto-stop after 2 minutes of idleness. 
+ +envs: + WANDB_TOKEN: null # Pass with `--secret WANDB_TOKEN` in CLI + HF_TOKEN: null # Pass with `--secret HF_TOKEN` in CLI + +file_mounts: + /checkpoints: + name: openrlhf-sft-checkpoints + store: gcs + +setup: | + sudo apt update + docker pull nvcr.io/nvidia/pytorch:25.02-py3 + docker run --name openrlhf_tmp --runtime=nvidia -v $PWD:/openrlhf nvcr.io/nvidia/pytorch:25.02-py3 bash -c " + pip uninstall xgboost transformer_engine flash_attn pynvml opencv-python-headless -y + + pip install git+https://github.com/OpenRLHF/OpenRLHF.git@1bfcc334692f4d1e0ec0817465d3d9d495fb162f + " + docker commit openrlhf_tmp openrlhf:custom + docker rm openrlhf_tmp + +run: | + docker run --runtime=nvidia -v $PWD:/openrlhf -v /checkpoints:/checkpoints openrlhf:custom bash -c " + + huggingface-cli login --token ${HF_TOKEN} + + deepspeed --module openrlhf.cli.train_sft \ + --max_len 2048 \ + --dataset Open-Orca/OpenOrca \ + --input_key question \ + --output_key response \ + --input_template $'User: {}\nAssistant: ' \ + --train_batch_size 256 \ + --micro_train_batch_size 8 \ + --max_samples 500000 \ + --pretrain meta-llama/Llama-3.2-1B \ + --save_path /checkpoints/llama3_2-1b-sft \ + --save_steps -1 \ + --logging_steps 1 \ + --eval_steps -1 \ + --zero_stage 2 \ + --max_epochs 1 \ + --bf16 \ + --attn_implementation flash_attention_2 \ + --packing_samples \ + --learning_rate 5e-6 \ + --gradient_checkpointing \ + --use_wandb ${WANDB_TOKEN} \ + --wandb_project openrlhf \ + --wandb_run_name sft-training + " \ No newline at end of file From 7548961c0908dfd2ed7ed41ded2bcd66b9d6f2ca Mon Sep 17 00:00:00 2001 From: Aflah Date: Mon, 24 Nov 2025 18:59:11 +0530 Subject: [PATCH 2/7] Improve documentation --- llm/openrlhf/README.md | 21 ++++++++++++++++++++- llm/openrlhf/openrlhf_dpo.yaml | 4 +++- llm/openrlhf/openrlhf_rm_training.yaml | 4 +++- llm/openrlhf/openrlhf_sft.yaml | 2 ++ 4 files changed, 28 insertions(+), 3 deletions(-) diff --git a/llm/openrlhf/README.md b/llm/openrlhf/README.md index 008b9c73bd0..65848ab43b4 100644 --- a/llm/openrlhf/README.md +++ b/llm/openrlhf/README.md @@ -2,4 +2,23 @@ These examples are derived from the DPO, RM Training, and SFT samples available at the following URLs — https://openrlhf.readthedocs.io/en/latest/non_rl.html & https://openrlhf.readthedocs.io/en/latest/rl.html#supervised-fine-tuning -They set up the environment within a Docker container, following the recommendations in the Quick Start guide (https://openrlhf.readthedocs.io/en/latest/quick_start.html#installation). The model sizes, batch sizes, and related parameters are scaled down to run efficiently on an 8×A100-40GB cluster and to keep execution time manageable. \ No newline at end of file +They set up the environment within a Docker container, following the recommendations in the Quick Start guide (https://openrlhf.readthedocs.io/en/latest/quick_start.html#installation). The model sizes, batch sizes, and related parameters are scaled down to run efficiently on an 8×A100-40GB cluster and to keep execution time manageable. 
+ +All the examples use spot instances by default; this can be changed by setting `use_spot` to `False` in the task YAML. + +Supervised Fine-Tuning (SFT) example: +``` +WANDB_TOKEN=xxx HF_TOKEN=xxx sky launch -c open-rlhf-sft openrlhf_sft.yaml --secret WANDB_TOKEN --secret HF_TOKEN +``` + +Direct Preference Optimization (DPO) example: +``` +WANDB_TOKEN=xxx sky launch -c open-rlhf-dpo openrlhf_dpo.yaml --secret WANDB_TOKEN +``` + +Reward Model Training example: +``` +WANDB_TOKEN=xxx sky launch -c open-rlhf-rm openrlhf_rm_training.yaml --secret WANDB_TOKEN +``` + + diff --git a/llm/openrlhf/openrlhf_dpo.yaml b/llm/openrlhf/openrlhf_dpo.yaml index f61ac093c6d..f9d8b114b97 100644 --- a/llm/openrlhf/openrlhf_dpo.yaml +++ b/llm/openrlhf/openrlhf_dpo.yaml @@ -1,5 +1,7 @@ # Adapted from https://openrlhf.readthedocs.io/en/latest/non_rl.html#direct-preference-optimization-dpo -# sky launch -c open-rlhf-dpo open-rlhf-dpo.yaml +# To run this example, use the following command in a SkyPilot environment (with GCP configured): +# WANDB_TOKEN=xxx sky launch -c open-rlhf-dpo openrlhf_dpo.yaml --secret WANDB_TOKEN + resources: infra: gcp accelerators: A100:8 diff --git a/llm/openrlhf/openrlhf_rm_training.yaml b/llm/openrlhf/openrlhf_rm_training.yaml index df504cabb2b..53c32661dbe 100644 --- a/llm/openrlhf/openrlhf_rm_training.yaml +++ b/llm/openrlhf/openrlhf_rm_training.yaml @@ -1,5 +1,7 @@ # Adapted from https://openrlhf.readthedocs.io/en/latest/rl.html#reward-model-training -# sky launch -c open-rlhf-rm open-rlhf-rm-training.yaml +# To run this example, use the following command in a SkyPilot environment (with GCP configured): +# WANDB_TOKEN=xxx sky launch -c open-rlhf-rm openrlhf_rm_training.yaml --secret WANDB_TOKEN + resources: infra: gcp accelerators: A100:8 diff --git a/llm/openrlhf/openrlhf_sft.yaml b/llm/openrlhf/openrlhf_sft.yaml index 0b29cca831d..5ff563be07a 100644 --- a/llm/openrlhf/openrlhf_sft.yaml +++ b/llm/openrlhf/openrlhf_sft.yaml @@ -1,4 +1,6 @@ # Adapted from https://openrlhf.readthedocs.io/en/latest/rl.html#supervised-fine-tuning +# To run this example, use the following command in a SkyPilot environment (with GCP configured): +# WANDB_TOKEN=xxx HF_TOKEN=xxx sky launch -c open-rlhf-sft openrlhf_sft.yaml --secret WANDB_TOKEN --secret HF_TOKEN resources: infra: gcp accelerators: A100:8 From af48c622afe3b9728bb524083923e8d440e6c365 Mon Sep 17 00:00:00 2001 From: Aflah Date: Mon, 24 Nov 2025 19:20:54 +0530 Subject: [PATCH 3/7] Try to fix failing test --- docs/source/examples/training/openrlhf.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/examples/training/openrlhf.md b/docs/source/examples/training/openrlhf.md index 41b888abcc2..8a83f8707e9 100644 --- a/docs/source/examples/training/openrlhf.md +++ b/docs/source/examples/training/openrlhf.md @@ -1 +1,2 @@ -../../generated-examples/openrlhf.md \ No newline at end of file +:::{include} ../../generated-examples/openrlhf.md +::: \ No newline at end of file From 73b4472a4cf968ca80340a20dc29a215455cc1a4 Mon Sep 17 00:00:00 2001 From: Aflah Date: Tue, 25 Nov 2025 23:03:36 +0530 Subject: [PATCH 4/7] Update to use image_id attr --- llm/openrlhf/openrlhf_dpo.yaml | 67 ++++++++++++------------ llm/openrlhf/openrlhf_rm_training.yaml | 67 ++++++++++++------------ llm/openrlhf/openrlhf_sft.yaml | 70 +++++++++++++------------- 3 files changed, 100 insertions(+), 104 deletions(-) diff --git a/llm/openrlhf/openrlhf_dpo.yaml b/llm/openrlhf/openrlhf_dpo.yaml index f9d8b114b97..cd85d69daa5 100644 --- a/llm/openrlhf/openrlhf_dpo.yaml +++ b/llm/openrlhf/openrlhf_dpo.yaml @@ 
-8,6 +8,13 @@ resources: use_spot: True autostop: idle_minutes: 2 # Auto-stop after 2 minutes of idleness. + image_id: docker:nvcr.io/nvidia/pytorch:25.02-py3 + +config: + docker: + run_options: + - -v $PWD:/openrlhf + - -v /checkpoints:/checkpoints envs: WANDB_TOKEN: null # Pass with `--secret WANDB_TOKEN` in CLI @@ -19,40 +26,32 @@ file_mounts: setup: | sudo apt update - docker pull nvcr.io/nvidia/pytorch:25.02-py3 - docker run --name openrlhf_tmp --runtime=nvidia -v $PWD:/openrlhf nvcr.io/nvidia/pytorch:25.02-py3 bash -c " - pip uninstall xgboost transformer_engine flash_attn pynvml opencv-python-headless -y - - pip install git+https://github.com/OpenRLHF/OpenRLHF.git@1bfcc334692f4d1e0ec0817465d3d9d495fb162f - " - docker commit openrlhf_tmp openrlhf:custom - docker rm openrlhf_tmp + pip uninstall xgboost transformer_engine flash_attn pynvml opencv-python-headless -y + pip install git+https://github.com/OpenRLHF/OpenRLHF.git@1bfcc334692f4d1e0ec0817465d3d9d495fb162f run: | - docker run --runtime=nvidia -v $PWD:/openrlhf -v /checkpoints:/checkpoints openrlhf:custom bash -c " - - deepspeed --module openrlhf.cli.train_dpo \ - --save_path /checkpoints/llama3-8b-dpo \ - --save_steps -1 \ - --logging_steps 1 \ - --eval_steps -1 \ - --train_batch_size 256 \ - --micro_train_batch_size 1 \ - --pretrain OpenRLHF/Llama-3-8b-sft-mixture \ - --bf16 \ - --max_samples 50000 \ - --max_epochs 1 \ - --max_len 8192 \ - --zero_stage 3 \ - --learning_rate 5e-7 \ - --beta 0.1 \ - --dataset OpenRLHF/preference_dataset_mixture2_and_safe_pku \ - --apply_chat_template \ - --chosen_key chosen \ - --rejected_key rejected \ - --attn_implementation flash_attention_2 \ - --gradient_checkpointing \ - --use_wandb ${WANDB_TOKEN} \ - --wandb_project openrlhf \ - --wandb_run_name dpo-training + deepspeed --module openrlhf.cli.train_dpo \ + --save_path /checkpoints/llama3-8b-dpo \ + --save_steps -1 \ + --logging_steps 1 \ + --eval_steps -1 \ + --train_batch_size 256 \ + --micro_train_batch_size 1 \ + --pretrain OpenRLHF/Llama-3-8b-sft-mixture \ + --bf16 \ + --max_samples 50000 \ + --max_epochs 1 \ + --max_len 8192 \ + --zero_stage 3 \ + --learning_rate 5e-7 \ + --beta 0.1 \ + --dataset OpenRLHF/preference_dataset_mixture2_and_safe_pku \ + --apply_chat_template \ + --chosen_key chosen \ + --rejected_key rejected \ + --attn_implementation flash_attention_2 \ + --gradient_checkpointing \ + --use_wandb ${WANDB_TOKEN} \ + --wandb_project openrlhf \ + --wandb_run_name dpo-training " \ No newline at end of file diff --git a/llm/openrlhf/openrlhf_rm_training.yaml b/llm/openrlhf/openrlhf_rm_training.yaml index 53c32661dbe..2d8ffcb182c 100644 --- a/llm/openrlhf/openrlhf_rm_training.yaml +++ b/llm/openrlhf/openrlhf_rm_training.yaml @@ -8,6 +8,13 @@ resources: use_spot: True autostop: idle_minutes: 2 # Auto-stop after 2 minutes of idleness. 
+ image_id: docker:nvcr.io/nvidia/pytorch:25.02-py3 + +config: + docker: + run_options: + - -v $PWD:/openrlhf + - -v /checkpoints:/checkpoints envs: WANDB_TOKEN: null # Pass with `--secret WANDB_TOKEN` in CLI @@ -19,40 +26,32 @@ file_mounts: setup: | sudo apt update - docker pull nvcr.io/nvidia/pytorch:25.02-py3 - docker run --name openrlhf_tmp --runtime=nvidia -v $PWD:/openrlhf nvcr.io/nvidia/pytorch:25.02-py3 bash -c " - pip uninstall xgboost transformer_engine flash_attn pynvml opencv-python-headless -y - - pip install git+https://github.com/OpenRLHF/OpenRLHF.git@1bfcc334692f4d1e0ec0817465d3d9d495fb162f - " - docker commit openrlhf_tmp openrlhf:custom - docker rm openrlhf_tmp + pip uninstall xgboost transformer_engine flash_attn pynvml opencv-python-headless -y + pip install git+https://github.com/OpenRLHF/OpenRLHF.git@1bfcc334692f4d1e0ec0817465d3d9d495fb162f run: | - docker run --runtime=nvidia -v $PWD:/openrlhf -v /checkpoints:/checkpoints openrlhf:custom bash -c " - - deepspeed --module openrlhf.cli.train_rm \ - --save_path /checkpoints/llama3-8b-rm \ - --save_steps -1 \ - --logging_steps 1 \ - --eval_steps -1 \ - --train_batch_size 256 \ - --micro_train_batch_size 4 \ - --pretrain OpenRLHF/Llama-3-8b-sft-mixture \ - --bf16 \ - --max_samples 50000 \ - --max_epochs 1 \ - --max_len 8192 \ - --zero_stage 3 \ - --learning_rate 9e-6 \ - --dataset OpenRLHF/preference_dataset_mixture2_and_safe_pku \ - --apply_chat_template \ - --chosen_key chosen \ - --rejected_key rejected \ - --attn_implementation flash_attention_2 \ - --packing_samples \ - --gradient_checkpointing \ - --use_wandb ${WANDB_TOKEN} \ - --wandb_project openrlhf \ - --wandb_run_name rm-training + deepspeed --module openrlhf.cli.train_rm \ + --save_path /checkpoints/llama3-8b-rm \ + --save_steps -1 \ + --logging_steps 1 \ + --eval_steps -1 \ + --train_batch_size 256 \ + --micro_train_batch_size 4 \ + --pretrain OpenRLHF/Llama-3-8b-sft-mixture \ + --bf16 \ + --max_samples 50000 \ + --max_epochs 1 \ + --max_len 8192 \ + --zero_stage 3 \ + --learning_rate 9e-6 \ + --dataset OpenRLHF/preference_dataset_mixture2_and_safe_pku \ + --apply_chat_template \ + --chosen_key chosen \ + --rejected_key rejected \ + --attn_implementation flash_attention_2 \ + --packing_samples \ + --gradient_checkpointing \ + --use_wandb ${WANDB_TOKEN} \ + --wandb_project openrlhf \ + --wandb_run_name rm-training " \ No newline at end of file diff --git a/llm/openrlhf/openrlhf_sft.yaml b/llm/openrlhf/openrlhf_sft.yaml index 5ff563be07a..e0df4968499 100644 --- a/llm/openrlhf/openrlhf_sft.yaml +++ b/llm/openrlhf/openrlhf_sft.yaml @@ -8,6 +8,13 @@ resources: use_spot: True autostop: idle_minutes: 2 # Auto-stop after 2 minutes of idleness. 
+ image_id: docker:nvcr.io/nvidia/pytorch:25.02-py3 + +config: + docker: + run_options: + - -v $PWD:/openrlhf + - -v /checkpoints:/checkpoints envs: WANDB_TOKEN: null # Pass with `--secret WANDB_TOKEN` in CLI @@ -20,42 +27,33 @@ file_mounts: setup: | sudo apt update - docker pull nvcr.io/nvidia/pytorch:25.02-py3 - docker run --name openrlhf_tmp --runtime=nvidia -v $PWD:/openrlhf nvcr.io/nvidia/pytorch:25.02-py3 bash -c " - pip uninstall xgboost transformer_engine flash_attn pynvml opencv-python-headless -y - - pip install git+https://github.com/OpenRLHF/OpenRLHF.git@1bfcc334692f4d1e0ec0817465d3d9d495fb162f - " - docker commit openrlhf_tmp openrlhf:custom - docker rm openrlhf_tmp + pip uninstall xgboost transformer_engine flash_attn pynvml opencv-python-headless -y + pip install git+https://github.com/OpenRLHF/OpenRLHF.git@1bfcc334692f4d1e0ec0817465d3d9d495fb162f run: | - docker run --runtime=nvidia -v $PWD:/openrlhf -v /checkpoints:/checkpoints openrlhf:custom bash -c " - - huggingface-cli login --token ${HF_TOKEN} - - deepspeed --module openrlhf.cli.train_sft \ - --max_len 2048 \ - --dataset Open-Orca/OpenOrca \ - --input_key question \ - --output_key response \ - --input_template $'User: {}\nAssistant: ' \ - --train_batch_size 256 \ - --micro_train_batch_size 8 \ - --max_samples 500000 \ - --pretrain meta-llama/Llama-3.2-1B \ - --save_path /checkpoints/llama3_2-1b-sft \ - --save_steps -1 \ - --logging_steps 1 \ - --eval_steps -1 \ - --zero_stage 2 \ - --max_epochs 1 \ - --bf16 \ - --attn_implementation flash_attention_2 \ - --packing_samples \ - --learning_rate 5e-6 \ - --gradient_checkpointing \ - --use_wandb ${WANDB_TOKEN} \ - --wandb_project openrlhf \ - --wandb_run_name sft-training + huggingface-cli login --token ${HF_TOKEN} + deepspeed --module openrlhf.cli.train_sft \ + --max_len 2048 \ + --dataset Open-Orca/OpenOrca \ + --input_key question \ + --output_key response \ + --input_template $'User: {}\nAssistant: ' \ + --train_batch_size 256 \ + --micro_train_batch_size 8 \ + --max_samples 500000 \ + --pretrain meta-llama/Llama-3.2-1B \ + --save_path /checkpoints/llama3_2-1b-sft \ + --save_steps -1 \ + --logging_steps 1 \ + --eval_steps -1 \ + --zero_stage 2 \ + --max_epochs 1 \ + --bf16 \ + --attn_implementation flash_attention_2 \ + --packing_samples \ + --learning_rate 5e-6 \ + --gradient_checkpointing \ + --use_wandb ${WANDB_TOKEN} \ + --wandb_project openrlhf \ + --wandb_run_name sft-training " \ No newline at end of file From e84a519afc450c611e6cfde666b58333a474ceb1 Mon Sep 17 00:00:00 2001 From: Mohammad Aflah Khan Date: Thu, 27 Nov 2025 14:35:51 +0100 Subject: [PATCH 5/7] Remove file to test CI --- docs/source/examples/training/openrlhf.md | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 docs/source/examples/training/openrlhf.md diff --git a/docs/source/examples/training/openrlhf.md b/docs/source/examples/training/openrlhf.md deleted file mode 100644 index 8a83f8707e9..00000000000 --- a/docs/source/examples/training/openrlhf.md +++ /dev/null @@ -1,2 +0,0 @@ -:::{include} ../../generated-examples/openrlhf.md -::: \ No newline at end of file From 732a79a4ecf1cc9706520e7b50f6acdb22dc9df5 Mon Sep 17 00:00:00 2001 From: Mohammad Aflah Khan Date: Thu, 27 Nov 2025 14:47:30 +0100 Subject: [PATCH 6/7] Followed Instructions from Docs for SymLink --- docs/source/examples/training/openrlhf.md | 1 + 1 file changed, 1 insertion(+) create mode 120000 docs/source/examples/training/openrlhf.md diff --git a/docs/source/examples/training/openrlhf.md 
b/docs/source/examples/training/openrlhf.md new file mode 120000 index 00000000000..41b888abcc2 --- /dev/null +++ b/docs/source/examples/training/openrlhf.md @@ -0,0 +1 @@ +../../generated-examples/openrlhf.md \ No newline at end of file From a879808a670c2744d42218540fcc5f10bf07d38f Mon Sep 17 00:00:00 2001 From: Mohammad Aflah Khan Date: Thu, 27 Nov 2025 15:08:08 +0100 Subject: [PATCH 7/7] Improve Docs for OpenRLHF --- llm/openrlhf/README.md | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/llm/openrlhf/README.md b/llm/openrlhf/README.md index 65848ab43b4..71282c96811 100644 --- a/llm/openrlhf/README.md +++ b/llm/openrlhf/README.md @@ -1,6 +1,32 @@ # OpenRLHF -These examples are derived from the DPO, RM Training, and SFT samples available at the following URLs — https://openrlhf.readthedocs.io/en/latest/non_rl.html & https://openrlhf.readthedocs.io/en/latest/rl.html#supervised-fine-tuning +## What is OpenRLHF? + +Before diving into the implementation, it helps to understand what OpenRLHF actually is. As the name suggests, it is a library for performing Reinforcement Learning from Human Feedback (RLHF). OpenRLHF describes itself as an easy-to-use, high-performance RLHF framework built on Ray, vLLM, ZeRO-3, and HuggingFace Transformers. Despite the growing number of RLHF libraries today, OpenRLHF remains a popular option because of its wide support for training algorithms and its highly optimized codebase. +The framework is still under active development, so these examples pin a specific recent commit. Details may change as new features land, but the core concepts should remain stable. + +The OpenRLHF architecture can appear complex at first glance, but it is built from a few intuitive components: + +- Ray: Ray manages distributed execution. OpenRLHF uses it to schedule the various roles in the RLHF pipeline, such as actors, critics, reward models, and other processes, across multiple GPUs. It also supports a Hybrid Engine mode that allows multiple components to be colocated on the same GPU in order to improve utilization. +- vLLM and AutoTP: A significant portion of RLHF training time is spent generating new trajectories from the policy model. vLLM accelerates this step with optimized inference kernels and distributed features such as tensor parallelism, which make large-model inference fast and scalable. +- ZeRO: Training large language models requires substantial memory, and ZeRO addresses this by sharding parameters and optimizer states across GPUs. It also supports offloading to CPU memory. Through its different stages, it provides fine-grained control over resource usage and efficiency. + +## Setting up a SkyPilot Environment + +You can set up an environment using the following commands: + +```bash +uv venv --seed --python 3.10 +source .venv/bin/activate +uv pip install "skypilot[gcp]" +``` + +The examples below use GCP; however, you're free to use any infrastructure you prefer. +You can configure and install the appropriate libraries by following the instructions in the official [SkyPilot documentation](https://docs.skypilot.co/en/latest/getting-started/installation.html).
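+
+Before launching anything, it is worth verifying that SkyPilot can reach the cloud. The snippet below is a minimal sanity check, assuming GCP credentials are already configured on the local machine:
+
+```bash
+# Confirm that SkyPilot has working credentials for GCP.
+sky check gcp
+```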
+ +## Running OpenRLHF Workloads + +The examples are derived from the [DPO](https://openrlhf.readthedocs.io/en/latest/non_rl.html#direct-preference-optimization-dpo), [RM Training](https://openrlhf.readthedocs.io/en/latest/rl.html#reward-model-training), and [SFT](https://openrlhf.readthedocs.io/en/latest/rl.html#supervised-fine-tuning) tutorials present in the OpenRLHF documentation. They set up the environment within a Docker container, following the recommendations in the Quick Start guide (https://openrlhf.readthedocs.io/en/latest/quick_start.html#installation). The model sizes, batch sizes, and related parameters are scaled down to run efficiently on an 8×A100-40GB cluster and to keep execution time manageable.
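+
+Once a job has been launched, day-to-day operations go through the standard SkyPilot CLI. The sketch below is illustrative and assumes the SFT example above (cluster `open-rlhf-sft`, checkpoint bucket `openrlhf-sft-checkpoints`); adjust the names for the DPO and RM runs and for your own `--save_path`:
+
+```bash
+# Stream the logs of the latest job on the cluster.
+sky logs open-rlhf-sft
+
+# Checkpoints land in the GCS bucket backing /checkpoints,
+# so they survive spot preemptions and cluster teardown.
+gsutil ls gs://openrlhf-sft-checkpoints/
+
+# Tear the cluster down when finished (autostop stops it after
+# 2 idle minutes; `sky down` deletes it).
+sky down open-rlhf-sft
+```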