
Commit 70b0472

rlzero template and fixes to rlzero scripts (#1216)

Authored by mnoukhov, finbarrtimbers, hamishivi, saurabh111233212, and VictoriaGraf

* Added olmo3 scripts
* Added scripts
* Updated scripts with better defaults
* Removed 32B DPO smoke test
* Fixed table formatting.
* Added RLZero scripts.
* scripts hamish
* add beaker
* add beaker
* add beaker
* add beaker links
* RL0 beaker links
* add beaker links
* delete a couple of RL Zero rows
* Added launching instructions
* update multiturn subset names
* thinker rl-zero template
* fix scripts
* remove answer stop string
* fix and remove gs_model_name
* fixes for gemini, cursor, finbarr
* fixed

---------

Co-authored-by: Finbarr Timbers <[email protected]>
Co-authored-by: Hamish Ivison <[email protected]>
Co-authored-by: Saurabh Shah <[email protected]>
Co-authored-by: VictoriaGraf <[email protected]>

1 parent 7e780f2 commit 70b0472

File tree

5 files changed: +124 −74 lines changed

open_instruct/dataset_transformation.py

Lines changed: 17 additions & 37 deletions

@@ -523,47 +523,27 @@ def visualize_token_role(tokens: list[int], masks: list[int], tokenizer: PreTrai
             "{% endif %}"
             "{% endfor %}"
         ),
-        "olmo_thinker_r1_style": (
-            "A conversation between user and assistant. "
-            "The user asks a question, and the assistant solves it. "
-            "The assistant first thinks and reasons about the question "
-            "and after thinking provides the user with the answer. "
-            "The reasoning process is enclosed in <think> </think> tags "
-            "and the answer is enclosed in <answer> </answer> tags "
-            "so the full response is <think> reasoning process here </think> "
-            "<answer> answer here </answer>."
+        "olmo_thinker_rlzero": (
+            "Solve the following problem step by step. "
+            "The last line of your response should be the answer to the problem in form Answer: $Answer (without quotes) where $Answer is the answer to the problem."
             "\n\n"
             "{% for message in messages %}"
-            "{% if message['role'] == 'system' %}"
-            "{% if message.get('functions', none) is not none %}"
-            "{{ '<|im_start|>system\n' + message['content'] + '\n' + '<functions>' + message['functions'] + '</functions><|im_end|>\n' }}"
-            "{% else %}"
-            "{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>\n' }}"
-            "{% endif %}"
-            "{% elif message['role'] == 'user' %}"
-            "{% if message.get('functions', none) is not none %}"
-            "{{ '<|im_start|>user\n' + message['content'] + '\n' + '<functions>' + message['functions'] + '</functions><|im_end|>\n' }}"
-            "{% else %}"
-            "{{ '<|im_start|>user\n' + message['content'] + '<|im_end|>\n' }}"
-            "{% endif %}"
-            "{% elif message['role'] == 'assistant' %}"
-            "{{ '<|im_start|>assistant\n' }}"
-            "{% if message.get('content', none) is not none %}"
-            "{{ message['content'] }}"
-            "{% endif %}"
-            "{% if message.get('function_calls', none) is not none %}"
-            "{{ '<function_calls>' + message['function_calls'] + '</function_calls>' }}"
-            "{% endif %}"
-            "{% if not loop.last %}"
-            "{{ '<|im_end|>' + '\n' }}"
-            "{% else %}"
-            "{{ eos_token }}"
-            "{% endif %}"
-            "{% elif message['role'] == 'environment' %}"
-            "{{ '<|im_start|>environment\n' + message['content'] + '<|im_end|>\n' }}"
+            "{{ '\n\n' if not loop.first else '' }}"
+            "{{ message['content'] + '\n' }}"
+            "{% if loop.last and add_generation_prompt %}"
+            "{{ '\nRemember to put your answer on its own line after \"Answer:\"' }}"
             "{% endif %}"
+            "{% endfor %}"
+        ),
+        "olmo_thinker_code_rlzero": (
+            "Solve the following code problem step by step. "
+            f"The last part of your response should be the solution to the problem in form ```\npython\nCODE\n``` where CODE is the solution for the problem."
+            "\n\n"
+            "{% for message in messages %}"
+            "{{ '\n\n' if not loop.first else '' }}"
+            "{{ message['content'] + '\n' }}"
             "{% if loop.last and add_generation_prompt %}"
-            "{{ '<|im_start|>assistant\n<think>' }}"
+            f"\nRemember to put your solution inside the ```\npython\nCODE\n``` tags"
             "{% endif %}"
             "{% endfor %}"
         ),
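Unlike the old `olmo_thinker_r1_style` template, the RLZero templates emit no `<|im_start|>` role tags or `<think>` scaffolding at all: messages are concatenated as plain text, with a format reminder appended at generation time. A minimal sketch of the rendering, using only stock `jinja2` (the template string is copied from the diff above; the question is invented):

```python
# Sketch, not repo code: render the new "olmo_thinker_rlzero" template.
# Requires: pip install jinja2
from jinja2 import Template

OLMO_THINKER_RLZERO = (
    "Solve the following problem step by step. "
    "The last line of your response should be the answer to the problem in form "
    "Answer: $Answer (without quotes) where $Answer is the answer to the problem."
    "\n\n"
    "{% for message in messages %}"
    "{{ '\n\n' if not loop.first else '' }}"
    "{{ message['content'] + '\n' }}"
    "{% if loop.last and add_generation_prompt %}"
    "{{ '\nRemember to put your answer on its own line after \"Answer:\"' }}"
    "{% endif %}"
    "{% endfor %}"
)

prompt = Template(OLMO_THINKER_RLZERO).render(
    messages=[{"role": "user", "content": "What is 17 * 24?"}],
    add_generation_prompt=True,
)
print(prompt)
# Solve the following problem step by step. [...] in form Answer: $Answer [...]
#
# What is 17 * 24?
#
# Remember to put your answer on its own line after "Answer:"
```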

scripts/train/olmo3/7b_rlzero_code.sh

Lines changed: 7 additions & 11 deletions

@@ -1,16 +1,14 @@
 #!/bin/bash
 
-# OLMo 3 model
-MODEL_NAME_OR_PATH="/weka/oe-training-default/ai2-llm/checkpoints/tylerr/long-context/olmo25_7b_lc_64k_6T_M100B_round5-sparkle_6634-pre_s2pdf_gzip2080_cweN-yake-all-olmo_packing_yarn-fullonly_50B-fb13a737/step11921-hf"
+MODEL_NAME_OR_PATH="allenai/Olmo-3-1025-7B"
+DATASETS="allenai/Dolci-RLZero-Code-7B 1.0"
 
-DATASETS="saurabh5/rlvr_acecoder_filtered_filtered_olmo_completions_filtered 6656 hamishivi/synthetic2-rlvr-code-compressed_filtered 3328 hamishivi/klear-code-rlvr_filtered 3328"
-
-LOCAL_EVALS="hamishivi/rlvr_acecoder_filtered_filtered 4 hamishivi/klear-code-rlvr_filtered 4"
+LOCAL_EVALS="allenai/Dolci-RLZero-Code-7B 8"
 LOCAL_EVAL_SPLITS="train"
 
 EVALS="codex_humanevalplus:0-shot-chat::tulu-thinker_deepseek,mbppplus:0-shot-chat::tulu-thinker_deepseek,livecodebench_codegeneration::tulu-thinker_deepseek_lite"
 
-EXP_NAME="grpo_code_from_zero"
+EXP_NAME="olmo3_7b_rlzero_code"
 BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
 BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}"
 shift

@@ -30,9 +28,8 @@ python mason.py \
     --env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \
     --gpus 8 \
     --budget ai2/oe-adapt \
-    -- \
-    source configs/beaker_configs/ray_node_setup.sh \&\& \
-    python open_instruct/grpo_fast.py \
+    -- source configs/beaker_configs/ray_node_setup.sh \
+    \&\& uv run open_instruct/grpo_fast.py \
     --exp_name ${EXP_NAME} \
     --beta 0.0 \
     --async_steps 4 \

@@ -54,8 +51,7 @@ python open_instruct/grpo_fast.py \
     --response_length 16384 \
     --pack_length 18432 \
     --model_name_or_path ${MODEL_NAME_OR_PATH} \
-    --chat_template_name olmo_thinker \
-    --stop_strings "</answer>" \
+    --chat_template_name olmo_thinker_code_rlzero \
     --non_stop_penalty False \
     --temperature 1.0 \
     --total_episodes 10000000 \
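The rewritten script also drops the hand-tuned per-dataset sample counts (`6656`, `3328`, ...) in favor of a single public dataset at weight `1.0`. `DATASETS` and `LOCAL_EVALS` feed `--dataset_mixer_list` / `--dataset_mixer_eval_list` as flat, space-separated `<dataset> <weight>` pairs. A toy parser for the format (an illustration, not open-instruct's actual loader; reading floats as fractions and large integers as absolute sample counts is an assumption based on the old values):

```python
# Toy parser for the "<dataset> <weight> <dataset> <weight> ..." format.
def parse_mixer_list(spec: str) -> list[tuple[str, float]]:
    tokens = spec.split()
    if len(tokens) % 2 != 0:
        raise ValueError("expected alternating <dataset> <weight> pairs")
    return [(name, float(weight)) for name, weight in zip(tokens[::2], tokens[1::2])]

print(parse_mixer_list("allenai/Dolci-RLZero-Code-7B 1.0"))
# [('allenai/Dolci-RLZero-Code-7B', 1.0)]
```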

scripts/train/olmo3/7b_rlzero_instruction_following.sh

Lines changed: 7 additions & 11 deletions

@@ -1,16 +1,14 @@
 #!/bin/bash
 
-# OLMo 3 model
-MODEL_NAME_OR_PATH="/weka/oe-training-default/ai2-llm/checkpoints/tylerr/long-context/olmo25_7b_lc_64k_6T_M100B_round5-sparkle_6634-pre_s2pdf_gzip2080_cweN-yake-all-olmo_packing_yarn-fullonly_50B-fb13a737/step11921-hf"
+MODEL_NAME_OR_PATH="allenai/Olmo-3-1025-7B"
+DATASETS="allenai/Dolci-RLZero-IF-7B 1.0"
 
-DATASETS="saurabh5/IF_multi_constraints_upto5_filtered_olmo_completions_filtered 13314"
-
-LOCAL_EVALS="hamishivi/IF_multi_constraints_upto5_filtered 8"
+LOCAL_EVALS="allenai/Dolci-RLZero-IF-7B 8"
 LOCAL_EVAL_SPLITS="train"
 
 EVALS="ifeval::hamish_zs_reasoning_deepseek"
 
-EXP_NAME="grpo_if_from_zero"
+EXP_NAME="olmo3_7b_rlzero_if"
 BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
 BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}"
 shift

@@ -30,9 +28,8 @@ python mason.py \
     --env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \
     --gpus 8 \
     --budget ai2/oe-adapt \
-    -- \
-    source configs/beaker_configs/ray_node_setup.sh \&\& \
-    python open_instruct/grpo_fast.py \
+    -- source configs/beaker_configs/ray_node_setup.sh \
+    \&\& uv run open_instruct/grpo_fast.py \
     --exp_name ${EXP_NAME} \
     --beta 0.0 \
     --async_steps 4 \

@@ -54,8 +51,7 @@ python open_instruct/grpo_fast.py \
     --response_length 16384 \
     --pack_length 18432 \
     --model_name_or_path ${MODEL_NAME_OR_PATH} \
-    --chat_template_name olmo_thinker \
-    --stop_strings "</answer>" \
+    --chat_template_name olmo_thinker_rlzero \
     --non_stop_penalty False \
     --temperature 1.0 \
     --total_episodes 10000000 \
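Instruction-following RLZero relies on verifiable rewards: each prompt carries constraints that can be checked programmatically rather than scored by a learned reward model. A toy sketch of that idea (the checks below are invented for illustration, not taken from allenai/Dolci-RLZero-IF-7B):

```python
# Toy constraint checkers illustrating verifiable IF rewards.
def check_word_limit(response: str, max_words: int) -> bool:
    """Pass if the response stays within a word budget."""
    return len(response.split()) <= max_words

def check_num_bullets(response: str, n: int) -> bool:
    """Pass if the response has exactly n markdown bullet lines."""
    bullets = [line for line in response.splitlines() if line.lstrip().startswith("- ")]
    return len(bullets) == n

response = "- use uv\n- pin the image\n- set a seed"
reward = float(check_word_limit(response, 50) and check_num_bullets(response, 3))
print(reward)  # 1.0
```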

scripts/train/olmo3/7b_rlzero_math.sh

Lines changed: 9 additions & 15 deletions

@@ -1,25 +1,21 @@
 #!/bin/bash
 
-# OLMo 3 model
-MODEL_NAME_OR_PATH="/weka/oe-adapt-default/michaeln/checkpoints/olmo3-7b-base"
-GS_MODEL_NAME="olmo3_7b_base"
+MODEL_NAME_OR_PATH="allenai/Olmo-3-1025-7B"
+DATASETS="allenai/Dolci-RLZero-Math-7B 1.0"
 
-DATASETS="saurabh5/DAPO-Math-17k-Processed_filtered_olmo_completions_new_template_filtered 1.0 saurabh5/MATH_3000_Filtered_olmo_completions_new_template_filtered 1.0"
-
-# AIME 2024, 2025 local evals
-LOCAL_EVALS="mnoukhov/aime2024-25-rlvr 1.0 mnoukhov/aime2024-25-rlvr 1.0"
+# AIME 2024, 2025 local single-sample evals
+# Full, bootstrapped pass@32 evals must be run separately
+LOCAL_EVALS="allenai/aime2024-25-rlvr 1.0 allenai/aime2024-25-rlvr 1.0"
 LOCAL_EVAL_SPLITS="test_2024 test_2024 test_2025 test_2025"
 
-# math evals
 EVALS="aime:zs_cot_r1::pass_at_32_2024_dapo,aime:zs_cot_r1::pass_at_32_2025_dapo"
 
-EXP_NAME="grpo_17kfilter_${GS_MODEL_NAME}"
+EXP_NAME="olmo3_7b_rlzero_math"
 BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
 BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}"
 shift
 
 cluster=ai2/augusta
-
 python mason.py \
     --task_name ${EXP_NAME} \
     --cluster ${cluster} \

@@ -31,12 +27,10 @@ python mason.py \
     --num_nodes 8 \
     --env VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
     --env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \
-    --gs_model_name $GS_MODEL_NAME \
     --gpus 8 \
     --budget ai2/oe-adapt \
-    -- \
-    source configs/beaker_configs/ray_node_setup.sh \&\& \
-    python open_instruct/grpo_fast.py \
+    -- source configs/beaker_configs/ray_node_setup.sh \
+    \&\& uv run open_instruct/grpo_fast.py \
     --exp_name ${EXP_NAME} \
     --beta 0.0 \
     --async_steps 4 \

@@ -59,7 +53,7 @@ python open_instruct/grpo_fast.py \
     --response_length 12000 \
     --pack_length 32768 \
     --model_name_or_path ${MODEL_NAME_OR_PATH} \
-    --chat_template_name olmo_thinker_dapo \
+    --chat_template_name olmo_thinker_rlzero \
     --non_stop_penalty False \
     --temperature 1.0 \
     --total_episodes 512000 \
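The new comment notes that the local single-sample evals only approximate the reported metric; the `pass_at_32` oe-eval tasks are the real target. For context, the standard unbiased pass@k estimator (Chen et al., 2021), which the separate bootstrapped evals are presumably built on; this is not code from this repo:

```python
# Unbiased pass@k: probability that at least one of k samples drawn
# (without replacement) from n generations is correct, given c correct.
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    if n - c < k:  # every size-k draw must contain a correct sample
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

print(round(pass_at_k(n=64, c=4, k=32), 4))  # 0.9434
```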
Lines changed: 84 additions & 0 deletions

@@ -0,0 +1,84 @@
+#!/bin/bash
+
+MODEL_NAME_OR_PATH="allenai/Olmo-3-1025-7B"
+DATASETS="allenai/Dolci-RLZero-Code-7B 1.0 allenai/Dolci-RLZero-IF-7B 1.0 allenai/Dolci-RLZero-Code-7B 1.0 allenai/Dolci-RLZero-General-7B 1.0"
+
+LOCAL_EVALS="allenai/Dolci-RLZero-Code-7B 8 allenai/Dolci-RLZero-IF-7B 8 allenai/Dolci-RLZero-Code-7B 8 allenai/Dolci-RLZero-General-7B 8"
+LOCAL_EVAL_SPLITS="train train train train train train train train"
+
+EVALS="alpaca_eval_v3::hamish_zs_reasoning_deepseek,agi_eval_english:0shot_cot::hamish_zs_reasoning_deepseek,gpqa:0shot_cot::hamish_zs_reasoning_deepseek"
+
+EXP_NAME="olmo3_7b_rlzero_mix"
+BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
+BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}"
+shift
+
+cluster=ai2/augusta
+
+python mason.py \
+    --task_name ${EXP_NAME} \
+    --cluster ${cluster} \
+    --workspace ai2/olmo-instruct \
+    --priority high \
+    --pure_docker_mode \
+    --image ${BEAKER_IMAGE} \
+    --preemptible \
+    --num_nodes 4 \
+    --env VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
+    --env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \
+    --gpus 8 \
+    --budget ai2/oe-adapt \
+    -- source configs/beaker_configs/ray_node_setup.sh \
+    \&\& uv run open_instruct/grpo_fast.py \
+    --exp_name ${EXP_NAME} \
+    --beta 0.0 \
+    --async_steps 4 \
+    --inflight_updates \
+    --truncated_importance_sampling_ratio_cap 2.0 \
+    --num_samples_per_prompt_rollout 8 \
+    --num_unique_prompts_rollout 32 \
+    --num_mini_batches 1 \
+    --num_epochs 1 \
+    --learning_rate 1e-6 \
+    --per_device_train_batch_size 1 \
+    --kl_estimator kl3 \
+    --dataset_mixer_list $DATASETS \
+    --dataset_mixer_list_splits train \
+    --dataset_mixer_eval_list $LOCAL_EVALS \
+    --dataset_mixer_eval_list_splits $LOCAL_EVAL_SPLITS \
+    --max_token_length 10240 \
+    --max_prompt_token_length 2048 \
+    --response_length 16384 \
+    --pack_length 18432 \
+    --model_name_or_path ${MODEL_NAME_OR_PATH} \
+    --chat_template_name olmo_thinker_rlzero \
+    --non_stop_penalty False \
+    --temperature 1.0 \
+    --total_episodes 10000000 \
+    --deepspeed_stage 3 \
+    --num_learners_per_node 8 \
+    --vllm_num_engines 32 \
+    --vllm_tensor_parallel_size 1 \
+    --llm_judge_model hosted_vllm/Qwen/Qwen3-32B \
+    --llm_judge_timeout 600 \
+    --llm_judge_max_tokens 2048 \
+    --llm_judge_max_context_length 32768 \
+    --lr_scheduler_type constant \
+    --apply_verifiable_reward true \
+    --seed 1 \
+    --local_eval_every 50 \
+    --save_freq 50 \
+    --checkpoint_state_freq 50 \
+    --gradient_checkpointing \
+    --with_tracking \
+    --vllm_enable_prefix_caching \
+    --clip_higher 0.272 \
+    --keep_last_n_checkpoints -1 \
+    --mask_truncated_completions True \
+    --oe_eval_max_length 16384 \
+    --code_api_url https://p9f1719l7f.execute-api.us-west-2.amazonaws.com/prod/test_program \
+    --try_launch_beaker_eval_jobs_on_weka True \
+    --oe_eval_tasks $EVALS \
+    --eval_on_step_0 True \
+    --oe_eval_beaker_image oe-eval-beaker/oe_eval_olmo2_retrofit_auto \
+    --output_dir /output/olmo3-7b-rlzero-general/checkpoints $@
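With `--stop_strings "</answer>"` removed across these scripts, nothing stops generation at an answer tag anymore; instead the rlzero templates ask for a plain `Answer: $Answer` final line, which a rule-based grader can recover with a regex. A sketch under that assumption (not open-instruct's actual verifier):

```python
# Sketch of extracting the "Answer: ..." line the rlzero prompt asks for.
import re

def extract_answer(response: str) -> str | None:
    """Return the payload of the last 'Answer: ...' line, if any."""
    matches = re.findall(r"^Answer:\s*(.+?)\s*$", response, flags=re.MULTILINE)
    return matches[-1] if matches else None

print(extract_answer("17 * 24 = 408\nAnswer: 408"))  # 408
print(extract_answer("no final answer line"))        # None
```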
