
Commit 70b0472

rlzero template and fixes to rlzero scripts (#1216)

Authored by mnoukhov, finbarrtimbers, hamishivi, saurabh111233212, and VictoriaGraf

* Added olmo3 scripts
* Added scripts
* Updated scripts with better defaults
* Removed 32B DPO smoke test
* Fixed table formatting.
* Added RLZero scripts.
* scripts hamish
* add beaker
* add beaker
* add beaker
* add beaker links
* RL0 beaker links
* add beaker links
* delete a couple of RL Zero rows
* Added launching instructions
* update multiturn subset names
* thinker rl-zero template
* fix scripts
* remove answer stop string
* fix and remove gs_model_name
* fixes for gemini, cursor, finbarr
* fixed

---------

Co-authored-by: Finbarr Timbers <[email protected]>
Co-authored-by: Hamish Ivison <[email protected]>
Co-authored-by: Saurabh Shah <[email protected]>
Co-authored-by: VictoriaGraf <[email protected]>

1 parent 7e780f2 commit 70b0472

File tree

5 files changed: +124 −74 lines changed

open_instruct/dataset_transformation.py

Lines changed: 17 additions & 37 deletions

@@ -523,47 +523,27 @@ def visualize_token_role(tokens: list[int], masks: list[int], tokenizer: PreTrai
             "{% endif %}"
             "{% endfor %}"
         ),
-        "olmo_thinker_r1_style": (
-            "A conversation between user and assistant. "
-            "The user asks a question, and the assistant solves it. "
-            "The assistant first thinks and reasons about the question "
-            "and after thinking provides the user with the answer. "
-            "The reasoning process is enclosed in <think> </think> tags "
-            "and the answer is enclosed in <answer> </answer> tags "
-            "so the full response is <think> reasoning process here </think> "
-            "<answer> answer here </answer>."
+        "olmo_thinker_rlzero": (
+            "Solve the following problem step by step. "
+            "The last line of your response should be the answer to the problem in form Answer: $Answer (without quotes) where $Answer is the answer to the problem."
             "\n\n"
             "{% for message in messages %}"
-            "{% if message['role'] == 'system' %}"
-            "{% if message.get('functions', none) is not none %}"
-            "{{ '<|im_start|>system\n' + message['content'] + '\n' + '<functions>' + message['functions'] + '</functions><|im_end|>\n' }}"
-            "{% else %}"
-            "{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>\n' }}"
-            "{% endif %}"
-            "{% elif message['role'] == 'user' %}"
-            "{% if message.get('functions', none) is not none %}"
-            "{{ '<|im_start|>user\n' + message['content'] + '\n' + '<functions>' + message['functions'] + '</functions><|im_end|>\n' }}"
-            "{% else %}"
-            "{{ '<|im_start|>user\n' + message['content'] + '<|im_end|>\n' }}"
-            "{% endif %}"
-            "{% elif message['role'] == 'assistant' %}"
-            "{{ '<|im_start|>assistant\n' }}"
-            "{% if message.get('content', none) is not none %}"
-            "{{ message['content'] }}"
-            "{% endif %}"
-            "{% if message.get('function_calls', none) is not none %}"
-            "{{ '<function_calls>' + message['function_calls'] + '</function_calls>' }}"
-            "{% endif %}"
-            "{% if not loop.last %}"
-            "{{ '<|im_end|>' + '\n' }}"
-            "{% else %}"
-            "{{ eos_token }}"
-            "{% endif %}"
-            "{% elif message['role'] == 'environment' %}"
-            "{{ '<|im_start|>environment\n' + message['content'] + '<|im_end|>\n' }}"
+            "{{ '\n\n' if not loop.first else '' }}"
+            "{{ message['content'] + '\n' }}"
+            "{% if loop.last and add_generation_prompt %}"
+            "{{ '\nRemember to put your answer on its own line after \"Answer:\"' }}"
             "{% endif %}"
+            "{% endfor %}"
+        ),
+        "olmo_thinker_code_rlzero": (
+            "Solve the following code problem step by step. "
+            f"The last part of your response should be the solution to the problem in form ```\npython\nCODE\n``` where CODE is the solution for the problem."
+            "\n\n"
+            "{% for message in messages %}"
+            "{{ '\n\n' if not loop.first else '' }}"
+            "{{ message['content'] + '\n' }}"
             "{% if loop.last and add_generation_prompt %}"
-            "{{ '<|im_start|>assistant\n<think>' }}"
+            f"\nRemember to put your solution inside the ```\npython\nCODE\n``` tags"
             "{% endif %}"
             "{% endfor %}"
         ),
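Unlike the old `olmo_thinker_r1_style` template, the RLZero templates emit no `<|im_start|>` role tags or `<think>` scaffolding at all: messages are concatenated as plain text, with a format reminder appended at generation time. A minimal sketch of the rendering, using only stock `jinja2` (the template string is copied from the diff above; the question is invented):

```python
# Sketch, not repo code: render the new "olmo_thinker_rlzero" template.
# Requires: pip install jinja2
from jinja2 import Template

OLMO_THINKER_RLZERO = (
    "Solve the following problem step by step. "
    "The last line of your response should be the answer to the problem in form "
    "Answer: $Answer (without quotes) where $Answer is the answer to the problem."
    "\n\n"
    "{% for message in messages %}"
    "{{ '\n\n' if not loop.first else '' }}"
    "{{ message['content'] + '\n' }}"
    "{% if loop.last and add_generation_prompt %}"
    "{{ '\nRemember to put your answer on its own line after \"Answer:\"' }}"
    "{% endif %}"
    "{% endfor %}"
)

prompt = Template(OLMO_THINKER_RLZERO).render(
    messages=[{"role": "user", "content": "What is 17 * 24?"}],
    add_generation_prompt=True,
)
print(prompt)
# Solve the following problem step by step. [...] in form Answer: $Answer [...]
#
# What is 17 * 24?
#
# Remember to put your answer on its own line after "Answer:"
```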

scripts/train/olmo3/7b_rlzero_code.sh

Lines changed: 7 additions & 11 deletions

@@ -1,16 +1,14 @@
 #!/bin/bash
 
-# OLMo 3 model
-MODEL_NAME_OR_PATH="/weka/oe-training-default/ai2-llm/checkpoints/tylerr/long-context/olmo25_7b_lc_64k_6T_M100B_round5-sparkle_6634-pre_s2pdf_gzip2080_cweN-yake-all-olmo_packing_yarn-fullonly_50B-fb13a737/step11921-hf"
+MODEL_NAME_OR_PATH="allenai/Olmo-3-1025-7B"
+DATASETS="allenai/Dolci-RLZero-Code-7B 1.0"
 
-DATASETS="saurabh5/rlvr_acecoder_filtered_filtered_olmo_completions_filtered 6656 hamishivi/synthetic2-rlvr-code-compressed_filtered 3328 hamishivi/klear-code-rlvr_filtered 3328"
-
-LOCAL_EVALS="hamishivi/rlvr_acecoder_filtered_filtered 4 hamishivi/klear-code-rlvr_filtered 4"
+LOCAL_EVALS="allenai/Dolci-RLZero-Code-7B 8"
 LOCAL_EVAL_SPLITS="train"
 
 EVALS="codex_humanevalplus:0-shot-chat::tulu-thinker_deepseek,mbppplus:0-shot-chat::tulu-thinker_deepseek,livecodebench_codegeneration::tulu-thinker_deepseek_lite"
 
-EXP_NAME="grpo_code_from_zero"
+EXP_NAME="olmo3_7b_rlzero_code"
 BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
 BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}"
 shift

@@ -30,9 +28,8 @@ python mason.py \
     --env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \
     --gpus 8 \
     --budget ai2/oe-adapt \
-    -- \
-    source configs/beaker_configs/ray_node_setup.sh \&\& \
-    python open_instruct/grpo_fast.py \
+    -- source configs/beaker_configs/ray_node_setup.sh \
+    \&\& uv run open_instruct/grpo_fast.py \
     --exp_name ${EXP_NAME} \
     --beta 0.0 \
     --async_steps 4 \

@@ -54,8 +51,7 @@ python open_instruct/grpo_fast.py \
     --response_length 16384 \
     --pack_length 18432 \
     --model_name_or_path ${MODEL_NAME_OR_PATH} \
-    --chat_template_name olmo_thinker \
-    --stop_strings "</answer>" \
+    --chat_template_name olmo_thinker_code_rlzero \
     --non_stop_penalty False \
     --temperature 1.0 \
     --total_episodes 10000000 \
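The rewritten script also drops the hand-tuned per-dataset sample counts (`6656`, `3328`, ...) in favor of a single public dataset at weight `1.0`. `DATASETS` and `LOCAL_EVALS` feed `--dataset_mixer_list` / `--dataset_mixer_eval_list` as flat, space-separated `<dataset> <weight>` pairs. A toy parser for the format (an illustration, not open-instruct's actual loader; reading floats as fractions and large integers as absolute sample counts is an assumption based on the old values):

```python
# Toy parser for the "<dataset> <weight> <dataset> <weight> ..." format.
def parse_mixer_list(spec: str) -> list[tuple[str, float]]:
    tokens = spec.split()
    if len(tokens) % 2 != 0:
        raise ValueError("expected alternating <dataset> <weight> pairs")
    return [(name, float(weight)) for name, weight in zip(tokens[::2], tokens[1::2])]

print(parse_mixer_list("allenai/Dolci-RLZero-Code-7B 1.0"))
# [('allenai/Dolci-RLZero-Code-7B', 1.0)]
```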

scripts/train/olmo3/7b_rlzero_instruction_following.sh

Lines changed: 7 additions & 11 deletions

@@ -1,16 +1,14 @@
 #!/bin/bash
 
-# OLMo 3 model
-MODEL_NAME_OR_PATH="/weka/oe-training-default/ai2-llm/checkpoints/tylerr/long-context/olmo25_7b_lc_64k_6T_M100B_round5-sparkle_6634-pre_s2pdf_gzip2080_cweN-yake-all-olmo_packing_yarn-fullonly_50B-fb13a737/step11921-hf"
+MODEL_NAME_OR_PATH="allenai/Olmo-3-1025-7B"
+DATASETS="allenai/Dolci-RLZero-IF-7B 1.0"
 
-DATASETS="saurabh5/IF_multi_constraints_upto5_filtered_olmo_completions_filtered 13314"
-
-LOCAL_EVALS="hamishivi/IF_multi_constraints_upto5_filtered 8"
+LOCAL_EVALS="allenai/Dolci-RLZero-IF-7B 8"
 LOCAL_EVAL_SPLITS="train"
 
 EVALS="ifeval::hamish_zs_reasoning_deepseek"
 
-EXP_NAME="grpo_if_from_zero"
+EXP_NAME="olmo3_7b_rlzero_if"
 BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
 BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}"
 shift

@@ -30,9 +28,8 @@ python mason.py \
     --env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \
     --gpus 8 \
     --budget ai2/oe-adapt \
-    -- \
-    source configs/beaker_configs/ray_node_setup.sh \&\& \
-    python open_instruct/grpo_fast.py \
+    -- source configs/beaker_configs/ray_node_setup.sh \
+    \&\& uv run open_instruct/grpo_fast.py \
     --exp_name ${EXP_NAME} \
     --beta 0.0 \
     --async_steps 4 \

@@ -54,8 +51,7 @@ python open_instruct/grpo_fast.py \
     --response_length 16384 \
     --pack_length 18432 \
     --model_name_or_path ${MODEL_NAME_OR_PATH} \
-    --chat_template_name olmo_thinker \
-    --stop_strings "</answer>" \
+    --chat_template_name olmo_thinker_rlzero \
     --non_stop_penalty False \
     --temperature 1.0 \
     --total_episodes 10000000 \
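Instruction-following RLZero relies on verifiable rewards: each prompt carries constraints that can be checked programmatically rather than scored by a learned reward model. A toy sketch of that idea (the checks below are invented for illustration, not taken from allenai/Dolci-RLZero-IF-7B):

```python
# Toy constraint checkers illustrating verifiable IF rewards.
def check_word_limit(response: str, max_words: int) -> bool:
    """Pass if the response stays within a word budget."""
    return len(response.split()) <= max_words

def check_num_bullets(response: str, n: int) -> bool:
    """Pass if the response has exactly n markdown bullet lines."""
    bullets = [line for line in response.splitlines() if line.lstrip().startswith("- ")]
    return len(bullets) == n

response = "- use uv\n- pin the image\n- set a seed"
reward = float(check_word_limit(response, 50) and check_num_bullets(response, 3))
print(reward)  # 1.0
```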

scripts/train/olmo3/7b_rlzero_math.sh

Lines changed: 9 additions & 15 deletions

@@ -1,25 +1,21 @@
 #!/bin/bash
 
-# OLMo 3 model
-MODEL_NAME_OR_PATH="/weka/oe-adapt-default/michaeln/checkpoints/olmo3-7b-base"
-GS_MODEL_NAME="olmo3_7b_base"
+MODEL_NAME_OR_PATH="allenai/Olmo-3-1025-7B"
+DATASETS="allenai/Dolci-RLZero-Math-7B 1.0"
 
-DATASETS="saurabh5/DAPO-Math-17k-Processed_filtered_olmo_completions_new_template_filtered 1.0 saurabh5/MATH_3000_Filtered_olmo_completions_new_template_filtered 1.0"
-
-# AIME 2024, 2025 local evals
-LOCAL_EVALS="mnoukhov/aime2024-25-rlvr 1.0 mnoukhov/aime2024-25-rlvr 1.0"
+# AIME 2024, 2025 local single-sample evals
+# Full, bootstrapped pass@32 evals must be run separately
+LOCAL_EVALS="allenai/aime2024-25-rlvr 1.0 allenai/aime2024-25-rlvr 1.0"
 LOCAL_EVAL_SPLITS="test_2024 test_2024 test_2025 test_2025"
 
-# math evals
 EVALS="aime:zs_cot_r1::pass_at_32_2024_dapo,aime:zs_cot_r1::pass_at_32_2025_dapo"
 
-EXP_NAME="grpo_17kfilter_${GS_MODEL_NAME}"
+EXP_NAME="olmo3_7b_rlzero_math"
 BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
 BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}"
 shift
 
 cluster=ai2/augusta
-
 python mason.py \
     --task_name ${EXP_NAME} \
     --cluster ${cluster} \

@@ -31,12 +27,10 @@ python mason.py \
     --num_nodes 8 \
     --env VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
     --env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \
-    --gs_model_name $GS_MODEL_NAME \
     --gpus 8 \
     --budget ai2/oe-adapt \
-    -- \
-    source configs/beaker_configs/ray_node_setup.sh \&\& \
-    python open_instruct/grpo_fast.py \
+    -- source configs/beaker_configs/ray_node_setup.sh \
+    \&\& uv run open_instruct/grpo_fast.py \
     --exp_name ${EXP_NAME} \
     --beta 0.0 \
     --async_steps 4 \

@@ -59,7 +53,7 @@ python open_instruct/grpo_fast.py \
     --response_length 12000 \
     --pack_length 32768 \
     --model_name_or_path ${MODEL_NAME_OR_PATH} \
-    --chat_template_name olmo_thinker_dapo \
+    --chat_template_name olmo_thinker_rlzero \
     --non_stop_penalty False \
     --temperature 1.0 \
     --total_episodes 512000 \
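The new comment notes that the local single-sample evals only approximate the reported metric; the `pass_at_32` oe-eval tasks are the real target. For context, the standard unbiased pass@k estimator (Chen et al., 2021), which the separate bootstrapped evals are presumably built on; this is not code from this repo:

```python
# Unbiased pass@k: probability that at least one of k samples drawn
# (without replacement) from n generations is correct, given c correct.
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    if n - c < k:  # every size-k draw must contain a correct sample
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

print(round(pass_at_k(n=64, c=4, k=32), 4))  # 0.9434
```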
Lines changed: 84 additions & 0 deletions

@@ -0,0 +1,84 @@
+#!/bin/bash
+
+MODEL_NAME_OR_PATH="allenai/Olmo-3-1025-7B"
+DATASETS="allenai/Dolci-RLZero-Code-7B 1.0 allenai/Dolci-RLZero-IF-7B 1.0 allenai/Dolci-RLZero-Code-7B 1.0 allenai/Dolci-RLZero-General-7B 1.0"
+
+LOCAL_EVALS="allenai/Dolci-RLZero-Code-7B 8 allenai/Dolci-RLZero-IF-7B 8 allenai/Dolci-RLZero-Code-7B 8 allenai/Dolci-RLZero-General-7B 8"
+LOCAL_EVAL_SPLITS="train train train train train train train train"
+
+EVALS="alpaca_eval_v3::hamish_zs_reasoning_deepseek,agi_eval_english:0shot_cot::hamish_zs_reasoning_deepseek,gpqa:0shot_cot::hamish_zs_reasoning_deepseek"
+
+EXP_NAME="olmo3_7b_rlzero_mix"
+BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
+BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}"
+shift
+
+cluster=ai2/augusta
+
+python mason.py \
+    --task_name ${EXP_NAME} \
+    --cluster ${cluster} \
+    --workspace ai2/olmo-instruct \
+    --priority high \
+    --pure_docker_mode \
+    --image ${BEAKER_IMAGE} \
+    --preemptible \
+    --num_nodes 4 \
+    --env VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
+    --env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \
+    --gpus 8 \
+    --budget ai2/oe-adapt \
+    -- source configs/beaker_configs/ray_node_setup.sh \
+    \&\& uv run open_instruct/grpo_fast.py \
+    --exp_name ${EXP_NAME} \
+    --beta 0.0 \
+    --async_steps 4 \
+    --inflight_updates \
+    --truncated_importance_sampling_ratio_cap 2.0 \
+    --num_samples_per_prompt_rollout 8 \
+    --num_unique_prompts_rollout 32 \
+    --num_mini_batches 1 \
+    --num_epochs 1 \
+    --learning_rate 1e-6 \
+    --per_device_train_batch_size 1 \
+    --kl_estimator kl3 \
+    --dataset_mixer_list $DATASETS \
+    --dataset_mixer_list_splits train \
+    --dataset_mixer_eval_list $LOCAL_EVALS \
+    --dataset_mixer_eval_list_splits $LOCAL_EVAL_SPLITS \
+    --max_token_length 10240 \
+    --max_prompt_token_length 2048 \
+    --response_length 16384 \
+    --pack_length 18432 \
+    --model_name_or_path ${MODEL_NAME_OR_PATH} \
+    --chat_template_name olmo_thinker_rlzero \
+    --non_stop_penalty False \
+    --temperature 1.0 \
+    --total_episodes 10000000 \
+    --deepspeed_stage 3 \
+    --num_learners_per_node 8 \
+    --vllm_num_engines 32 \
+    --vllm_tensor_parallel_size 1 \
+    --llm_judge_model hosted_vllm/Qwen/Qwen3-32B \
+    --llm_judge_timeout 600 \
+    --llm_judge_max_tokens 2048 \
+    --llm_judge_max_context_length 32768 \
+    --lr_scheduler_type constant \
+    --apply_verifiable_reward true \
+    --seed 1 \
+    --local_eval_every 50 \
+    --save_freq 50 \
+    --checkpoint_state_freq 50 \
+    --gradient_checkpointing \
+    --with_tracking \
+    --vllm_enable_prefix_caching \
+    --clip_higher 0.272 \
+    --keep_last_n_checkpoints -1 \
+    --mask_truncated_completions True \
+    --oe_eval_max_length 16384 \
+    --code_api_url https://p9f1719l7f.execute-api.us-west-2.amazonaws.com/prod/test_program \
+    --try_launch_beaker_eval_jobs_on_weka True \
+    --oe_eval_tasks $EVALS \
+    --eval_on_step_0 True \
+    --oe_eval_beaker_image oe-eval-beaker/oe_eval_olmo2_retrofit_auto \
+    --output_dir /output/olmo3-7b-rlzero-general/checkpoints $@
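With `--stop_strings "</answer>"` removed across these scripts, nothing stops generation at an answer tag anymore; instead the rlzero templates ask for a plain `Answer: $Answer` final line, which a rule-based grader can recover with a regex. A sketch under that assumption (not open-instruct's actual verifier):

```python
# Sketch of extracting the "Answer: ..." line the rlzero prompt asks for.
import re

def extract_answer(response: str) -> str | None:
    """Return the payload of the last 'Answer: ...' line, if any."""
    matches = re.findall(r"^Answer:\s*(.+?)\s*$", response, flags=re.MULTILINE)
    return matches[-1] if matches else None

print(extract_answer("17 * 24 = 408\nAnswer: 408"))  # 408
print(extract_answer("no final answer line"))        # None
```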
