diff --git a/open_instruct/dataset_transformation.py b/open_instruct/dataset_transformation.py
index 0a94a7669..1f4fd416b 100644
--- a/open_instruct/dataset_transformation.py
+++ b/open_instruct/dataset_transformation.py
@@ -523,47 +523,27 @@ def visualize_token_role(tokens: list[int], masks: list[int], tokenizer: PreTrai
"{% endif %}"
"{% endfor %}"
),
- "olmo_thinker_r1_style": (
- "A conversation between user and assistant. "
- "The user asks a question, and the assistant solves it. "
- "The assistant first thinks and reasons about the question "
- "and after thinking provides the user with the answer. "
- "The reasoning process is enclosed in tags "
- "and the answer is enclosed in tags "
- "so the full response is reasoning process here "
- " answer here ."
+ "olmo_thinker_rlzero": (
+ "Solve the following problem step by step. "
+ "The last line of your response should be the answer to the problem in form Answer: $Answer (without quotes) where $Answer is the answer to the problem."
"\n\n"
"{% for message in messages %}"
- "{% if message['role'] == 'system' %}"
- "{% if message.get('functions', none) is not none %}"
- "{{ '<|im_start|>system\n' + message['content'] + '\n' + '' + message['functions'] + '<|im_end|>\n' }}"
- "{% else %}"
- "{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>\n' }}"
- "{% endif %}"
- "{% elif message['role'] == 'user' %}"
- "{% if message.get('functions', none) is not none %}"
- "{{ '<|im_start|>user\n' + message['content'] + '\n' + '' + message['functions'] + '<|im_end|>\n' }}"
- "{% else %}"
- "{{ '<|im_start|>user\n' + message['content'] + '<|im_end|>\n' }}"
- "{% endif %}"
- "{% elif message['role'] == 'assistant' %}"
- "{{ '<|im_start|>assistant\n' }}"
- "{% if message.get('content', none) is not none %}"
- "{{ message['content'] }}"
- "{% endif %}"
- "{% if message.get('function_calls', none) is not none %}"
- "{{ '' + message['function_calls'] + '' }}"
- "{% endif %}"
- "{% if not loop.last %}"
- "{{ '<|im_end|>' + '\n' }}"
- "{% else %}"
- "{{ eos_token }}"
- "{% endif %}"
- "{% elif message['role'] == 'environment' %}"
- "{{ '<|im_start|>environment\n' + message['content'] + '<|im_end|>\n' }}"
+ "{{ '\n\n' if not loop.first else '' }}"
+ "{{ message['content'] + '\n' }}"
+ "{% if loop.last and add_generation_prompt %}"
+ "{{ '\nRemember to put your answer on its own line after \"Answer:\"' }}"
"{% endif %}"
+ "{% endfor %}"
+ ),
+ "olmo_thinker_code_rlzero": (
+ "Solve the following code problem step by step. "
+ f"The last part of your response should be the solution to the problem in form ```\npython\nCODE\n``` where CODE is the solution for the problem."
+ "\n\n"
+ "{% for message in messages %}"
+ "{{ '\n\n' if not loop.first else '' }}"
+ "{{ message['content'] + '\n' }}"
"{% if loop.last and add_generation_prompt %}"
- "{{ '<|im_start|>assistant\n' }}"
+ f"\nRemember to put your solution inside the ```\npython\nCODE\n``` tags"
"{% endif %}"
"{% endfor %}"
),
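A minimal sketch (assuming only `jinja2`) of what the new `olmo_thinker_rlzero` template renders for a single user turn; the trainer feeds the same string through the tokenizer machinery, so this is illustrative rather than the exact training path:

```python
# Render the olmo_thinker_rlzero template standalone to preview the prompt.
# The template string is copied from the diff above.
from jinja2 import Template

TEMPLATE = (
    "Solve the following problem step by step. "
    "The last line of your response should be of the form Answer: $Answer "
    "(without quotes) where $Answer is the answer to the problem."
    "\n\n"
    "{% for message in messages %}"
    "{{ '\n\n' if not loop.first else '' }}"
    "{{ message['content'] + '\n' }}"
    "{% if loop.last and add_generation_prompt %}"
    "{{ '\nRemember to put your answer on its own line after \"Answer:\"' }}"
    "{% endif %}"
    "{% endfor %}"
)

messages = [{"role": "user", "content": "What is 17 * 23?"}]
print(Template(TEMPLATE).render(messages=messages, add_generation_prompt=True))
```

Unlike the removed `olmo_thinker_r1_style` template, there are no `<|im_start|>` role markers and no `<think>`/`<answer>` tags: the RL-zero base model sees a plain instruction-plus-question prompt.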
diff --git a/scripts/train/olmo3/7b_rlzero_code.sh b/scripts/train/olmo3/7b_rlzero_code.sh
index 620acf57c..6ea80d421 100644
--- a/scripts/train/olmo3/7b_rlzero_code.sh
+++ b/scripts/train/olmo3/7b_rlzero_code.sh
@@ -1,16 +1,14 @@
#!/bin/bash
-# OLMo 3 model
-MODEL_NAME_OR_PATH="/weka/oe-training-default/ai2-llm/checkpoints/tylerr/long-context/olmo25_7b_lc_64k_6T_M100B_round5-sparkle_6634-pre_s2pdf_gzip2080_cweN-yake-all-olmo_packing_yarn-fullonly_50B-fb13a737/step11921-hf"
+MODEL_NAME_OR_PATH="allenai/Olmo-3-1025-7B"
+DATASETS="allenai/Dolci-RLZero-Code-7B 1.0"
-DATASETS="saurabh5/rlvr_acecoder_filtered_filtered_olmo_completions_filtered 6656 hamishivi/synthetic2-rlvr-code-compressed_filtered 3328 hamishivi/klear-code-rlvr_filtered 3328"
-
-LOCAL_EVALS="hamishivi/rlvr_acecoder_filtered_filtered 4 hamishivi/klear-code-rlvr_filtered 4"
+LOCAL_EVALS="allenai/Dolci-RLZero-Code-7B 8"
LOCAL_EVAL_SPLITS="train"
EVALS="codex_humanevalplus:0-shot-chat::tulu-thinker_deepseek,mbppplus:0-shot-chat::tulu-thinker_deepseek,livecodebench_codegeneration::tulu-thinker_deepseek_lite"
-EXP_NAME="grpo_code_from_zero"
+EXP_NAME="olmo3_7b_rlzero_code"
BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}"
shift
@@ -30,9 +28,8 @@ python mason.py \
--env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \
--gpus 8 \
--budget ai2/oe-adapt \
- -- \
-source configs/beaker_configs/ray_node_setup.sh \&\& \
-python open_instruct/grpo_fast.py \
+ -- source configs/beaker_configs/ray_node_setup.sh \
+\&\& uv run open_instruct/grpo_fast.py \
--exp_name ${EXP_NAME} \
--beta 0.0 \
--async_steps 4 \
@@ -54,8 +51,7 @@ python open_instruct/grpo_fast.py \
--response_length 16384 \
--pack_length 18432 \
--model_name_or_path ${MODEL_NAME_OR_PATH} \
- --chat_template_name olmo_thinker \
- --stop_strings "</answer>" \
+ --chat_template_name olmo_thinker_code_rlzero \
--non_stop_penalty False \
--temperature 1.0 \
--total_episodes 10000000 \
diff --git a/scripts/train/olmo3/7b_rlzero_instruction_following.sh b/scripts/train/olmo3/7b_rlzero_instruction_following.sh
index ea0321285..5b5165100 100644
--- a/scripts/train/olmo3/7b_rlzero_instruction_following.sh
+++ b/scripts/train/olmo3/7b_rlzero_instruction_following.sh
@@ -1,16 +1,14 @@
#!/bin/bash
-# OLMo 3 model
-MODEL_NAME_OR_PATH="/weka/oe-training-default/ai2-llm/checkpoints/tylerr/long-context/olmo25_7b_lc_64k_6T_M100B_round5-sparkle_6634-pre_s2pdf_gzip2080_cweN-yake-all-olmo_packing_yarn-fullonly_50B-fb13a737/step11921-hf"
+MODEL_NAME_OR_PATH="allenai/Olmo-3-1025-7B"
+DATASETS="allenai/Dolci-RLZero-IF-7B 1.0"
-DATASETS="saurabh5/IF_multi_constraints_upto5_filtered_olmo_completions_filtered 13314"
-
-LOCAL_EVALS="hamishivi/IF_multi_constraints_upto5_filtered 8"
+LOCAL_EVALS="allenai/Dolci-RLZero-IF-7B 8"
LOCAL_EVAL_SPLITS="train"
EVALS="ifeval::hamish_zs_reasoning_deepseek"
-EXP_NAME="grpo_if_from_zero"
+EXP_NAME="olmo3_7b_rlzero_if"
BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}"
shift
@@ -30,9 +28,8 @@ python mason.py \
--env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \
--gpus 8 \
--budget ai2/oe-adapt \
- -- \
-source configs/beaker_configs/ray_node_setup.sh \&\& \
-python open_instruct/grpo_fast.py \
+ -- source configs/beaker_configs/ray_node_setup.sh \
+\&\& uv run open_instruct/grpo_fast.py \
--exp_name ${EXP_NAME} \
--beta 0.0 \
--async_steps 4 \
@@ -54,8 +51,7 @@ python open_instruct/grpo_fast.py \
--response_length 16384 \
--pack_length 18432 \
--model_name_or_path ${MODEL_NAME_OR_PATH} \
- --chat_template_name olmo_thinker \
- --stop_strings "</answer>" \
+ --chat_template_name olmo_thinker_rlzero \
--non_stop_penalty False \
--temperature 1.0 \
--total_episodes 10000000 \
diff --git a/scripts/train/olmo3/7b_rlzero_math.sh b/scripts/train/olmo3/7b_rlzero_math.sh
index 8a23c8760..4761b5cb4 100644
--- a/scripts/train/olmo3/7b_rlzero_math.sh
+++ b/scripts/train/olmo3/7b_rlzero_math.sh
@@ -1,25 +1,21 @@
#!/bin/bash
-# OLMo 3 model
-MODEL_NAME_OR_PATH="/weka/oe-adapt-default/michaeln/checkpoints/olmo3-7b-base"
-GS_MODEL_NAME="olmo3_7b_base"
+MODEL_NAME_OR_PATH="allenai/Olmo-3-1025-7B"
+DATASETS="allenai/Dolci-RLZero-Math-7B 1.0"
-DATASETS="saurabh5/DAPO-Math-17k-Processed_filtered_olmo_completions_new_template_filtered 1.0 saurabh5/MATH_3000_Filtered_olmo_completions_new_template_filtered 1.0"
-
-# AIME 2024, 2025 local evals
-LOCAL_EVALS="mnoukhov/aime2024-25-rlvr 1.0 mnoukhov/aime2024-25-rlvr 1.0"
+# AIME 2024, 2025 local single-sample evals
+# Full, bootstrapped pass@32 evals must be run separately
+LOCAL_EVALS="allenai/aime2024-25-rlvr 1.0 allenai/aime2024-25-rlvr 1.0"
LOCAL_EVAL_SPLITS="test_2024 test_2024 test_2025 test_2025"
-# math evals
EVALS="aime:zs_cot_r1::pass_at_32_2024_dapo,aime:zs_cot_r1::pass_at_32_2025_dapo"
-EXP_NAME="grpo_17kfilter_${GS_MODEL_NAME}"
+EXP_NAME="olmo3_7b_rlzero_math"
BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}"
shift
cluster=ai2/augusta
-
python mason.py \
--task_name ${EXP_NAME} \
--cluster ${cluster} \
@@ -31,12 +27,10 @@ python mason.py \
--num_nodes 8 \
--env VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
--env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \
- --gs_model_name $GS_MODEL_NAME \
--gpus 8 \
--budget ai2/oe-adapt \
- -- \
-source configs/beaker_configs/ray_node_setup.sh \&\& \
-python open_instruct/grpo_fast.py \
+ -- source configs/beaker_configs/ray_node_setup.sh \
+\&\& uv run open_instruct/grpo_fast.py \
--exp_name ${EXP_NAME} \
--beta 0.0 \
--async_steps 4 \
@@ -59,7 +53,7 @@ python open_instruct/grpo_fast.py \
--response_length 12000 \
--pack_length 32768 \
--model_name_or_path ${MODEL_NAME_OR_PATH} \
- --chat_template_name olmo_thinker_dapo \
+ --chat_template_name olmo_thinker_rlzero \
--non_stop_penalty False \
--temperature 1.0 \
--total_episodes 512000 \
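The comment above notes that the full, bootstrapped pass@32 evals run separately. For context, a sketch of the standard unbiased pass@k estimator (Chen et al., 2021) that such evals are typically built on; the actual oe-eval task configuration may differ:

```python
# Unbiased pass@k: of n sampled completions per problem, c are correct;
# estimate the probability that at least one of k random draws passes.
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    if n - c < k:  # fewer than k incorrect samples: any k-subset contains a pass
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

# e.g. 64 AIME samples per problem, 20 correct, reported as pass@32
print(pass_at_k(n=64, c=20, k=32))
```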
diff --git a/scripts/train/olmo3/7b_rlzero_mix.sh b/scripts/train/olmo3/7b_rlzero_mix.sh
new file mode 100644
index 000000000..910bed387
--- /dev/null
+++ b/scripts/train/olmo3/7b_rlzero_mix.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+
+MODEL_NAME_OR_PATH="allenai/Olmo-3-1025-7B"
+DATASETS="allenai/Dolci-RLZero-Code-7B 1.0 allenai/Dolci-RLZero-IF-7B 1.0 allenai/Dolci-RLZero-Code-7B 1.0 allenai/Dolci-RLZero-General-7B 1.0"
+
+LOCAL_EVALS="allenai/Dolci-RLZero-Code-7B 8 allenai/Dolci-RLZero-IF-7B 8 allenai/Dolci-RLZero-Code-7B 8 allenai/Dolci-RLZero-General-7B 8"
+LOCAL_EVAL_SPLITS="train train train train train train train train"
+
+EVALS="alpaca_eval_v3::hamish_zs_reasoning_deepseek,agi_eval_english:0shot_cot::hamish_zs_reasoning_deepseek,gpqa:0shot_cot::hamish_zs_reasoning_deepseek"
+
+EXP_NAME="olmo3_7b_rlzero_mix"
+BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
+BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}"
+shift
+
+cluster=ai2/augusta
+
+python mason.py \
+ --task_name ${EXP_NAME} \
+ --cluster ${cluster} \
+ --workspace ai2/olmo-instruct \
+ --priority high \
+ --pure_docker_mode \
+ --image ${BEAKER_IMAGE} \
+ --preemptible \
+ --num_nodes 4 \
+ --env VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
+ --env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \
+ --gpus 8 \
+ --budget ai2/oe-adapt \
+ -- source configs/beaker_configs/ray_node_setup.sh \
+\&\& uv run open_instruct/grpo_fast.py \
+ --exp_name ${EXP_NAME} \
+ --beta 0.0 \
+ --async_steps 4 \
+ --inflight_updates \
+ --truncated_importance_sampling_ratio_cap 2.0 \
+ --num_samples_per_prompt_rollout 8 \
+ --num_unique_prompts_rollout 32 \
+ --num_mini_batches 1 \
+ --num_epochs 1 \
+ --learning_rate 1e-6 \
+ --per_device_train_batch_size 1 \
+ --kl_estimator kl3 \
+ --dataset_mixer_list $DATASETS \
+ --dataset_mixer_list_splits train \
+ --dataset_mixer_eval_list $LOCAL_EVALS \
+ --dataset_mixer_eval_list_splits $LOCAL_EVAL_SPLITS \
+ --max_token_length 10240 \
+ --max_prompt_token_length 2048 \
+ --response_length 16384 \
+ --pack_length 18432 \
+ --model_name_or_path ${MODEL_NAME_OR_PATH} \
+ --chat_template_name olmo_thinker_rlzero \
+ --non_stop_penalty False \
+ --temperature 1.0 \
+ --total_episodes 10000000 \
+ --deepspeed_stage 3 \
+ --num_learners_per_node 8 \
+ --vllm_num_engines 32 \
+ --vllm_tensor_parallel_size 1 \
+ --llm_judge_model hosted_vllm/Qwen/Qwen3-32B \
+ --llm_judge_timeout 600 \
+ --llm_judge_max_tokens 2048 \
+ --llm_judge_max_context_length 32768 \
+ --lr_scheduler_type constant \
+ --apply_verifiable_reward true \
+ --seed 1 \
+ --local_eval_every 50 \
+ --save_freq 50 \
+ --checkpoint_state_freq 50 \
+ --gradient_checkpointing \
+ --with_tracking \
+ --vllm_enable_prefix_caching \
+ --clip_higher 0.272 \
+ --keep_last_n_checkpoints -1 \
+ --mask_truncated_completions True \
+ --oe_eval_max_length 16384 \
+ --code_api_url https://p9f1719l7f.execute-api.us-west-2.amazonaws.com/prod/test_program \
+ --try_launch_beaker_eval_jobs_on_weka True \
+ --oe_eval_tasks $EVALS \
+ --eval_on_step_0 True \
+ --oe_eval_beaker_image oe-eval-beaker/oe_eval_olmo2_retrofit_auto \
+ --output_dir /output/olmo3-7b-rlzero-mix/checkpoints $@
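Before launching any of these Beaker jobs, it can help to sanity-check a dataset locally. A hypothetical smoke test; the `messages` column name is an assumption about the Dolci schema, so inspect the keys first:

```python
# Pull one row from a Dolci RLZero split and inspect its schema.
# grpo_fast.py applies --chat_template_name itself, so no template
# rendering is needed here; this only confirms the prompts look sane.
from datasets import load_dataset

row = load_dataset("allenai/Dolci-RLZero-Math-7B", split="train[:1]")[0]
print(sorted(row.keys()))   # confirm the schema first
print(row.get("messages"))  # assumed prompt field; adjust if absent
```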