Merged
54 changes: 17 additions & 37 deletions open_instruct/dataset_transformation.py
@@ -523,47 +523,27 @@ def visualize_token_role(tokens: list[int], masks: list[int], tokenizer: PreTrai
"{% endif %}"
"{% endfor %}"
),
"olmo_thinker_r1_style": (
"A conversation between user and assistant. "
"The user asks a question, and the assistant solves it. "
"The assistant first thinks and reasons about the question "
"and after thinking provides the user with the answer. "
"The reasoning process is enclosed in <think> </think> tags "
"and the answer is enclosed in <answer> </answer> tags "
"so the full response is <think> reasoning process here </think> "
"<answer> answer here </answer>."
"olmo_thinker_rlzero": (
"Solve the following problem step by step. "
"The last line of your response should be the answer to the problem in form Answer: $Answer (without quotes) where $Answer is the answer to the problem."
"\n\n"
"{% for message in messages %}"
"{% if message['role'] == 'system' %}"
"{% if message.get('functions', none) is not none %}"
"{{ '<|im_start|>system\n' + message['content'] + '\n' + '<functions>' + message['functions'] + '</functions><|im_end|>\n' }}"
"{% else %}"
"{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>\n' }}"
"{% endif %}"
"{% elif message['role'] == 'user' %}"
"{% if message.get('functions', none) is not none %}"
"{{ '<|im_start|>user\n' + message['content'] + '\n' + '<functions>' + message['functions'] + '</functions><|im_end|>\n' }}"
"{% else %}"
"{{ '<|im_start|>user\n' + message['content'] + '<|im_end|>\n' }}"
"{% endif %}"
"{% elif message['role'] == 'assistant' %}"
"{{ '<|im_start|>assistant\n' }}"
"{% if message.get('content', none) is not none %}"
"{{ message['content'] }}"
"{% endif %}"
"{% if message.get('function_calls', none) is not none %}"
"{{ '<function_calls>' + message['function_calls'] + '</function_calls>' }}"
"{% endif %}"
"{% if not loop.last %}"
"{{ '<|im_end|>' + '\n' }}"
"{% else %}"
"{{ eos_token }}"
"{% endif %}"
"{% elif message['role'] == 'environment' %}"
"{{ '<|im_start|>environment\n' + message['content'] + '<|im_end|>\n' }}"
"{{ '\n\n' if not loop.first else '' }}"
"{{ message['content'] + '\n' }}"
"{% if loop.last and add_generation_prompt %}"
"{{ '\nRemember to put your answer on its own line after \"Answer:\"' }}"
"{% endif %}"
"{% endfor %}"
),
"olmo_thinker_code_rlzero": (
"Solve the following code problem step by step. "
f"The last part of your response should be the solution to the problem in form ```\npython\nCODE\n``` where CODE is the solution for the problem."
"\n\n"
"{% for message in messages %}"
"{{ '\n\n' if not loop.first else '' }}"
"{{ message['content'] + '\n' }}"
"{% if loop.last and add_generation_prompt %}"
"{{ '<|im_start|>assistant\n<think>' }}"
f"\nRemember to put your solution inside the ```\npython\nCODE\n``` tags"
"{% endif %}"
"{% endfor %}"
),
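
For reference, here is a minimal sketch of how the new olmo_thinker_rlzero template renders a single-turn prompt. It uses plain jinja2 to mirror what a tokenizer's apply_chat_template does; the template string is copied from the diff above, and the example question is invented.

from jinja2 import Template

# Template string as added in this diff (the literal newlines inside the
# Jinja string literals are intentional and valid).
OLMO_THINKER_RLZERO = (
    "Solve the following problem step by step. "
    "The last line of your response should be the answer to the problem in form "
    "Answer: $Answer (without quotes) where $Answer is the answer to the problem."
    "\n\n"
    "{% for message in messages %}"
    "{{ '\n\n' if not loop.first else '' }}"
    "{{ message['content'] + '\n' }}"
    "{% if loop.last and add_generation_prompt %}"
    "{{ '\nRemember to put your answer on its own line after \"Answer:\"' }}"
    "{% endif %}"
    "{% endfor %}"
)

prompt = Template(OLMO_THINKER_RLZERO).render(
    messages=[{"role": "user", "content": "What is 17 * 3?"}],
    add_generation_prompt=True,
)
print(prompt)
# Solve the following problem step by step. [...]
#
# What is 17 * 3?
#
# Remember to put your answer on its own line after "Answer:"
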
18 changes: 7 additions & 11 deletions scripts/train/olmo3/7b_rlzero_code.sh
@@ -1,16 +1,14 @@
#!/bin/bash

# OLMo 3 model
MODEL_NAME_OR_PATH="/weka/oe-training-default/ai2-llm/checkpoints/tylerr/long-context/olmo25_7b_lc_64k_6T_M100B_round5-sparkle_6634-pre_s2pdf_gzip2080_cweN-yake-all-olmo_packing_yarn-fullonly_50B-fb13a737/step11921-hf"
MODEL_NAME_OR_PATH="allenai/Olmo-3-1025-7B"
DATASETS="allenai/Dolci-RLZero-Code-7B 1.0"

DATASETS="saurabh5/rlvr_acecoder_filtered_filtered_olmo_completions_filtered 6656 hamishivi/synthetic2-rlvr-code-compressed_filtered 3328 hamishivi/klear-code-rlvr_filtered 3328"

LOCAL_EVALS="hamishivi/rlvr_acecoder_filtered_filtered 4 hamishivi/klear-code-rlvr_filtered 4"
LOCAL_EVALS="allenai/Dolci-RLZero-Code-7B 8"
LOCAL_EVAL_SPLITS="train"

EVALS="codex_humanevalplus:0-shot-chat::tulu-thinker_deepseek,mbppplus:0-shot-chat::tulu-thinker_deepseek,livecodebench_codegeneration::tulu-thinker_deepseek_lite"

EXP_NAME="grpo_code_from_zero"
EXP_NAME="olmo3_7b_rlzero_code"
BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}"
shift
@@ -30,9 +28,8 @@ python mason.py \
--env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \
--gpus 8 \
--budget ai2/oe-adapt \
-- \
source configs/beaker_configs/ray_node_setup.sh \&\& \
python open_instruct/grpo_fast.py \
-- source configs/beaker_configs/ray_node_setup.sh \
\&\& uv run open_instruct/grpo_fast.py \
--exp_name ${EXP_NAME} \
--beta 0.0 \
--async_steps 4 \
@@ -54,8 +51,7 @@ python open_instruct/grpo_fast.py \
--response_length 16384 \
--pack_length 18432 \
--model_name_or_path ${MODEL_NAME_OR_PATH} \
--chat_template_name olmo_thinker \
--stop_strings "</answer>" \
--chat_template_name olmo_thinker_code_rlzero \
--non_stop_penalty False \
--temperature 1.0 \
--total_episodes 10000000 \
18 changes: 7 additions & 11 deletions scripts/train/olmo3/7b_rlzero_instruction_following.sh
@@ -1,16 +1,14 @@
#!/bin/bash

# OLMo 3 model
MODEL_NAME_OR_PATH="/weka/oe-training-default/ai2-llm/checkpoints/tylerr/long-context/olmo25_7b_lc_64k_6T_M100B_round5-sparkle_6634-pre_s2pdf_gzip2080_cweN-yake-all-olmo_packing_yarn-fullonly_50B-fb13a737/step11921-hf"
MODEL_NAME_OR_PATH="allenai/Olmo-3-1025-7B"
DATASETS="allenai/Dolci-RLZero-IF-7B 1.0"

DATASETS="saurabh5/IF_multi_constraints_upto5_filtered_olmo_completions_filtered 13314"

LOCAL_EVALS="hamishivi/IF_multi_constraints_upto5_filtered 8"
LOCAL_EVALS="allenai/Dolci-RLZero-IF-7B 8"
LOCAL_EVAL_SPLITS="train"

EVALS="ifeval::hamish_zs_reasoning_deepseek"

EXP_NAME="grpo_if_from_zero"
EXP_NAME="olmo3_7b_rlzero_if"
BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}"
shift
@@ -30,9 +28,8 @@ python mason.py \
--env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \
--gpus 8 \
--budget ai2/oe-adapt \
-- \
source configs/beaker_configs/ray_node_setup.sh \&\& \
python open_instruct/grpo_fast.py \
-- source configs/beaker_configs/ray_node_setup.sh \
\&\& uv run open_instruct/grpo_fast.py \
--exp_name ${EXP_NAME} \
--beta 0.0 \
--async_steps 4 \
@@ -54,8 +51,7 @@ python open_instruct/grpo_fast.py \
--response_length 16384 \
--pack_length 18432 \
--model_name_or_path ${MODEL_NAME_OR_PATH} \
--chat_template_name olmo_thinker \
--stop_strings "</answer>" \
--chat_template_name olmo_thinker_rlzero \
Bug: Stop string mismatch with new templates

The script uses --stop_strings "</answer>", but the olmo_thinker_rlzero template instructs the model to put its answer on a final line of the form Answer: $Answer rather than inside <answer> tags. The stop string will never match, so generation continues until it hits the maximum response length or an EOS token instead of stopping at the intended point.

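To make the mismatch concrete, here is a small sketch (assumed for illustration, not the repo's verifier code) of how an Answer:-style completion is parsed; note that nothing in it contains the substring "</answer>" for the stop string to hit.

import re

# A typical completion under the rlzero template: reasoning, then a final
# "Answer:" line.
completion = "17 * 3 = 51, so the result is 51.\nAnswer: 51"

match = re.search(r"^Answer:\s*(.+)$", completion, flags=re.MULTILINE)
print(match.group(1) if match else None)  # prints: 51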

--non_stop_penalty False \
--temperature 1.0 \
--total_episodes 10000000 \
24 changes: 9 additions & 15 deletions scripts/train/olmo3/7b_rlzero_math.sh
@@ -1,25 +1,21 @@
#!/bin/bash

# OLMo 3 model
MODEL_NAME_OR_PATH="/weka/oe-adapt-default/michaeln/checkpoints/olmo3-7b-base"
GS_MODEL_NAME="olmo3_7b_base"
MODEL_NAME_OR_PATH="allenai/Olmo-3-1025-7B"
DATASETS="allenai/Dolci-RLZero-Math-7B 1.0"

DATASETS="saurabh5/DAPO-Math-17k-Processed_filtered_olmo_completions_new_template_filtered 1.0 saurabh5/MATH_3000_Filtered_olmo_completions_new_template_filtered 1.0"

# AIME 2024, 2025 local evals
LOCAL_EVALS="mnoukhov/aime2024-25-rlvr 1.0 mnoukhov/aime2024-25-rlvr 1.0"
# AIME 2024, 2025 local single-sample evals
# Full, bootstrapped pass@32 evals must be run separately
LOCAL_EVALS="allenai/aime2024-25-rlvr 1.0 allenai/aime2024-25-rlvr 1.0"
LOCAL_EVAL_SPLITS="test_2024 test_2024 test_2025 test_2025"

# math evals
EVALS="aime:zs_cot_r1::pass_at_32_2024_dapo,aime:zs_cot_r1::pass_at_32_2025_dapo"

EXP_NAME="grpo_17kfilter_${GS_MODEL_NAME}"
EXP_NAME="olmo3_7b_rlzero_math"
BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}"
shift

cluster=ai2/augusta

python mason.py \
--task_name ${EXP_NAME} \
--cluster ${cluster} \
@@ -31,12 +27,10 @@ python mason.py \
--num_nodes 8 \
--env VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
--env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \
--gs_model_name $GS_MODEL_NAME \
--gpus 8 \
--budget ai2/oe-adapt \
-- \
source configs/beaker_configs/ray_node_setup.sh \&\& \
python open_instruct/grpo_fast.py \
-- source configs/beaker_configs/ray_node_setup.sh \
\&\& uv run open_instruct/grpo_fast.py \
--exp_name ${EXP_NAME} \
--beta 0.0 \
--async_steps 4 \
@@ -59,7 +53,7 @@ python open_instruct/grpo_fast.py \
--response_length 12000 \
--pack_length 32768 \
--model_name_or_path ${MODEL_NAME_OR_PATH} \
--chat_template_name olmo_thinker_dapo \
--chat_template_name olmo_thinker_rlzero \
--non_stop_penalty False \
--temperature 1.0 \
--total_episodes 512000 \
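
As an aside, the DATASETS and LOCAL_EVALS strings in these scripts are flat "name weight name weight ..." lists. The pairing they encode looks roughly like the sketch below (the real parsing lives in open_instruct's dataset mixer; this is illustrative only):

datasets = "allenai/Dolci-RLZero-Math-7B 1.0"
tokens = datasets.split()
# Pair each dataset name with its weight: a sampling fraction like 1.0,
# or an absolute example count like 6656 in the older scripts.
mixer_list = [(tokens[i], float(tokens[i + 1])) for i in range(0, len(tokens), 2)]
print(mixer_list)  # [('allenai/Dolci-RLZero-Math-7B', 1.0)]
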
84 changes: 84 additions & 0 deletions scripts/train/olmo3/7b_rlzero_mix.sh
@@ -0,0 +1,84 @@
#!/bin/bash

MODEL_NAME_OR_PATH="allenai/Olmo-3-1025-7B"
DATASETS="allenai/Dolci-RLZero-Code-7B 1.0 allenai/Dolci-RLZero-IF-7B 1.0 allenai/Dolci-RLZero-Code-7B 1.0 allenai/Dolci-RLZero-General-7B 1.0"
Bug: Duplicate Code dataset in mix script

The DATASETS variable includes allenai/Dolci-RLZero-Code-7B twice (positions 1 and 3), likely in place of the Math dataset. The same duplication appears in LOCAL_EVALS. Given that separate scripts exist for code, IF, and math training, the mix script should probably include all four dataset types once each: Code, IF, Math, and General.



LOCAL_EVALS="allenai/Dolci-RLZero-Code-7B 8 allenai/Dolci-RLZero-IF-7B 8 allenai/Dolci-RLZero-Code-7B 8 allenai/Dolci-RLZero-General-7B 8"
LOCAL_EVAL_SPLITS="train train train train train train train train"

EVALS="alpaca_eval_v3::hamish_zs_reasoning_deepseek,agi_eval_english:0shot_cot::hamish_zs_reasoning_deepseek,gpqa:0shot_cot::hamish_zs_reasoning_deepseek"

EXP_NAME="olmo3_7b_rlzero_mix"
BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}"
shift

cluster=ai2/augusta

python mason.py \
Collaborator
uv run?

Contributor Author
changed

--task_name ${EXP_NAME} \
--cluster ${cluster} \
--workspace ai2/olmo-instruct \
--priority high \
--pure_docker_mode \
--image ${BEAKER_IMAGE} \
--preemptible \
--num_nodes 4 \
--env VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
--env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \
--gpus 8 \
--budget ai2/oe-adapt \
-- source configs/beaker_configs/ray_node_setup.sh \
\&\& uv run open_instruct/grpo_fast.py \
--exp_name ${EXP_NAME} \
--beta 0.0 \
--async_steps 4 \
--inflight_updates \
--truncated_importance_sampling_ratio_cap 2.0 \
--num_samples_per_prompt_rollout 8 \
--num_unique_prompts_rollout 32 \
--num_mini_batches 1 \
--num_epochs 1 \
--learning_rate 1e-6 \
--per_device_train_batch_size 1 \
--kl_estimator kl3 \
--dataset_mixer_list $DATASETS \
--dataset_mixer_list_splits train \
--dataset_mixer_eval_list $LOCAL_EVALS \
--dataset_mixer_eval_list_splits $LOCAL_EVAL_SPLITS \
--max_token_length 10240 \
--max_prompt_token_length 2048 \
--response_length 16384 \
--pack_length 18432 \
--model_name_or_path ${MODEL_NAME_OR_PATH} \
--chat_template_name olmo_thinker_rlzero \
Bug: Template mismatch for Code dataset

The script uses the olmo_thinker_rlzero template while including the Code dataset. The Code dataset requires the olmo_thinker_code_rlzero template, which asks for solutions in fenced code blocks. With the wrong template, the model will generate responses in the wrong format for code problems, breaking the expected markdown code fences.


--non_stop_penalty False \
--temperature 1.0 \
--total_episodes 10000000 \
--deepspeed_stage 3 \
--num_learners_per_node 8 \
--vllm_num_engines 32 \
--vllm_tensor_parallel_size 1 \
--llm_judge_model hosted_vllm/Qwen/Qwen3-32B \
--llm_judge_timeout 600 \
--llm_judge_max_tokens 2048 \
--llm_judge_max_context_length 32768 \
--lr_scheduler_type constant \
--apply_verifiable_reward true \
--seed 1 \
--local_eval_every 50 \
--save_freq 50 \
--checkpoint_state_freq 50 \
--gradient_checkpointing \
--with_tracking \
--vllm_enable_prefix_caching \
--clip_higher 0.272 \
--keep_last_n_checkpoints -1 \
--mask_truncated_completions True \
--oe_eval_max_length 16384 \
--code_api_url https://p9f1719l7f.execute-api.us-west-2.amazonaws.com/prod/test_program \
--try_launch_beaker_eval_jobs_on_weka True \
--oe_eval_tasks $EVALS \
--eval_on_step_0 True \
--oe_eval_beaker_image oe-eval-beaker/oe_eval_olmo2_retrofit_auto \
--output_dir /output/olmo3-7b-rlzero-general/checkpoints $@
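
A note on --clip_higher 0.272: it sets an asymmetric, DAPO-style upper bound on the PPO clipping range. Below is a hedged sketch of the resulting objective, assuming the lower epsilon keeps a conventional default of 0.2 (check grpo_fast.py for the exact rule):

import torch

def clipped_policy_loss(
    ratio: torch.Tensor,       # new_prob / old_prob per token
    advantage: torch.Tensor,   # group-normalized advantage per token
    eps_low: float = 0.2,
    eps_high: float = 0.272,
) -> torch.Tensor:
    # Standard PPO clipping, but with a wider upper bound ("clip higher"),
    # which lets positive-advantage tokens move further in a single update.
    unclipped = ratio * advantage
    clipped = torch.clamp(ratio, 1.0 - eps_low, 1.0 + eps_high) * advantage
    return -torch.minimum(unclipped, clipped).mean()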