diff --git a/open_instruct/dataset_transformation.py b/open_instruct/dataset_transformation.py
index 0a94a7669..1f4fd416b 100644
--- a/open_instruct/dataset_transformation.py
+++ b/open_instruct/dataset_transformation.py
@@ -523,47 +523,27 @@ def visualize_token_role(tokens: list[int], masks: list[int], tokenizer: PreTrai
         "{% endif %}"
         "{% endfor %}"
     ),
-    "olmo_thinker_r1_style": (
-        "A conversation between user and assistant. "
-        "The user asks a question, and the assistant solves it. "
-        "The assistant first thinks and reasons about the question "
-        "and after thinking provides the user with the answer. "
-        "The reasoning process is enclosed in <think> </think> tags "
-        "and the answer is enclosed in <answer> </answer> tags "
-        "so the full response is <think> reasoning process here </think> "
-        "<answer> answer here </answer>."
+    "olmo_thinker_rlzero": (
+        "Solve the following problem step by step. "
+        "The last line of your response should be the answer to the problem in form Answer: $Answer (without quotes) where $Answer is the answer to the problem."
         "\n\n"
         "{% for message in messages %}"
-        "{% if message['role'] == 'system' %}"
-        "{% if message.get('functions', none) is not none %}"
-        "{{ '<|im_start|>system\n' + message['content'] + '\n' + '<functions>' + message['functions'] + '</functions><|im_end|>\n' }}"
-        "{% else %}"
-        "{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>\n' }}"
-        "{% endif %}"
-        "{% elif message['role'] == 'user' %}"
-        "{% if message.get('functions', none) is not none %}"
-        "{{ '<|im_start|>user\n' + message['content'] + '\n' + '<functions>' + message['functions'] + '</functions><|im_end|>\n' }}"
-        "{% else %}"
-        "{{ '<|im_start|>user\n' + message['content'] + '<|im_end|>\n' }}"
-        "{% endif %}"
-        "{% elif message['role'] == 'assistant' %}"
-        "{{ '<|im_start|>assistant\n' }}"
-        "{% if message.get('content', none) is not none %}"
-        "{{ message['content'] }}"
-        "{% endif %}"
-        "{% if message.get('function_calls', none) is not none %}"
-        "{{ '<function_calls>' + message['function_calls'] + '</function_calls>' }}"
-        "{% endif %}"
-        "{% if not loop.last %}"
-        "{{ '<|im_end|>' + '\n' }}"
-        "{% else %}"
-        "{{ eos_token }}"
-        "{% endif %}"
-        "{% elif message['role'] == 'environment' %}"
-        "{{ '<|im_start|>environment\n' + message['content'] + '<|im_end|>\n' }}"
+        "{{ '\n\n' if not loop.first else '' }}"
+        "{{ message['content'] + '\n' }}"
+        "{% if loop.last and add_generation_prompt %}"
+        "{{ '\nRemember to put your answer on its own line after \"Answer:\"' }}"
         "{% endif %}"
+        "{% endfor %}"
+    ),
+    "olmo_thinker_code_rlzero": (
+        "Solve the following code problem step by step. "
+        "The last part of your response should be the solution to the problem in form ```python\nCODE\n``` where CODE is the solution for the problem."
+ "\n\n" + "{% for message in messages %}" + "{{ '\n\n' if not loop.first else '' }}" + "{{ message['content'] + '\n' }}" "{% if loop.last and add_generation_prompt %}" - "{{ '<|im_start|>assistant\n' }}" + f"\nRemember to put your solution inside the ```\npython\nCODE\n``` tags" "{% endif %}" "{% endfor %}" ), diff --git a/scripts/train/olmo3/7b_rlzero_code.sh b/scripts/train/olmo3/7b_rlzero_code.sh index 620acf57c..6ea80d421 100644 --- a/scripts/train/olmo3/7b_rlzero_code.sh +++ b/scripts/train/olmo3/7b_rlzero_code.sh @@ -1,16 +1,14 @@ #!/bin/bash -# OLMo 3 model -MODEL_NAME_OR_PATH="/weka/oe-training-default/ai2-llm/checkpoints/tylerr/long-context/olmo25_7b_lc_64k_6T_M100B_round5-sparkle_6634-pre_s2pdf_gzip2080_cweN-yake-all-olmo_packing_yarn-fullonly_50B-fb13a737/step11921-hf" +MODEL_NAME_OR_PATH="allenai/Olmo-3-1025-7B" +DATASETS="allenai/Dolci-RLZero-Code-7B 1.0" -DATASETS="saurabh5/rlvr_acecoder_filtered_filtered_olmo_completions_filtered 6656 hamishivi/synthetic2-rlvr-code-compressed_filtered 3328 hamishivi/klear-code-rlvr_filtered 3328" - -LOCAL_EVALS="hamishivi/rlvr_acecoder_filtered_filtered 4 hamishivi/klear-code-rlvr_filtered 4" +LOCAL_EVALS="allenai/Dolci-RLZero-Code-7B 8" LOCAL_EVAL_SPLITS="train" EVALS="codex_humanevalplus:0-shot-chat::tulu-thinker_deepseek,mbppplus:0-shot-chat::tulu-thinker_deepseek,livecodebench_codegeneration::tulu-thinker_deepseek_lite" -EXP_NAME="grpo_code_from_zero" +EXP_NAME="olmo3_7b_rlzero_code" BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name') BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}" shift @@ -30,9 +28,8 @@ python mason.py \ --env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \ --gpus 8 \ --budget ai2/oe-adapt \ - -- \ -source configs/beaker_configs/ray_node_setup.sh \&\& \ -python open_instruct/grpo_fast.py \ + -- source configs/beaker_configs/ray_node_setup.sh \ +\&\& uv run open_instruct/grpo_fast.py \ --exp_name ${EXP_NAME} \ --beta 0.0 \ --async_steps 4 \ @@ -54,8 +51,7 @@ python open_instruct/grpo_fast.py \ --response_length 16384 \ --pack_length 18432 \ --model_name_or_path ${MODEL_NAME_OR_PATH} \ - --chat_template_name olmo_thinker \ - --stop_strings "" \ + --chat_template_name olmo_thinker_code_rlzero \ --non_stop_penalty False \ --temperature 1.0 \ --total_episodes 10000000 \ diff --git a/scripts/train/olmo3/7b_rlzero_instruction_following.sh b/scripts/train/olmo3/7b_rlzero_instruction_following.sh index ea0321285..5b5165100 100644 --- a/scripts/train/olmo3/7b_rlzero_instruction_following.sh +++ b/scripts/train/olmo3/7b_rlzero_instruction_following.sh @@ -1,16 +1,14 @@ #!/bin/bash -# OLMo 3 model -MODEL_NAME_OR_PATH="/weka/oe-training-default/ai2-llm/checkpoints/tylerr/long-context/olmo25_7b_lc_64k_6T_M100B_round5-sparkle_6634-pre_s2pdf_gzip2080_cweN-yake-all-olmo_packing_yarn-fullonly_50B-fb13a737/step11921-hf" +MODEL_NAME_OR_PATH="allenai/Olmo-3-1025-7B" +DATASETS="allenai/Dolci-RLZero-IF-7B 1.0" -DATASETS="saurabh5/IF_multi_constraints_upto5_filtered_olmo_completions_filtered 13314" - -LOCAL_EVALS="hamishivi/IF_multi_constraints_upto5_filtered 8" +LOCAL_EVALS="allenai/Dolci-RLZero-IF-7B 8" LOCAL_EVAL_SPLITS="train" EVALS="ifeval::hamish_zs_reasoning_deepseek" -EXP_NAME="grpo_if_from_zero" +EXP_NAME="olmo3_7b_rlzero_if" BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name') BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}" shift @@ -30,9 +28,8 @@ python mason.py \ --env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \ --gpus 8 \ --budget ai2/oe-adapt \ - -- \ -source 
-source configs/beaker_configs/ray_node_setup.sh \&\& \
-python open_instruct/grpo_fast.py \
+    -- source configs/beaker_configs/ray_node_setup.sh \
+\&\& uv run open_instruct/grpo_fast.py \
     --exp_name ${EXP_NAME} \
     --beta 0.0 \
     --async_steps 4 \
@@ -54,8 +51,7 @@ python open_instruct/grpo_fast.py \
     --response_length 16384 \
     --pack_length 18432 \
     --model_name_or_path ${MODEL_NAME_OR_PATH} \
-    --chat_template_name olmo_thinker \
-    --stop_strings "</answer>" \
+    --chat_template_name olmo_thinker_rlzero \
     --non_stop_penalty False \
     --temperature 1.0 \
     --total_episodes 10000000 \
diff --git a/scripts/train/olmo3/7b_rlzero_math.sh b/scripts/train/olmo3/7b_rlzero_math.sh
index 8a23c8760..4761b5cb4 100644
--- a/scripts/train/olmo3/7b_rlzero_math.sh
+++ b/scripts/train/olmo3/7b_rlzero_math.sh
@@ -1,25 +1,21 @@
 #!/bin/bash
 
-# OLMo 3 model
-MODEL_NAME_OR_PATH="/weka/oe-adapt-default/michaeln/checkpoints/olmo3-7b-base"
-GS_MODEL_NAME="olmo3_7b_base"
+MODEL_NAME_OR_PATH="allenai/Olmo-3-1025-7B"
+DATASETS="allenai/Dolci-RLZero-Math-7B 1.0"
 
-DATASETS="saurabh5/DAPO-Math-17k-Processed_filtered_olmo_completions_new_template_filtered 1.0 saurabh5/MATH_3000_Filtered_olmo_completions_new_template_filtered 1.0"
-
-# AIME 2024, 2025 local evals
-LOCAL_EVALS="mnoukhov/aime2024-25-rlvr 1.0 mnoukhov/aime2024-25-rlvr 1.0"
+# AIME 2024, 2025 local single-sample evals
+# Full, bootstrapped pass@32 evals must be run separately
+LOCAL_EVALS="allenai/aime2024-25-rlvr 1.0 allenai/aime2024-25-rlvr 1.0"
 LOCAL_EVAL_SPLITS="test_2024 test_2024 test_2025 test_2025"
 
-# math evals
 EVALS="aime:zs_cot_r1::pass_at_32_2024_dapo,aime:zs_cot_r1::pass_at_32_2025_dapo"
 
-EXP_NAME="grpo_17kfilter_${GS_MODEL_NAME}"
+EXP_NAME="olmo3_7b_rlzero_math"
 BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
 BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}"
 shift
 
 cluster=ai2/augusta
-
 python mason.py \
     --task_name ${EXP_NAME} \
     --cluster ${cluster} \
@@ -31,12 +27,10 @@ python mason.py \
     --num_nodes 8 \
     --env VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
     --env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \
-    --gs_model_name $GS_MODEL_NAME \
     --gpus 8 \
     --budget ai2/oe-adapt \
-    -- \
-source configs/beaker_configs/ray_node_setup.sh \&\& \
-python open_instruct/grpo_fast.py \
+    -- source configs/beaker_configs/ray_node_setup.sh \
+\&\& uv run open_instruct/grpo_fast.py \
     --exp_name ${EXP_NAME} \
     --beta 0.0 \
     --async_steps 4 \
@@ -59,7 +53,7 @@ python open_instruct/grpo_fast.py \
     --response_length 12000 \
     --pack_length 32768 \
     --model_name_or_path ${MODEL_NAME_OR_PATH} \
-    --chat_template_name olmo_thinker_dapo \
+    --chat_template_name olmo_thinker_rlzero \
     --non_stop_penalty False \
     --temperature 1.0 \
     --total_episodes 512000 \
diff --git a/scripts/train/olmo3/7b_rlzero_mix.sh b/scripts/train/olmo3/7b_rlzero_mix.sh
new file mode 100644
index 000000000..910bed387
--- /dev/null
+++ b/scripts/train/olmo3/7b_rlzero_mix.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+
+MODEL_NAME_OR_PATH="allenai/Olmo-3-1025-7B"
+DATASETS="allenai/Dolci-RLZero-Math-7B 1.0 allenai/Dolci-RLZero-IF-7B 1.0 allenai/Dolci-RLZero-Code-7B 1.0 allenai/Dolci-RLZero-General-7B 1.0"
+
+LOCAL_EVALS="allenai/Dolci-RLZero-Math-7B 8 allenai/Dolci-RLZero-IF-7B 8 allenai/Dolci-RLZero-Code-7B 8 allenai/Dolci-RLZero-General-7B 8"
+LOCAL_EVAL_SPLITS="train train train train train train train train"
+
+EVALS="alpaca_eval_v3::hamish_zs_reasoning_deepseek,agi_eval_english:0shot_cot::hamish_zs_reasoning_deepseek,gpqa:0shot_cot::hamish_zs_reasoning_deepseek"
+
+EXP_NAME="olmo3_7b_rlzero_mix"
+BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
+BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}"
+shift
+
+cluster=ai2/augusta
+
+python mason.py \
+    --task_name ${EXP_NAME} \
+    --cluster ${cluster} \
+    --workspace ai2/olmo-instruct \
+    --priority high \
+    --pure_docker_mode \
+    --image ${BEAKER_IMAGE} \
+    --preemptible \
+    --num_nodes 4 \
+    --env VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
+    --env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \
+    --gpus 8 \
+    --budget ai2/oe-adapt \
+    -- source configs/beaker_configs/ray_node_setup.sh \
+\&\& uv run open_instruct/grpo_fast.py \
+    --exp_name ${EXP_NAME} \
+    --beta 0.0 \
+    --async_steps 4 \
+    --inflight_updates \
+    --truncated_importance_sampling_ratio_cap 2.0 \
+    --num_samples_per_prompt_rollout 8 \
+    --num_unique_prompts_rollout 32 \
+    --num_mini_batches 1 \
+    --num_epochs 1 \
+    --learning_rate 1e-6 \
+    --per_device_train_batch_size 1 \
+    --kl_estimator kl3 \
+    --dataset_mixer_list $DATASETS \
+    --dataset_mixer_list_splits train \
+    --dataset_mixer_eval_list $LOCAL_EVALS \
+    --dataset_mixer_eval_list_splits $LOCAL_EVAL_SPLITS \
+    --max_token_length 10240 \
+    --max_prompt_token_length 2048 \
+    --response_length 16384 \
+    --pack_length 18432 \
+    --model_name_or_path ${MODEL_NAME_OR_PATH} \
+    --chat_template_name olmo_thinker_rlzero \
+    --non_stop_penalty False \
+    --temperature 1.0 \
+    --total_episodes 10000000 \
+    --deepspeed_stage 3 \
+    --num_learners_per_node 8 \
+    --vllm_num_engines 32 \
+    --vllm_tensor_parallel_size 1 \
+    --llm_judge_model hosted_vllm/Qwen/Qwen3-32B \
+    --llm_judge_timeout 600 \
+    --llm_judge_max_tokens 2048 \
+    --llm_judge_max_context_length 32768 \
+    --lr_scheduler_type constant \
+    --apply_verifiable_reward true \
+    --seed 1 \
+    --local_eval_every 50 \
+    --save_freq 50 \
+    --checkpoint_state_freq 50 \
+    --gradient_checkpointing \
+    --with_tracking \
+    --vllm_enable_prefix_caching \
+    --clip_higher 0.272 \
+    --keep_last_n_checkpoints -1 \
+    --mask_truncated_completions True \
+    --oe_eval_max_length 16384 \
+    --code_api_url https://p9f1719l7f.execute-api.us-west-2.amazonaws.com/prod/test_program \
+    --try_launch_beaker_eval_jobs_on_weka True \
+    --oe_eval_tasks $EVALS \
+    --eval_on_step_0 True \
+    --oe_eval_beaker_image oe-eval-beaker/oe_eval_olmo2_retrofit_auto \
+    --output_dir /output/olmo3-7b-rlzero-general/checkpoints $@
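
For reference, the new RL-Zero templates drop the <|im_start|>/<|im_end|> chat scaffolding entirely and render the conversation as plain text with a fixed answer-format instruction. A minimal sketch of how olmo_thinker_rlzero renders, using jinja2 directly (the sample message is hypothetical; in the training scripts the template is selected with --chat_template_name and applied through the tokenizer, which this only approximates):

# Smoke-test of the "olmo_thinker_rlzero" template outside the training stack.
# The template string is reproduced from the diff above; the sample message is
# hypothetical and stands in for a Dolci-RLZero prompt.
from jinja2 import Template

OLMO_THINKER_RLZERO = (
    "Solve the following problem step by step. "
    "The last line of your response should be the answer to the problem in form "
    "Answer: $Answer (without quotes) where $Answer is the answer to the problem."
    "\n\n"
    "{% for message in messages %}"
    "{{ '\n\n' if not loop.first else '' }}"
    "{{ message['content'] + '\n' }}"
    "{% if loop.last and add_generation_prompt %}"
    "{{ '\nRemember to put your answer on its own line after \"Answer:\"' }}"
    "{% endif %}"
    "{% endfor %}"
)

prompt = Template(OLMO_THINKER_RLZERO).render(
    messages=[{"role": "user", "content": "What is 17 * 23?"}],
    add_generation_prompt=True,
)
print(prompt)
# Solve the following problem step by step. The last line of your response
# should be the answer to the problem in form Answer: $Answer (without quotes)
# where $Answer is the answer to the problem.
#
# What is 17 * 23?
#
# Remember to put your answer on its own line after "Answer:"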