Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
0b401af
testing updated docker
hamishivi Jul 8, 2025
e935690
testing updated docker
hamishivi Jul 8, 2025
175e57e
testing updated docker
hamishivi Jul 8, 2025
43a5a26
testing updated docker
hamishivi Jul 8, 2025
a3b18ae
testing updated docker
hamishivi Jul 8, 2025
82ec4ab
testing updated docker
hamishivi Jul 8, 2025
621fad1
try using uv
hamishivi Jul 8, 2025
352b694
some changes
Jul 8, 2025
67b24b9
mason new default env var
hamishivi Jul 8, 2025
3fa4b5b
trying workflows
hamishivi Jul 8, 2025
3396b1c
update toolvllm
hamishivi Jul 8, 2025
adebe0e
fixing base dockerfile and updating toolvllm itself
hamishivi Jul 8, 2025
bca369c
Address comments
hamishivi Jul 9, 2025
3e1404a
pin torch version
hamishivi Jul 9, 2025
221b923
turn off pr push
hamishivi Jul 9, 2025
39248f1
olmo3 vllm working
mnoukhov Jul 9, 2025
316d5d6
test script
mnoukhov Jul 9, 2025
1b8fda3
test script w/o pure docker mode
mnoukhov Jul 9, 2025
c36b53c
intermediate commit
mnoukhov Jul 10, 2025
6fc1f34
Merge remote-tracking branch 'upstream/main' into olmo3
mnoukhov Jul 10, 2025
4d174b9
dockerfile uv working
mnoukhov Jul 10, 2025
67ebd22
test script for olmo3 run
mnoukhov Jul 10, 2025
efa7080
Merge branch 'main' of github.com:allenai/open-instruct into olmo3
mnoukhov Jul 10, 2025
e7aec22
style quality uv.lock
mnoukhov Jul 11, 2025
d267142
eval image and hparams update
mnoukhov Jul 11, 2025
dd6c291
test script, extra code added to dockerfile
mnoukhov Jul 14, 2025
a91ef17
Merge branch 'main' of github.com:allenai/open-instruct into olmo3
mnoukhov Jul 16, 2025
2fd5e0c
Merge branch 'main' of github.com:allenai/open-instruct into olmo3
mnoukhov Jul 29, 2025
7ba9d44
0.6b model for debug
mnoukhov Jul 29, 2025
7cea66c
updated grpo_olmo3
mnoukhov Jul 29, 2025
df1f30e
change to ray 2.46.0 because 2.48.0 has an error
mnoukhov Jul 29, 2025
8b29998
2 nodes and hparams
mnoukhov Jul 30, 2025
f8cd049
revert to 1 node
mnoukhov Aug 1, 2025
3827088
fix model path
mnoukhov Aug 2, 2025
c6d3a6d
midtraining rounds 3-4
mnoukhov Aug 4, 2025
0538452
long math run
mnoukhov Aug 12, 2025
c41cf78
Merge branch 'main' of github.com:allenai/open-instruct into olmo3
mnoukhov Aug 13, 2025
17adca8
updated dockerfile and pyproject
mnoukhov Aug 13, 2025
715e7f7
Merge branch 'main' of github.com:allenai/open-instruct into olmo3
mnoukhov Aug 19, 2025
975323f
docker Makefile
mnoukhov Aug 19, 2025
31136c4
cleaner olmo3 script
mnoukhov Aug 19, 2025
f73d342
make vllm extra dependency to allow local runs without vllm
mnoukhov Aug 22, 2025
7add82c
Merge branch 'main' of github.com:allenai/open-instruct into olmo3
mnoukhov Aug 22, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,4 @@ dmypy.json
cache/
local_dataset_cache/
scratch/
vllm_olmo3/
6 changes: 5 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,14 @@ ENV UV_CACHE_DIR=/root/.cache/uv
ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV UV_COMPILE_BYTECODE=0

# Install custom vllm for olmo3
RUN git clone -b shanea/olmo3 https://github.com/2015aroras/vllm.git vllm_olmo3

# Install dependencies
RUN --mount=type=cache,target=${UV_CACHE_DIR} \
--mount=type=bind,source=uv.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
uv sync --frozen --no-cache
uv sync --frozen --no-cache --extra vllm

RUN uv run --no-sync -m nltk.downloader punkt punkt_tab

Expand All @@ -78,6 +81,7 @@ COPY eval eval
COPY configs configs
COPY scripts scripts
COPY mason.py mason.py

COPY oe-eval-internal oe-eval-internal
COPY open_instruct open_instruct

Expand Down
9 changes: 8 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.PHONY: style quality
.PHONY: style quality docker

# make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
export PYTHONPATH = open_instruct
Expand All @@ -16,3 +16,10 @@ style-check: ## *fail* if anything needs rewriting

quality-check: ## *fail* if any rewrite was needed
uv run ruff check --exit-non-zero-on-fix $(check_dirs)

docker: ## Build the open_instruct_olmo3 image; optionally publish it to Beaker (AI2-internal)
	DOCKER_BUILDKIT=1 docker build -f Dockerfile --build-arg UV_CACHE_DIR=$(UV_CACHE_DIR) -t open_instruct_olmo3 .
# The remaining steps only work internally at AI2 (they require the beaker CLI and jq).
# $(eval)/$(shell) run when the recipe is expanded, i.e. before any recipe line
# executes, so beaker_user is available to the create command below.
	$(eval beaker_user := $(shell beaker account whoami --format json | jq -r '.[0].name'))
# To replace an existing image, delete it first:
#   beaker image delete $(beaker_user)/open_instruct_olmo3
	beaker image create open_instruct_olmo3 -n open_instruct_olmo3 -w ai2/$(beaker_user)
16 changes: 13 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,11 @@ dependencies = [
"nvitop>=1.4.2",
"packaging>=24.2",
"peft>=0.13.2",
"ray[default]>=2.44.1",
"ray[default]==2.46.0",
"setuptools>=75.6.0,<80.0.0",
"tensorboard>=2.18.0",
"torch>=2.7.0,<2.8",
"transformers>=4.52.4,<4.54.0", # see https://github.com/vllm-project/vllm-ascend/issues/2046
"vllm==0.9.1",
"transformers @ git+https://github.com/2015aroras/transformers.git@shanea/olmo3",
"wandb==0.18.1",
"langdetect==1.0.9",
"immutabledict==1.2.0",
Expand All @@ -46,12 +45,14 @@ flash-attn = [{ requirement = "torch", match-runtime = true }]

[tool.uv.extra-build-variables]
flash-attn = { FLASH_ATTENTION_SKIP_CUDA_BUILD = "TRUE" }
vllm = { VLLM_USE_PRECOMPILED = "1" }

# pytorch related setups
[tool.uv.sources]
torch = [
{ index = "pytorch-cu128", marker = "platform_system != 'Darwin'"},
]
vllm = { path = "./vllm_olmo3", editable = true }

[[tool.uv.index]]
name = "pytorch-cu128"
Expand All @@ -70,12 +71,20 @@ code = [
"pydantic>=2.0.0",
"requests>=2.28.0",
]
vllm = [
"vllm"
]

[tool.uv]
preview = true
python-preference = "only-managed"
link-mode = "hardlink"

[[tool.uv.dependency-metadata]]
name = "flash-attn"
version = "2.8.0.post2"
requires-dist = ["torch", "setuptools"]

[dependency-groups]
dev = [
"beaker-py>=1.32.2,<2.0",
Expand Down Expand Up @@ -132,6 +141,7 @@ ignore = [

[tool.ruff.lint.isort]
known-first-party = ["open-instruct"]
known-third-party = ["wandb"]
# case insensitive to match isort --profile black
case-sensitive = false
# Disable split-on-trailing-comma to work with skip-magic-trailing-comma
Expand Down
12 changes: 6 additions & 6 deletions scripts/train/debug/grpo_fast.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
uv run python open_instruct/grpo_fast.py \
python open_instruct/grpo_fast.py \
--dataset_mixer_list ai2-adapt-dev/rlvr_gsm8k_zs 64 \
--dataset_mixer_list_splits train \
--dataset_mixer_eval_list ai2-adapt-dev/rlvr_gsm8k_zs 16 \
Expand All @@ -8,24 +8,24 @@ uv run python open_instruct/grpo_fast.py \
--response_length 512 \
--pack_length 1024 \
--per_device_train_batch_size 1 \
--num_unique_prompts_rollout 8 \
--num_unique_prompts_rollout 16 \
--num_samples_per_prompt_rollout 4 \
--model_name_or_path Qwen/Qwen3-1.7B \
--model_name_or_path Qwen/Qwen3-0.6B \
--stop_strings "</answer>" \
--apply_r1_style_format_reward \
--apply_verifiable_reward true \
--temperature 0.7 \
--temperature 1.0 \
--ground_truths_key ground_truth \
--chat_template_name r1_simple_chat_postpend_think \
--learning_rate 3e-7 \
--total_episodes 200 \
--total_episodes 256 \
--deepspeed_stage 2 \
--num_epochs 1 \
--num_learners_per_node 1 \
--vllm_tensor_parallel_size 1 \
--beta 0.01 \
--seed 3 \
--local_eval_every 1 \
--num_evals 4 \
--vllm_sync_backend gloo \
--vllm_gpu_memory_utilization 0.3 \
--save_traces \
Expand Down
82 changes: 82 additions & 0 deletions scripts/train/rlvr/grpo_olmo3.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Launch a GRPO RLVR training run for OLMo3 on Beaker via mason.py.
# Overridable from the environment: EXP_NAME, NUM_GPUS.
# Any extra arguments to this script are forwarded verbatim to grpo_fast.py.

# full integration mix
# dataset_mix="saurabh5/rlvr_acecoder_filtered 63033 hamishivi/rlvr_orz_math_57k_collected 56878 hamishivi/tulu_3_rewritten_400k_string_f1_only_v2 56878 allenai/IF_multi_constraints_upto5 56878"
# math only mix
# NOTE: dataset_mix is a space-separated "<dataset> <count>" list and is expanded
# unquoted below on purpose so it splits into multiple CLI arguments.
dataset_mix="hamishivi/rlvr_orz_math_57k_collected 56878"

# all evals
# evals="minerva_math::hamish_zs_reasoning,gsm8k::zs_cot_latex,gsm8k::hamish_zs_reasoning,minerva_math_500::hamish_zs_reasoning,zebralogic::hamish_zs_reasoning,aime::hamish_zs_reasoning,agi_eval_english:0shot_cot::hamish_zs_reasoning,gpqa:0shot_cot::hamish_zs_reasoning,ifeval::hamish_zs_reasoning,popqa::hamish_zs_reasoning,mmlu:cot::hamish_zs_reasoning,alpaca_eval_v3::hamish_zs_reasoning,bbh:cot::hamish_zs_reasoning,mbppplus:0-shot-chat::tulu-thinker,codex_humanevalplus:0-shot-chat-v1::tulu-thinker"
# math evals (comma-separated, passed as a single argument)
evals="minerva_math::hamish_zs_reasoning,minerva_math_500::hamish_zs_reasoning,aime:zs_cot_r1::pass_at_32_2024_temp1,aime:zs_cot_r1::pass_at_32_2025_temp1"

# Checkpoint to start from and the short name used for experiment/GS bookkeeping.
model_name_or_path="/weka/oe-training-default/ai2-llm/checkpoints/OLMo3-midtraining/anneal-round5-100B-olmo3_7b-anneal-decon-12T-00bb6023/step47684-hf"
gs_model_name="olmo3-midtraining-round5"

exp_name="grpo_mathonly_1m_${gs_model_name}"
# Allow the caller to override the experiment name via the environment.
EXP_NAME=${EXP_NAME:-${exp_name}}

# cluster
cluster=ai2/augusta-google-1
# cluster=ai2/jupiter-cirrascale-2

NUM_GPUS=${NUM_GPUS:-8}

# The \&\& escapes are intentional: mason.py receives the literal token "&&"
# and re-assembles the remote command, so the shells must not interpret it here.
python mason.py \
    --task_name "${EXP_NAME}" \
    --cluster "${cluster}" \
    --workspace ai2/tulu-thinker \
    --priority high \
    --pure_docker_mode \
    --image michaeln/open_instruct_olmo3 \
    --preemptible \
    --num_nodes 2 \
    --env VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
    --env VLLM_ATTENTION_BACKEND="FLASH_ATTN" \
    --gs_model_name "$gs_model_name" \
    --gpus "${NUM_GPUS}" \
    --budget ai2/oe-adapt \
    -- \
    source configs/beaker_configs/ray_node_setup.sh \&\& \
    source configs/beaker_configs/code_api_setup.sh \&\& \
    python open_instruct/grpo_fast.py \
    --exp_name "${EXP_NAME}" \
    --beta 0.0 \
    --num_samples_per_prompt_rollout 16 \
    --num_unique_prompts_rollout 128 \
    --num_mini_batches 4 \
    --num_epochs 1 \
    --learning_rate 1e-6 \
    --per_device_train_batch_size 1 \
    --kl_estimator kl3 \
    --dataset_mixer_list ${dataset_mix} \
    --dataset_mixer_list_splits train \
    --dataset_mixer_eval_list hamishivi/tulu_3_rewritten_100k 32 \
    --dataset_mixer_eval_list_splits train \
    --max_token_length 8192 \
    --max_prompt_token_length 2048 \
    --response_length 6144 \
    --pack_length 8192 \
    --model_name_or_path "${model_name_or_path}" \
    --chat_template_name olmo_thinker_r1_style \
    --stop_strings "</answer>" \
    --non_stop_penalty False \
    --temperature 1.0 \
    --total_episodes 1024000 \
    --deepspeed_stage 2 \
    --num_learners_per_node 8 \
    --vllm_num_engines 8 \
    --lr_scheduler_type constant \
    --apply_verifiable_reward true \
    --seed 1 \
    --local_eval_every 25 \
    --save_freq 25 \
    --checkpoint_state_freq 25 \
    --gradient_checkpointing \
    --with_tracking \
    --vllm_enable_prefix_caching \
    --clip_higher 0.272 \
    --mask_truncated_completions True \
    --oe_eval_max_length 8192 \
    --try_launch_beaker_eval_jobs_on_weka True \
    --oe_eval_tasks "${evals}" \
    --oe_eval_beaker_image oe-eval-beaker/oe_eval_olmo3_auto "$@"
Loading
Loading