diff --git a/src/cookbook/recipes/olmo3-midtraining/baseline-olmo3_7b-step289000-anneal-10B-dolma2-round1.yaml b/src/cookbook/recipes/olmo3-midtraining/baseline-olmo3_7b-step289000-anneal-10B-dolma2-round1.yaml
new file mode 100644
index 00000000..4b7640df
--- /dev/null
+++ b/src/cookbook/recipes/olmo3-midtraining/baseline-olmo3_7b-step289000-anneal-10B-dolma2-round1.yaml
@@ -0,0 +1,67 @@
+name: "baseline-olmo3_7b-step289000-anneal-10B-dolma2-round1"
+description: "Baseline: OLMo3 7B step 289000 (~4.03T tokens) anneal to 10B Tokens with dolma2 midtraining mix (round 1)"
+budget: "ai2/oe-base"
+workspace: "ai2/oe-data"
+nodes: 4
+gpus: 8
+preemptible: true
+max_tokens: 10_000_000_000
+global_batch_size: 2097152
+sequence_length: 8192
+seed: 1337
+model: "olmo2_7B_swafix"
+tokenizer: "dolma2"
+priority: high
+cluster: ai2/augusta-google-1
+rank_microbatch_size: 16384
+scheduler_type: linear
+warmup_steps: 0
+activation_checkpointing: true
+annealing:
+  enabled: true
+load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000
+load_state: false
+dataset:
+  sources:
+    - name: hqweb
+      target_ratio: 0.45
+      paths:
+        - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy
+    - name: code
+      target_ratio: 0.2
+      paths:
+        - s3://ai2-llm/preprocessed/stackedu-fim-20pct-natural/allenai/dolma2-tokenizer/*.npy
+    - name: finemath
+      # 10% less the .0195 over 10% for dolminos2math
+      target_ratio: 0.0806
+      paths:
+        - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy
+    - name: dolminos2math
+      target_ratio: 0.1194
+      paths:
+        # 10.7B
+        - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy
+        # 1.25B
+        - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy
+        - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy
+        - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy
+    - name: reddit
+      target_ratio: 0.089
+      paths:
+        - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy
+    - name: instruction
+      target_ratio: 0.011
+      paths:
+        - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy
+    - name: r1_reasoning
+      target_ratio: 0.02375
+      paths:
+        - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy
+    - name: qwq_reasoning
+      target_ratio: 0.02375
+      paths:
+        - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy
+    - name: gemini_reasoning
+      target_ratio: 0.0025
+      paths:
+        - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy
diff --git a/src/cookbook/recipes/olmo3-midtraining/baseline-olmo3_7b-step289000-anneal-10B-dolma2.yaml b/src/cookbook/recipes/olmo3-midtraining/baseline-olmo3_7b-step289000-anneal-10B-dolma2.yaml
new file mode 100644
index 00000000..c563dcd4
--- /dev/null
+++ b/src/cookbook/recipes/olmo3-midtraining/baseline-olmo3_7b-step289000-anneal-10B-dolma2.yaml
@@ -0,0 +1,34 @@
+name: "baseline-olmo3_7b-step289000-anneal-10B-dolma2"
+description: "Baseline: OLMo3 7B step 289000 (~4.03T tokens) anneal to 10B Tokens with dolma2 mix"
+budget: "ai2/oe-base"
+workspace: "ai2/oe-data"
+nodes: 4
+gpus: 8
+preemptible: true
+max_tokens: 10_000_000_000
+global_batch_size: 2097152
+sequence_length: 8192
+seed: 1337
+model: "olmo2_7B_swafix"
+tokenizer: "dolma2"
+priority: high
+cluster: ai2/augusta-google-1
+rank_microbatch_size: 16384
+scheduler_type: linear
+warmup_steps: 0
+activation_checkpointing: true
+annealing:
+  enabled: true
+load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000
+load_state: false
+dataset:
+  sources:
+    - name: dolma2-0625-v0.1
+      target_ratio: 1.0
+      paths:
+        - gs://ai2-llm/preprocessed/dolma2-0625/v0.1/allenai/dolma2-tokenizer/all-dressed-snazzy2/*/*.npy
+        - gs://ai2-llm/preprocessed/dolma2-0625/v0.1/allenai/dolma2-tokenizer/arxiv/*.npy
+        - gs://ai2-llm/preprocessed/dolma2-0625/v0.1/allenai/dolma2-tokenizer/finemath-3plus/*.npy
+        - gs://ai2-llm/preprocessed/dolma2-0625/v0.1/allenai/dolma2-tokenizer/s2pdf/*/*.npy
+        - gs://ai2-llm/preprocessed/dolma2-0625/v0.1/allenai/dolma2-tokenizer/stack-edu/*/*.npy
+        - gs://ai2-llm/preprocessed/dolma2-0625/v0.1/allenai/dolma2-tokenizer/wikipedia/*.npy
\ No newline at end of file
diff --git a/src/cookbook/recipes/olmo3-midtraining/baseline-olmo3_7b-step527000-anneal-10B-dolma2-round1.yaml b/src/cookbook/recipes/olmo3-midtraining/baseline-olmo3_7b-step527000-anneal-10B-dolma2-round1.yaml
new file mode 100644
index 00000000..d189b049
--- /dev/null
+++ b/src/cookbook/recipes/olmo3-midtraining/baseline-olmo3_7b-step527000-anneal-10B-dolma2-round1.yaml
@@ -0,0 +1,67 @@
+name: "baseline-olmo3_7b-step527000-anneal-10B-dolma2-round1"
+description: "Baseline: OLMo3 7B step 527000 (~8T tokens) anneal to 10B Tokens with dolma2 midtraining mix (round 1)"
+budget: "ai2/oe-base"
+workspace: "ai2/oe-data"
+nodes: 4
+gpus: 8
+preemptible: true
+max_tokens: 10_000_000_000
+global_batch_size: 2097152
+sequence_length: 8192
+seed: 1337
+model: "olmo2_7B_swafix"
+tokenizer: "dolma2"
+priority: high
+cluster: ai2/augusta-google-1
+rank_microbatch_size: 16384
+scheduler_type: linear
+warmup_steps: 0
+activation_checkpointing: true
+annealing:
+  enabled: true
+load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step527000
+load_state: false
+dataset:
+  sources:
+    - name: hqweb
+      target_ratio: 0.45
+      paths:
+        - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy
+    - name: code
+      target_ratio: 0.2
+      paths:
+        - s3://ai2-llm/preprocessed/stackedu-fim-20pct-natural/allenai/dolma2-tokenizer/*.npy
+    - name: finemath
+      # 10% less the .0195 over 10% for dolminos2math
+      target_ratio: 0.0806
+      paths:
+        - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy
+    - name: dolminos2math
+      target_ratio: 0.1194
+      paths:
+        # 10.7B
+        - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy
+        # 1.25B
+        - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy
+        - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy
+        - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy
+    - name: reddit
+      target_ratio: 0.089
+      paths:
+        - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy
+    - name: instruction
+      target_ratio: 0.011
+      paths:
+        - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy
+    - name: r1_reasoning
+      target_ratio: 0.02375
+      paths:
+        - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy
+    - name: qwq_reasoning
+      target_ratio: 0.02375
+      paths:
+        - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy
+    - name: gemini_reasoning
+      target_ratio: 0.0025
+      paths:
+        - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy
diff --git a/src/cookbook/recipes/olmo3-midtraining/baseline-olmo3_7b-step527000-anneal-10B-dolma2.yaml b/src/cookbook/recipes/olmo3-midtraining/baseline-olmo3_7b-step527000-anneal-10B-dolma2.yaml
new file mode 100644
index 00000000..8393b08e
--- /dev/null
+++ b/src/cookbook/recipes/olmo3-midtraining/baseline-olmo3_7b-step527000-anneal-10B-dolma2.yaml
@@ -0,0 +1,34 @@
+name: "baseline-olmo3_7b-step527000-anneal-10B-dolma2"
+description: "Baseline: OLMo3 7B step 527000 (~8T tokens) anneal to 10B Tokens with dolma2 mix"
+budget: "ai2/oe-base"
+workspace: "ai2/oe-data"
+nodes: 4
+gpus: 8
+preemptible: true
+max_tokens: 10_000_000_000
+global_batch_size: 2097152
+sequence_length: 8192
+seed: 1337
+model: "olmo2_7B_swafix"
+tokenizer: "dolma2"
+priority: high
+cluster: ai2/augusta-google-1
+rank_microbatch_size: 16384
+scheduler_type: linear
+warmup_steps: 0
+activation_checkpointing: true
+annealing:
+  enabled: true
+load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step527000
+load_state: false
+dataset:
+  sources:
+    - name: dolma2-0625-v0.1
+      target_ratio: 1.0
+      paths:
+        - gs://ai2-llm/preprocessed/dolma2-0625/v0.1/allenai/dolma2-tokenizer/all-dressed-snazzy2/*/*.npy
+        - gs://ai2-llm/preprocessed/dolma2-0625/v0.1/allenai/dolma2-tokenizer/arxiv/*.npy
+        - gs://ai2-llm/preprocessed/dolma2-0625/v0.1/allenai/dolma2-tokenizer/finemath-3plus/*.npy
+        - gs://ai2-llm/preprocessed/dolma2-0625/v0.1/allenai/dolma2-tokenizer/s2pdf/*/*.npy
+        - gs://ai2-llm/preprocessed/dolma2-0625/v0.1/allenai/dolma2-tokenizer/stack-edu/*/*.npy
+        - gs://ai2-llm/preprocessed/dolma2-0625/v0.1/allenai/dolma2-tokenizer/wikipedia/*.npy
\ No newline at end of file
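
Note: the target_ratio values in the two round-1 midtraining recipes above are intended to sum to 1.0, with finemath (0.0806) and dolminos2math (0.1194) together filling a 0.2 math budget. The following is a minimal, illustrative Python sketch for sanity-checking that property; the script and its names are not part of the cookbook, and the ratios are copied verbatim from the YAML above.

# Illustrative sanity check (not part of the cookbook): the round-1 mix
# target_ratio values, copied from the recipes above, should sum to 1.0.
round1_mix = {
    "hqweb": 0.45,
    "code": 0.2,
    "finemath": 0.0806,        # together with dolminos2math fills the 0.2 math budget
    "dolminos2math": 0.1194,
    "reddit": 0.089,
    "instruction": 0.011,
    "r1_reasoning": 0.02375,
    "qwq_reasoning": 0.02375,
    "gemini_reasoning": 0.0025,
}

total = sum(round1_mix.values())
assert abs(total - 1.0) < 1e-6, f"mix ratios sum to {total}, expected 1.0"
print(f"round-1 midtraining mix ratios sum to {total:.5f}")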