From 417609c29b93733882fe49b779c5bc2a53f32f03 Mon Sep 17 00:00:00 2001 From: Valerie Sarge Date: Mon, 5 Aug 2024 15:45:16 -0700 Subject: [PATCH 1/3] Add default gc_interval=100 for llama and nemotron fine-tuning Signed-off-by: Valerie Sarge --- launcher_scripts/conf/peft/llama/sft.yaml | 2 ++ launcher_scripts/conf/peft/nemotron/sft.yaml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/launcher_scripts/conf/peft/llama/sft.yaml b/launcher_scripts/conf/peft/llama/sft.yaml index fecab19b2b..6549a90539 100644 --- a/launcher_scripts/conf/peft/llama/sft.yaml +++ b/launcher_scripts/conf/peft/llama/sft.yaml @@ -120,6 +120,8 @@ model: attention_dropout: 0.0 ffn_dropout: 0.0 + gc_interval: 100 + peft: peft_scheme: null # null (SFT, no PEFT), ptuning, lora restore_from_path: null diff --git a/launcher_scripts/conf/peft/nemotron/sft.yaml b/launcher_scripts/conf/peft/nemotron/sft.yaml index d534162afd..9f93d22069 100644 --- a/launcher_scripts/conf/peft/nemotron/sft.yaml +++ b/launcher_scripts/conf/peft/nemotron/sft.yaml @@ -120,6 +120,8 @@ model: attention_dropout: 0.0 ffn_dropout: 0.0 + gc_interval: 100 + peft: peft_scheme: null # null (SFT, no PEFT), ptuning, lora restore_from_path: null From f990f916724ccfbabb72c7c918f92737fccee1d9 Mon Sep 17 00:00:00 2001 From: Valerie Sarge Date: Mon, 5 Aug 2024 16:51:41 -0700 Subject: [PATCH 2/3] Add gc_interval=100 for llama2 7b + 13b pretrain Signed-off-by: Valerie Sarge --- launcher_scripts/conf/training/llama/llama2_13b.yaml | 1 + launcher_scripts/conf/training/llama/llama2_7b.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/launcher_scripts/conf/training/llama/llama2_13b.yaml b/launcher_scripts/conf/training/llama/llama2_13b.yaml index 121e370e55..fcddf113f9 100644 --- a/launcher_scripts/conf/training/llama/llama2_13b.yaml +++ b/launcher_scripts/conf/training/llama/llama2_13b.yaml @@ -139,6 +139,7 @@ model: tp_comm_atomic_ag: False tp_comm_atomic_rs: False use_flash_attention: true + gc_interval: 100 nsys_profile: enabled: False trace: [nvtx,cuda] diff --git a/launcher_scripts/conf/training/llama/llama2_7b.yaml b/launcher_scripts/conf/training/llama/llama2_7b.yaml index 00a4ec0fee..9ebc546494 100644 --- a/launcher_scripts/conf/training/llama/llama2_7b.yaml +++ b/launcher_scripts/conf/training/llama/llama2_7b.yaml @@ -139,6 +139,7 @@ model: tp_comm_atomic_ag: False tp_comm_atomic_rs: False use_flash_attention: true + gc_interval: 100 nsys_profile: enabled: False trace: [nvtx,cuda] From 7738982c26b9ca050632fac27e496ab74a95e246 Mon Sep 17 00:00:00 2001 From: Valerie Sarge Date: Mon, 5 Aug 2024 17:00:41 -0700 Subject: [PATCH 3/3] Add gc_interval=100 for nemotron pretrain configs missing it Signed-off-by: Valerie Sarge --- launcher_scripts/conf/training/nemotron/nemotron_15b.yaml | 1 + launcher_scripts/conf/training/nemotron/nemotron_340b.yaml | 3 ++- launcher_scripts/conf/training/nemotron/nemotron_4b.yaml | 1 + launcher_scripts/conf/training/nemotron/nemotron_8b.yaml | 1 + 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/launcher_scripts/conf/training/nemotron/nemotron_15b.yaml b/launcher_scripts/conf/training/nemotron/nemotron_15b.yaml index 2e9305d010..80fb0a97bc 100644 --- a/launcher_scripts/conf/training/nemotron/nemotron_15b.yaml +++ b/launcher_scripts/conf/training/nemotron/nemotron_15b.yaml @@ -158,6 +158,7 @@ model: ub_tp_comm_overlap: True tp_comm_atomic_ag: False tp_comm_atomic_rs: False + gc_interval: 100 nsys_profile: enabled: False diff --git a/launcher_scripts/conf/training/nemotron/nemotron_340b.yaml b/launcher_scripts/conf/training/nemotron/nemotron_340b.yaml index ea7c39eeeb..68eefad395 100644 --- a/launcher_scripts/conf/training/nemotron/nemotron_340b.yaml +++ b/launcher_scripts/conf/training/nemotron/nemotron_340b.yaml @@ -154,6 +154,7 @@ model: fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history fp8_wgrad: True ub_tp_comm_overlap: False + gc_interval: 100 optim: name: distributed_fused_adam @@ -188,4 +189,4 @@ model: - .0333 - ${data_dir}/my-nemotron_00_text_document - .0333 - - ${data_dir}/my-nemotron_00_text_document \ No newline at end of file + - ${data_dir}/my-nemotron_00_text_document diff --git a/launcher_scripts/conf/training/nemotron/nemotron_4b.yaml b/launcher_scripts/conf/training/nemotron/nemotron_4b.yaml index 4e529cb65b..f7996084bd 100644 --- a/launcher_scripts/conf/training/nemotron/nemotron_4b.yaml +++ b/launcher_scripts/conf/training/nemotron/nemotron_4b.yaml @@ -158,6 +158,7 @@ model: ub_tp_comm_overlap: true tp_comm_atomic_ag: False tp_comm_atomic_rs: False + gc_interval: 100 nsys_profile: enabled: False diff --git a/launcher_scripts/conf/training/nemotron/nemotron_8b.yaml b/launcher_scripts/conf/training/nemotron/nemotron_8b.yaml index 69c5e64efa..00516cccf4 100644 --- a/launcher_scripts/conf/training/nemotron/nemotron_8b.yaml +++ b/launcher_scripts/conf/training/nemotron/nemotron_8b.yaml @@ -158,6 +158,7 @@ model: ub_tp_comm_overlap: true tp_comm_atomic_ag: False tp_comm_atomic_rs: False + gc_interval: 100 nsys_profile: enabled: False