Add default gc_interval=100 for llama and nemotron #402

Status: Open. Wants to merge 3 commits into base branch main.
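For context on the change itself: a `gc_interval` setting of this kind makes the trainer disable Python's automatic garbage collector and instead call `gc.collect()` once every N global steps, so that GC pauses happen at predictable, synchronized points rather than at arbitrary times on individual ranks. A minimal sketch of that pattern follows; `ManualGCCallback` and `on_train_batch_end` are illustrative names for this sketch, not NeMo's actual implementation:

```python
import gc

class ManualGCCallback:
    """Sketch of interval-based garbage collection, as configured by the
    gc_interval: 100 keys in the diffs below. Hypothetical helper, not NeMo code."""

    def __init__(self, gc_interval: int = 100):
        self.gc_interval = gc_interval
        if self.gc_interval > 0:
            gc.disable()  # stop automatic collections; we collect on a schedule

    def on_train_batch_end(self, step: int) -> bool:
        # Collect once every `gc_interval` steps; returns True when a collection ran.
        if self.gc_interval > 0 and step % self.gc_interval == 0:
            gc.collect()
            return True
        return False

cb = ManualGCCallback(gc_interval=100)   # mirrors gc_interval: 100 from the diffs
for step in range(1, 201):
    cb.on_train_batch_end(step)          # collects at steps 100 and 200
gc.enable()                              # restore automatic GC when done
```

The practical benefit is that all ranks in a distributed job pay the GC cost at the same step, instead of one rank stalling the others mid-iteration.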
launcher_scripts/conf/peft/llama/sft.yaml (2 additions, 0 deletions)
@@ -120,6 +120,8 @@ model:
   attention_dropout: 0.0
   ffn_dropout: 0.0
+
+  gc_interval: 100
 
   peft:
     peft_scheme: null # null (SFT, no PEFT), ptuning, lora
     restore_from_path: null
launcher_scripts/conf/peft/nemotron/sft.yaml (2 additions, 0 deletions)
@@ -120,6 +120,8 @@ model:
   attention_dropout: 0.0
   ffn_dropout: 0.0
+
+  gc_interval: 100
 
   peft:
     peft_scheme: null # null (SFT, no PEFT), ptuning, lora
     restore_from_path: null
launcher_scripts/conf/training/llama/llama2_13b.yaml (1 addition, 0 deletions)
@@ -139,6 +139,7 @@ model:
   tp_comm_atomic_ag: False
   tp_comm_atomic_rs: False
   use_flash_attention: true
+  gc_interval: 100
   nsys_profile:
     enabled: False
     trace: [nvtx,cuda]
launcher_scripts/conf/training/llama/llama2_7b.yaml (1 addition, 0 deletions)
@@ -139,6 +139,7 @@ model:
   tp_comm_atomic_ag: False
   tp_comm_atomic_rs: False
   use_flash_attention: true
+  gc_interval: 100
   nsys_profile:
     enabled: False
     trace: [nvtx,cuda]
launcher_scripts/conf/training/nemotron/nemotron_15b.yaml (1 addition, 0 deletions)
@@ -158,6 +158,7 @@ model:
   ub_tp_comm_overlap: True
   tp_comm_atomic_ag: False
   tp_comm_atomic_rs: False
+  gc_interval: 100
 
   nsys_profile:
     enabled: False
launcher_scripts/conf/training/nemotron/nemotron_340b.yaml (2 additions, 1 deletion)
@@ -154,6 +154,7 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
+  gc_interval: 100
 
   optim:
     name: distributed_fused_adam
@@ -188,4 +189,4 @@ model:
     - .0333
     - ${data_dir}/my-nemotron_00_text_document
     - .0333
-    - ${data_dir}/my-nemotron_00_text_document
+    - ${data_dir}/my-nemotron_00_text_document
launcher_scripts/conf/training/nemotron/nemotron_4b.yaml (1 addition, 0 deletions)
@@ -158,6 +158,7 @@ model:
   ub_tp_comm_overlap: true
   tp_comm_atomic_ag: False
   tp_comm_atomic_rs: False
+  gc_interval: 100
 
   nsys_profile:
     enabled: False
launcher_scripts/conf/training/nemotron/nemotron_8b.yaml (1 addition, 0 deletions)
@@ -158,6 +158,7 @@ model:
   ub_tp_comm_overlap: true
   tp_comm_atomic_ag: False
   tp_comm_atomic_rs: False
+  gc_interval: 100
 
   nsys_profile:
     enabled: False