From 6aaf614409f307bd205b2a75c70450566f272884 Mon Sep 17 00:00:00 2001
From: Youngeun Kwon
Date: Wed, 26 Feb 2025 14:26:58 -0800
Subject: [PATCH 1/6] add assertion for the custom fsdp

Signed-off-by: Youngeun Kwon
---
 nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md | 2 ++
 nemo/lightning/megatron_parallel.py | 6 ++++++
 2 files changed, 8 insertions(+)

diff --git a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
index 80eb0416cc18..507ccdc715a4 100644
--- a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
+++ b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
@@ -57,6 +57,8 @@
       bucket_size: Optional[int] = None # Maximum number of parameters in each bucket
       average_in_collective: bool = False # If true, compute average in collective directly, as opposed to dividing by the dp_size first and then computing sum in the collective
       fp8_param_gather: bool = False # If true, keep the compute param in fp8 (do not use any other intermediate dtype) and perform the param all-gather in fp8
+      use_custom_fsdp: bool = False # If true, use MCore's custom FSDP implementation. Turn this option on will override recipe.model.config.gradient_accumulation_fusion to False
+      data_parallel_sharding_strategy: str = "no_shard" # Data parallel sharding strategy, choices=['no_shard', 'optim', 'optim_grads', 'optim_grads_params']
   ```

diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py
index f12cc683ee4f..a27334d7545e 100644
--- a/nemo/lightning/megatron_parallel.py
+++ b/nemo/lightning/megatron_parallel.py
@@ -54,6 +54,7 @@
 from megatron.core.transformer.transformer_config import TransformerConfig
 from torch import Tensor, nn
 from typing_extensions import override
+from nemo.utils import logging

 try:
     from megatron.core.distributed.custom_fsdp import FullyShardedDataParallel
@@ -686,6 +687,11 @@ def init_ddp(self):
             )  # We need to do this explicitly since this is a attr pytorch uses
             model_chunk.__class__.__getattr__ = getattr_proxy  # type: ignore

+            # Ensure that if using custom FSDP, gradient_accumulation_fusion is disabled on the model config.
+            if self.ddp_config.use_custom_fsdp and module.config.gradient_accumulation_fusion == True:
+                logging.warning("Setting gradient_accumulation_fusion to False as it's incompatible with custom FSDP")
+                module.config.gradient_accumulation_fusion = False
+
         # param_sync_func is set in nemo.lightning.pytorch.optim.megatron
         no_sync_func, grad_sync_func = extract_ddp_funcs(self.ddp_config, self)
         for module in self:

From da27acd917151a527ff6a38479f9221e2612e207 Mon Sep 17 00:00:00 2001
From: Youngeun Kwon
Date: Wed, 26 Feb 2025 14:31:43 -0800
Subject: [PATCH 2/6] change it into assertion

Signed-off-by: Youngeun Kwon
---
 nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md | 2 +-
 nemo/lightning/megatron_parallel.py | 7 ++-----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
index 507ccdc715a4..0ad5e381de42 100644
--- a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
+++ b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
@@ -57,7 +57,7 @@
       bucket_size: Optional[int] = None # Maximum number of parameters in each bucket
       average_in_collective: bool = False # If true, compute average in collective directly, as opposed to dividing by the dp_size first and then computing sum in the collective
       fp8_param_gather: bool = False # If true, keep the compute param in fp8 (do not use any other intermediate dtype) and perform the param all-gather in fp8
-      use_custom_fsdp: bool = False # If true, use MCore's custom FSDP implementation. Turn this option on will override recipe.model.config.gradient_accumulation_fusion to False
+      use_custom_fsdp: bool = False # If true, use MCore's custom FSDP implementation. recipe.model.config.gradient_accumulation_fusion must be False when using this
       data_parallel_sharding_strategy: str = "no_shard" # Data parallel sharding strategy, choices=['no_shard', 'optim', 'optim_grads', 'optim_grads_params']
   ```

diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py
index a27334d7545e..af400015da2f 100644
--- a/nemo/lightning/megatron_parallel.py
+++ b/nemo/lightning/megatron_parallel.py
@@ -54,7 +54,6 @@
 from megatron.core.transformer.transformer_config import TransformerConfig
 from torch import Tensor, nn
 from typing_extensions import override
-from nemo.utils import logging

 try:
     from megatron.core.distributed.custom_fsdp import FullyShardedDataParallel
@@ -624,7 +623,6 @@ def init_model_parallel(self):
             )
         if self.convert_module_fn:
             self.apply_convert_module_fn()
-
         self.init_ddp()

     def apply_convert_module_fn(self):
@@ -688,9 +686,8 @@ def init_ddp(self):
             model_chunk.__class__.__getattr__ = getattr_proxy  # type: ignore

             # Ensure that if using custom FSDP, gradient_accumulation_fusion is disabled on the model config.
-            if self.ddp_config.use_custom_fsdp and module.config.gradient_accumulation_fusion == True:
-                logging.warning("Setting gradient_accumulation_fusion to False as it's incompatible with custom FSDP")
-                module.config.gradient_accumulation_fusion = False
+            if self.ddp_config.use_custom_fsdp:
+                assert module.config.gradient_accumulation_fusion == False, "gradient_accumulation_fusion must be False when using custom FSDP"

         # param_sync_func is set in nemo.lightning.pytorch.optim.megatron
         no_sync_func, grad_sync_func = extract_ddp_funcs(self.ddp_config, self)

From 36dcfa94e94ad971959c7085ba2887d5e70a6ee8 Mon Sep 17 00:00:00 2001
From: youngeunkwon0405
Date: Wed, 26 Feb 2025 22:36:35 +0000
Subject: [PATCH 3/6] Apply isort and black reformatting

Signed-off-by: youngeunkwon0405
---
 nemo/lightning/megatron_parallel.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py
index 00e9a572dadb..d7dbc0fa1ca2 100644
--- a/nemo/lightning/megatron_parallel.py
+++ b/nemo/lightning/megatron_parallel.py
@@ -691,7 +691,9 @@ def init_ddp(self):

             # Ensure that if using custom FSDP, gradient_accumulation_fusion is disabled on the model config.
             if self.ddp_config.use_custom_fsdp:
-                assert module.config.gradient_accumulation_fusion == False, "gradient_accumulation_fusion must be False when using custom FSDP"
+                assert (
+                    module.config.gradient_accumulation_fusion == False
+                ), "gradient_accumulation_fusion must be False when using custom FSDP"

         # param_sync_func is set in nemo.lightning.pytorch.optim.megatron
         no_sync_func, grad_sync_func = extract_ddp_funcs(self.ddp_config, self)

From 2ceb6c994df571af09122924dbf83151a217c11c Mon Sep 17 00:00:00 2001
From: Youngeun Kwon
Date: Wed, 26 Feb 2025 15:11:01 -0800
Subject: [PATCH 4/6] modify the comments and assertion text

Signed-off-by: Youngeun Kwon
---
 nemo/lightning/megatron_parallel.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py
index d7dbc0fa1ca2..22dd34e77e02 100644
--- a/nemo/lightning/megatron_parallel.py
+++ b/nemo/lightning/megatron_parallel.py
@@ -689,11 +689,11 @@ def init_ddp(self):
             )  # We need to do this explicitly since this is a attr pytorch uses
             model_chunk.__class__.__getattr__ = getattr_proxy  # type: ignore

-            # Ensure that if using custom FSDP, gradient_accumulation_fusion is disabled on the model config.
+            # Ensure that if using FSDP, gradient_accumulation_fusion is disabled on the model config.
             if self.ddp_config.use_custom_fsdp:
                 assert (
                     module.config.gradient_accumulation_fusion == False
-                ), "gradient_accumulation_fusion must be False when using custom FSDP"
+                ), "gradient_accumulation_fusion cannot be used with FSDP"

         # param_sync_func is set in nemo.lightning.pytorch.optim.megatron
         no_sync_func, grad_sync_func = extract_ddp_funcs(self.ddp_config, self)

From ea1cb0e9468b857c74cafc829ba61a4e2ffe8e35 Mon Sep 17 00:00:00 2001
From: Youngeun Kwon
Date: Wed, 26 Feb 2025 15:12:17 -0800
Subject: [PATCH 5/6] modify the comments

Signed-off-by: Youngeun Kwon
---
 nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
index 0ad5e381de42..af5db187b30f 100644
--- a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
+++ b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
@@ -58,7 +58,7 @@
       average_in_collective: bool = False # If true, compute average in collective directly, as opposed to dividing by the dp_size first and then computing sum in the collective
       fp8_param_gather: bool = False # If true, keep the compute param in fp8 (do not use any other intermediate dtype) and perform the param all-gather in fp8
       use_custom_fsdp: bool = False # If true, use MCore's custom FSDP implementation. recipe.model.config.gradient_accumulation_fusion must be False when using this
-      data_parallel_sharding_strategy: str = "no_shard" # Data parallel sharding strategy, choices=['no_shard', 'optim', 'optim_grads', 'optim_grads_params']
+      data_parallel_sharding_strategy: str = "no_shard" # Sharding strategy when using custom FSDP, choices=['no_shard', 'optim', 'optim_grads', 'optim_grads_params']
   ```

From eeef8324753a46181268a5d7ce8d5a661133f74f Mon Sep 17 00:00:00 2001
From: Youngeun Kwon
Date: Wed, 26 Feb 2025 15:28:16 -0800
Subject: [PATCH 6/6] Add more arg map info

Signed-off-by: Youngeun Kwon
---
 nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
index af5db187b30f..9c5dff0e8b2b 100644
--- a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
+++ b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
@@ -59,6 +59,9 @@
       fp8_param_gather: bool = False # If true, keep the compute param in fp8 (do not use any other intermediate dtype) and perform the param all-gather in fp8
       use_custom_fsdp: bool = False # If true, use MCore's custom FSDP implementation. recipe.model.config.gradient_accumulation_fusion must be False when using this
       data_parallel_sharding_strategy: str = "no_shard" # Sharding strategy when using custom FSDP, choices=['no_shard', 'optim', 'optim_grads', 'optim_grads_params']
+      suggested_communication_unit_size: int = 400_000_000 # When using custom FSDP and batch communication is needed across multiple buckets, this variable guides the communication unit size
+      preserve_fp32_weights: bool = True # If true, preserve fp32 weights in the custom FSDP ParamAndGradBuffer
+      keep_fp8_transpose_cache_when_using_custom_fsdp: bool = False # If true, keep the fp8 transpose cache when using custom FSDP
   ```
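
Usage sketch (not part of the patches above): the snippet below illustrates how the options documented in CONFIGURATION-HIERARCHY.md might be set from a NeMo 2.0 recipe so that the assertion added in init_ddp passes. The recipe entry point (llm.llama3_8b.pretrain_recipe) and the recipe.trainer.strategy.ddp attribute path are assumptions based on common NeMo recipe layouts, and the custom-FSDP fields are only present in megatron-core builds that expose them on DistributedDataParallelConfig.

```python
# Hypothetical sketch, not part of this patch series: enable MCore's custom FSDP
# from a NeMo 2.0 recipe. The names below (llama3_8b recipe, trainer.strategy.ddp)
# are assumptions; availability of the FSDP fields depends on the installed
# megatron-core version.
from nemo.collections import llm

recipe = llm.llama3_8b.pretrain_recipe(num_nodes=1, num_gpus_per_node=8)

# The assertion added to init_ddp() requires this flag to be False whenever
# use_custom_fsdp is True, since gradient accumulation fusion is incompatible
# with custom FSDP.
recipe.model.config.gradient_accumulation_fusion = False

ddp = recipe.trainer.strategy.ddp  # DistributedDataParallelConfig
ddp.use_custom_fsdp = True
ddp.data_parallel_sharding_strategy = "optim_grads_params"  # shard optimizer state, grads, and params
```

With use_custom_fsdp left at its default of False, the assertion is never evaluated and existing recipes are unaffected.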