From 6aaf614409f307bd205b2a75c70450566f272884 Mon Sep 17 00:00:00 2001
From: Youngeun Kwon
Date: Wed, 26 Feb 2025 14:26:58 -0800
Subject: [PATCH 1/6] add assertion for the custom fsdp

Signed-off-by: Youngeun Kwon
---
 nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md | 2 ++
 nemo/lightning/megatron_parallel.py | 6 ++++++
 2 files changed, 8 insertions(+)

diff --git a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
index 80eb0416cc18..507ccdc715a4 100644
--- a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
+++ b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
@@ -57,6 +57,8 @@
       bucket_size: Optional[int] = None # Maximum number of parameters in each bucket
       average_in_collective: bool = False # If true, compute average in collective directly, as opposed to dividing by the dp_size first and then computing sum in the collective
       fp8_param_gather: bool = False # If true, keep the compute param in fp8 (do not use any other intermediate dtype) and perform the param all-gather in fp8
+      use_custom_fsdp: bool = False # If true, use MCore's custom FSDP implementation. Turn this option on will override recipe.model.config.gradient_accumulation_fusion to False
+      data_parallel_sharding_strategy: str = "no_shard" # Data parallel sharding strategy, choices=['no_shard', 'optim', 'optim_grads', 'optim_grads_params']
   ```

diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py
index f12cc683ee4f..a27334d7545e 100644
--- a/nemo/lightning/megatron_parallel.py
+++ b/nemo/lightning/megatron_parallel.py
@@ -54,6 +54,7 @@
 from megatron.core.transformer.transformer_config import TransformerConfig
 from torch import Tensor, nn
 from typing_extensions import override
+from nemo.utils import logging

 try:
     from megatron.core.distributed.custom_fsdp import FullyShardedDataParallel
@@ -686,6 +687,11 @@ def init_ddp(self):
             )  # We need to do this explicitly since this is a attr pytorch uses
             model_chunk.__class__.__getattr__ = getattr_proxy  # type: ignore

+            # Ensure that if using custom FSDP, gradient_accumulation_fusion is disabled on the model config.
+            if self.ddp_config.use_custom_fsdp and module.config.gradient_accumulation_fusion == True:
+                logging.warning("Setting gradient_accumulation_fusion to False as it's incompatible with custom FSDP")
+                module.config.gradient_accumulation_fusion = False
+
         # param_sync_func is set in nemo.lightning.pytorch.optim.megatron
         no_sync_func, grad_sync_func = extract_ddp_funcs(self.ddp_config, self)
         for module in self:

From da27acd917151a527ff6a38479f9221e2612e207 Mon Sep 17 00:00:00 2001
From: Youngeun Kwon
Date: Wed, 26 Feb 2025 14:31:43 -0800
Subject: [PATCH 2/6] change it into assertion

Signed-off-by: Youngeun Kwon
---
 nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md | 2 +-
 nemo/lightning/megatron_parallel.py | 7 ++-----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
index 507ccdc715a4..0ad5e381de42 100644
--- a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
+++ b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
@@ -57,7 +57,7 @@
       bucket_size: Optional[int] = None # Maximum number of parameters in each bucket
       average_in_collective: bool = False # If true, compute average in collective directly, as opposed to dividing by the dp_size first and then computing sum in the collective
       fp8_param_gather: bool = False # If true, keep the compute param in fp8 (do not use any other intermediate dtype) and perform the param all-gather in fp8
-      use_custom_fsdp: bool = False # If true, use MCore's custom FSDP implementation. Turn this option on will override recipe.model.config.gradient_accumulation_fusion to False
+      use_custom_fsdp: bool = False # If true, use MCore's custom FSDP implementation. recipe.model.config.gradient_accumulation_fusion must be False when using this
       data_parallel_sharding_strategy: str = "no_shard" # Data parallel sharding strategy, choices=['no_shard', 'optim', 'optim_grads', 'optim_grads_params']
   ```

diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py
index a27334d7545e..af400015da2f 100644
--- a/nemo/lightning/megatron_parallel.py
+++ b/nemo/lightning/megatron_parallel.py
@@ -54,7 +54,6 @@
 from megatron.core.transformer.transformer_config import TransformerConfig
 from torch import Tensor, nn
 from typing_extensions import override
-from nemo.utils import logging

 try:
     from megatron.core.distributed.custom_fsdp import FullyShardedDataParallel
@@ -624,7 +623,6 @@ def init_model_parallel(self):
             )
         if self.convert_module_fn:
             self.apply_convert_module_fn()
-
         self.init_ddp()

     def apply_convert_module_fn(self):
@@ -688,9 +686,8 @@ def init_ddp(self):
             model_chunk.__class__.__getattr__ = getattr_proxy  # type: ignore

             # Ensure that if using custom FSDP, gradient_accumulation_fusion is disabled on the model config.
-            if self.ddp_config.use_custom_fsdp and module.config.gradient_accumulation_fusion == True:
-                logging.warning("Setting gradient_accumulation_fusion to False as it's incompatible with custom FSDP")
-                module.config.gradient_accumulation_fusion = False
+            if self.ddp_config.use_custom_fsdp:
+                assert module.config.gradient_accumulation_fusion == False, "gradient_accumulation_fusion must be False when using custom FSDP"

         # param_sync_func is set in nemo.lightning.pytorch.optim.megatron
         no_sync_func, grad_sync_func = extract_ddp_funcs(self.ddp_config, self)

From 36dcfa94e94ad971959c7085ba2887d5e70a6ee8 Mon Sep 17 00:00:00 2001
From: youngeunkwon0405
Date: Wed, 26 Feb 2025 22:36:35 +0000
Subject: [PATCH 3/6] Apply isort and black reformatting

Signed-off-by: youngeunkwon0405
---
 nemo/lightning/megatron_parallel.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py
index 00e9a572dadb..d7dbc0fa1ca2 100644
--- a/nemo/lightning/megatron_parallel.py
+++ b/nemo/lightning/megatron_parallel.py
@@ -691,7 +691,9 @@ def init_ddp(self):

             # Ensure that if using custom FSDP, gradient_accumulation_fusion is disabled on the model config.
             if self.ddp_config.use_custom_fsdp:
-                assert module.config.gradient_accumulation_fusion == False, "gradient_accumulation_fusion must be False when using custom FSDP"
+                assert (
+                    module.config.gradient_accumulation_fusion == False
+                ), "gradient_accumulation_fusion must be False when using custom FSDP"

         # param_sync_func is set in nemo.lightning.pytorch.optim.megatron
         no_sync_func, grad_sync_func = extract_ddp_funcs(self.ddp_config, self)

From 2ceb6c994df571af09122924dbf83151a217c11c Mon Sep 17 00:00:00 2001
From: Youngeun Kwon
Date: Wed, 26 Feb 2025 15:11:01 -0800
Subject: [PATCH 4/6] modify the comments and assertion text

Signed-off-by: Youngeun Kwon
---
 nemo/lightning/megatron_parallel.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py
index d7dbc0fa1ca2..22dd34e77e02 100644
--- a/nemo/lightning/megatron_parallel.py
+++ b/nemo/lightning/megatron_parallel.py
@@ -689,11 +689,11 @@ def init_ddp(self):
             )  # We need to do this explicitly since this is a attr pytorch uses
             model_chunk.__class__.__getattr__ = getattr_proxy  # type: ignore

-            # Ensure that if using custom FSDP, gradient_accumulation_fusion is disabled on the model config.
+            # Ensure that if using FSDP, gradient_accumulation_fusion is disabled on the model config.
             if self.ddp_config.use_custom_fsdp:
                 assert (
                     module.config.gradient_accumulation_fusion == False
-                ), "gradient_accumulation_fusion must be False when using custom FSDP"
+                ), "gradient_accumulation_fusion cannot be used with FSDP"

         # param_sync_func is set in nemo.lightning.pytorch.optim.megatron
         no_sync_func, grad_sync_func = extract_ddp_funcs(self.ddp_config, self)

From ea1cb0e9468b857c74cafc829ba61a4e2ffe8e35 Mon Sep 17 00:00:00 2001
From: Youngeun Kwon
Date: Wed, 26 Feb 2025 15:12:17 -0800
Subject: [PATCH 5/6] modify the comments

Signed-off-by: Youngeun Kwon
---
 nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
index 0ad5e381de42..af5db187b30f 100644
--- a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
+++ b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
@@ -58,7 +58,7 @@
       average_in_collective: bool = False # If true, compute average in collective directly, as opposed to dividing by the dp_size first and then computing sum in the collective
       fp8_param_gather: bool = False # If true, keep the compute param in fp8 (do not use any other intermediate dtype) and perform the param all-gather in fp8
       use_custom_fsdp: bool = False # If true, use MCore's custom FSDP implementation. recipe.model.config.gradient_accumulation_fusion must be False when using this
-      data_parallel_sharding_strategy: str = "no_shard" # Data parallel sharding strategy, choices=['no_shard', 'optim', 'optim_grads', 'optim_grads_params']
+      data_parallel_sharding_strategy: str = "no_shard" # Sharding strategy when using custom FSDP, choices=['no_shard', 'optim', 'optim_grads', 'optim_grads_params']
   ```

From eeef8324753a46181268a5d7ce8d5a661133f74f Mon Sep 17 00:00:00 2001
From: Youngeun Kwon
Date: Wed, 26 Feb 2025 15:28:16 -0800
Subject: [PATCH 6/6] Add more arg map info

Signed-off-by: Youngeun Kwon
---
 nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
index af5db187b30f..9c5dff0e8b2b 100644
--- a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
+++ b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
@@ -59,6 +59,9 @@
       fp8_param_gather: bool = False # If true, keep the compute param in fp8 (do not use any other intermediate dtype) and perform the param all-gather in fp8
       use_custom_fsdp: bool = False # If true, use MCore's custom FSDP implementation. recipe.model.config.gradient_accumulation_fusion must be False when using this
       data_parallel_sharding_strategy: str = "no_shard" # Sharding strategy when using custom FSDP, choices=['no_shard', 'optim', 'optim_grads', 'optim_grads_params']
+      suggested_communication_unit_size: int = 400_000_000 # When using custom FSDP and batch communication is needed across multiple buckets, this variable guides the communication unit size
+      preserve_fp32_weights: bool = True # If true, preserve fp32 weights in the custom FSDP ParamAndGradBuffer
+      keep_fp8_transpose_cache_when_using_custom_fsdp: bool = False # If true, keep the fp8 transpose cache when using custom FSDP
   ```
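
Usage sketch (not part of the patches above): the snippet below illustrates how the options documented in CONFIGURATION-HIERARCHY.md might be set from a NeMo 2.0 recipe so that the assertion added in init_ddp passes. The recipe entry point (llm.llama3_8b.pretrain_recipe) and the recipe.trainer.strategy.ddp attribute path are assumptions based on common NeMo recipe layouts, and the custom-FSDP fields are only present in megatron-core builds that expose them on DistributedDataParallelConfig.

```python
# Hypothetical sketch, not part of this patch series: enable MCore's custom FSDP
# from a NeMo 2.0 recipe. The names below (llama3_8b recipe, trainer.strategy.ddp)
# are assumptions; availability of the FSDP fields depends on the installed
# megatron-core version.
from nemo.collections import llm

recipe = llm.llama3_8b.pretrain_recipe(num_nodes=1, num_gpus_per_node=8)

# The assertion added to init_ddp() requires this flag to be False whenever
# use_custom_fsdp is True, since gradient accumulation fusion is incompatible
# with custom FSDP.
recipe.model.config.gradient_accumulation_fusion = False

ddp = recipe.trainer.strategy.ddp  # DistributedDataParallelConfig
ddp.use_custom_fsdp = True
ddp.data_parallel_sharding_strategy = "optim_grads_params"  # shard optimizer state, grads, and params
```

With use_custom_fsdp left at its default of False, the assertion is never evaluated and existing recipes are unaffected.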