From 1ed3bf8338750f31f05503d27f003545bdefd252 Mon Sep 17 00:00:00 2001
From: Evan Smothers
Date: Tue, 3 Dec 2024 13:56:29 -0800
Subject: [PATCH 1/4] Set gloo process group for FSDP with CPU offload

---
 recipes/full_finetune_distributed.py          | 5 ++++-
 recipes/knowledge_distillation_distributed.py | 4 +++-
 recipes/lora_dpo_distributed.py               | 4 +++-
 recipes/lora_finetune_distributed.py          | 3 ++-
 recipes/qat_distributed.py                    | 4 +++-
 5 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py
index 2e000cc67a..3a174796fb 100644
--- a/recipes/full_finetune_distributed.py
+++ b/recipes/full_finetune_distributed.py
@@ -946,11 +946,14 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-    init_process_group(backend="gloo" if cfg.device == "cpu" else "nccl")
+
+    process_group = "gloo" if cfg.device == "cpu" else "nccl"
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
+        process_group = "cuda:nccl,cpu:gloo"
+    init_process_group(backend=process_group)
 
     config.log_config(recipe_name="FullFinetuneRecipeDistributed", cfg=cfg)
 
diff --git a/recipes/knowledge_distillation_distributed.py b/recipes/knowledge_distillation_distributed.py
index c920f4b069..a33524db23 100644
--- a/recipes/knowledge_distillation_distributed.py
+++ b/recipes/knowledge_distillation_distributed.py
@@ -971,11 +971,13 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
+    process_group = "gloo" if cfg.device == "cpu" else "nccl"
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-    init_process_group(backend="gloo" if cfg.device == "cpu" else "nccl")
+        process_group = "cuda:nccl,cpu:gloo"
+    init_process_group(backend=process_group)
 
     config.log_config(recipe_name="KDRecipeDistributed", cfg=cfg)
 
diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py
index 7f6b0a8394..8c7e03cb23 100644
--- a/recipes/lora_dpo_distributed.py
+++ b/recipes/lora_dpo_distributed.py
@@ -782,11 +782,13 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
+    process_group = "gloo" if cfg.device == "cpu" else "nccl"
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-    init_process_group(backend="gloo" if cfg.device == "cpu" else "nccl")
+        process_group = "cuda:nccl,cpu:gloo"
+    init_process_group(backend=process_group)
 
     config.log_config(recipe_name="LoRADPORecipeDistributed", cfg=cfg)
 
diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py
index 5aef0e2e97..407ce75450 100644
--- a/recipes/lora_finetune_distributed.py
+++ b/recipes/lora_finetune_distributed.py
@@ -924,7 +924,8 @@ def recipe_main(cfg: DictConfig) -> None:
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-    init_process_group(backend="gloo" if cfg.device == "cpu" else "nccl")
+        process_group = "cuda:nccl,cpu:gloo"
+    init_process_group(backend=process_group)
 
     config.log_config(recipe_name="LoRAFinetuneRecipeDistributed", cfg=cfg)
 
diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py
index e7df34d97c..f833f967cf 100644
--- a/recipes/qat_distributed.py
+++ b/recipes/qat_distributed.py
@@ -935,11 +935,13 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-    init_process_group(backend="gloo" if cfg.device == "cpu" else "nccl")
+    process_group = "gloo" if cfg.device == "cpu" else "nccl"
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
+        process_group = "cuda:nccl,cpu:gloo"
+    init_process_group(backend=process_group)
 
     config.log_config(recipe_name="QATRecipeDistributed", cfg=cfg)
 

From 7764a6d119508aaeaa8d5cdd01489a834b3370d6 Mon Sep 17 00:00:00 2001
From: Evan Smothers
Date: Tue, 3 Dec 2024 13:57:56 -0800
Subject: [PATCH 2/4] cleanup

---
 recipes/full_finetune_distributed.py | 1 -
 recipes/lora_finetune_distributed.py | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py
index 3a174796fb..d23a05bdb2 100644
--- a/recipes/full_finetune_distributed.py
+++ b/recipes/full_finetune_distributed.py
@@ -946,7 +946,6 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-
     process_group = "gloo" if cfg.device == "cpu" else "nccl"
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py
index 407ce75450..e73f6a1417 100644
--- a/recipes/lora_finetune_distributed.py
+++ b/recipes/lora_finetune_distributed.py
@@ -920,6 +920,7 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
+    process_group = "gloo" if cfg.device == "cpu" else "nccl"
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU

From 1b992aacb45824cf6b313585bb0080a260900c31 Mon Sep 17 00:00:00 2001
From: Evan Smothers
Date: Tue, 3 Dec 2024 17:37:39 -0800
Subject: [PATCH 3/4] Remove special handling for CPU recipe tests

---
 recipes/full_finetune_distributed.py          | 4 +---
 recipes/knowledge_distillation_distributed.py | 4 +---
 recipes/lora_dpo_distributed.py               | 4 +---
 recipes/lora_finetune_distributed.py          | 4 +---
 recipes/qat_distributed.py                    | 4 +---
 5 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py
index d23a05bdb2..4a227701d7 100644
--- a/recipes/full_finetune_distributed.py
+++ b/recipes/full_finetune_distributed.py
@@ -946,13 +946,11 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-    process_group = "gloo" if cfg.device == "cpu" else "nccl"
+    init_process_group("cuda:nccl,cpu:gloo")
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-        process_group = "cuda:nccl,cpu:gloo"
-    init_process_group(backend=process_group)
 
     config.log_config(recipe_name="FullFinetuneRecipeDistributed", cfg=cfg)
 
diff --git a/recipes/knowledge_distillation_distributed.py b/recipes/knowledge_distillation_distributed.py
index a33524db23..d74bc40e2b 100644
--- a/recipes/knowledge_distillation_distributed.py
+++ b/recipes/knowledge_distillation_distributed.py
@@ -971,13 +971,11 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-    process_group = "gloo" if cfg.device == "cpu" else "nccl"
+    init_process_group("cuda:nccl,cpu:gloo")
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-        process_group = "cuda:nccl,cpu:gloo"
-    init_process_group(backend=process_group)
 
     config.log_config(recipe_name="KDRecipeDistributed", cfg=cfg)
 
diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py
index 8c7e03cb23..993fd2ac1f 100644
--- a/recipes/lora_dpo_distributed.py
+++ b/recipes/lora_dpo_distributed.py
@@ -782,13 +782,11 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-    process_group = "gloo" if cfg.device == "cpu" else "nccl"
+    init_process_group("cuda:nccl,cpu:gloo")
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-        process_group = "cuda:nccl,cpu:gloo"
-    init_process_group(backend=process_group)
 
     config.log_config(recipe_name="LoRADPORecipeDistributed", cfg=cfg)
 
diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py
index e73f6a1417..e95fbb40c6 100644
--- a/recipes/lora_finetune_distributed.py
+++ b/recipes/lora_finetune_distributed.py
@@ -920,13 +920,11 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-    process_group = "gloo" if cfg.device == "cpu" else "nccl"
+    init_process_group("cuda:nccl,cpu:gloo")
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-        process_group = "cuda:nccl,cpu:gloo"
-    init_process_group(backend=process_group)
 
     config.log_config(recipe_name="LoRAFinetuneRecipeDistributed", cfg=cfg)
 
diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py
index f833f967cf..e005dc0247 100644
--- a/recipes/qat_distributed.py
+++ b/recipes/qat_distributed.py
@@ -935,13 +935,11 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-    process_group = "gloo" if cfg.device == "cpu" else "nccl"
+    init_process_group("cuda:nccl,cpu:gloo")
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-        process_group = "cuda:nccl,cpu:gloo"
-    init_process_group(backend=process_group)
 
     config.log_config(recipe_name="QATRecipeDistributed", cfg=cfg)
 

From e4f00c44a7b284bd4b6ff520ef3414481dab3688 Mon Sep 17 00:00:00 2001
From: Evan Smothers
Date: Wed, 4 Dec 2024 07:29:35 -0800
Subject: [PATCH 4/4] add test case

---
 tests/recipes/test_full_finetune_distributed.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/recipes/test_full_finetune_distributed.py b/tests/recipes/test_full_finetune_distributed.py
index 9c8d0eacd5..fd235899da 100644
--- a/tests/recipes/test_full_finetune_distributed.py
+++ b/tests/recipes/test_full_finetune_distributed.py
@@ -103,6 +103,8 @@ def test_loss(
         # should be the same.
         if not optim_in_bwd:
             cmd.append("clip_grad_norm=100")
+            # Test that gradient clipping works with CPU offload
+            cmd.append("fsdp_cpu_offload=True")
         else:
             cmd.append("optimizer_in_bwd=True")
 