Commit d2945c6

Autumn1998 and yaox12 authored
[PyTorch] Use dummy wgrad in GroupedLinear (#2305)
dummy wgrad

Signed-off-by: tongliu <[email protected]>
Signed-off-by: Xin Yao <[email protected]>
Co-authored-by: Xin Yao <[email protected]>
1 parent 87cb26c commit d2945c6

File tree

1 file changed (+8, -10 lines)

transformer_engine/pytorch/module/grouped_linear.py

Lines changed: 8 additions & 10 deletions
@@ -13,6 +13,7 @@

 from transformer_engine.common.recipe import Recipe
 from .base import (
+    get_dummy_wgrad,
     get_multi_stream_cublas_workspace,
     TransformerEngineBaseModule,
     _2X_ACC_FPROP,
@@ -447,18 +448,15 @@ def handle_custom_ddp_from_mcore(weight, wgrad):
                 ):
                     weight.grad_added_to_main_grad = True
                     if getattr(weight, "zero_out_wgrad", False):
-                        wgrad = torch.zeros(
-                            weight.main_grad.shape,
-                            dtype=weight.dtype,
-                            device=torch.cuda.current_device(),
-                            requires_grad=False,
+                        wgrad = get_dummy_wgrad(
+                            list(weight.main_grad.shape),
+                            weight.dtype,
+                            zero=True,
                         )
                     else:
-                        wgrad = torch.empty(
-                            weight.main_grad.shape,
-                            dtype=weight.dtype,
-                            device=torch.cuda.current_device(),
-                            requires_grad=False,
+                        wgrad = get_dummy_wgrad(
+                            list(weight.main_grad.shape),
+                            weight.dtype,
                         )
                 elif ctx.fuse_wgrad_accumulation:
                     wgrad = None

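For context, the change swaps the per-call torch.zeros / torch.empty allocations for get_dummy_wgrad, a helper imported from .base that returns a placeholder weight-gradient tensor on the fused-wgrad-accumulation path, where the real gradient has already been accumulated into weight.main_grad and the returned tensor's contents are never read. The sketch below illustrates that caching pattern in isolation; the name get_dummy_wgrad_sketch, the module-level cache, and the CPU fallback are assumptions for the example, not the actual Transformer Engine implementation.

import torch

# Illustrative cache of dummy wgrad buffers, keyed by (shape, dtype). When
# wgrad accumulation is fused into weight.main_grad, autograd still expects a
# gradient tensor for the weight, but its values are never consumed, so one
# reusable buffer per shape/dtype avoids a fresh allocation every backward.
_DUMMY_WGRAD_CACHE = {}


def get_dummy_wgrad_sketch(shape, dtype, zero=False):
    """Return a cached placeholder gradient tensor (hypothetical helper)."""
    key = (tuple(shape), dtype)
    if key not in _DUMMY_WGRAD_CACHE:
        # Assumption: fall back to CPU so the sketch also runs without a GPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        _DUMMY_WGRAD_CACHE[key] = torch.empty(
            shape, dtype=dtype, device=device, requires_grad=False
        )
    buf = _DUMMY_WGRAD_CACHE[key]
    if zero:
        # Callers that set zero_out_wgrad expect zeros, mirroring the old
        # torch.zeros branch in the diff above.
        buf.zero_()
    return buf


# Example call shaped like the new code path in the diff:
# wgrad = get_dummy_wgrad_sketch(list(weight.main_grad.shape), weight.dtype, zero=True)

Compared with the previous code, the zero_out_wgrad branch no longer needs its own torch.zeros allocation: a shared buffer can be zeroed on demand, which appears to be the purpose of the zero=True argument in the new call.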