
Commit a9f2eb9

fix memory overhead of all gather from sequence parallel
Parent: 8dba296

3 files changed: 36 additions, 2 deletions

transformer_engine/pytorch/module/layernorm_linear.py

Lines changed: 20 additions & 2 deletions
@@ -353,8 +353,11 @@ def forward(
 
         # Deallocate GEMM input tensor if no longer needed
         if not weight.requires_grad and not return_layernorm_output:
-            ln_out = ln_out_total = None
             clear_tensor_data(ln_out, ln_out_total)
+            ln_out = ln_out_total = None
+        elif ln_out_total is not ln_out_return and not ub_overlap_ag_fprop:
+            clear_tensor_data(ln_out_total)
+            ln_out_total = None
 
         # ------------------------------------------------------
         # Prepare output tensor
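What this hunk fixes: previously the names were rebound to None before calling clear_tensor_data, so the call only ever received None and the layernorm output buffers (including the all-gathered copy used for sequence parallelism) stayed allocated. The fix clears the buffers first and then drops the references, and also frees the gathered copy when it is a separate internal tensor. A minimal sketch of the clear-then-drop pattern, using a toy stand-in for TE's clear_tensor_data:

import torch

def clear_tensor_data(*tensors):
    # Toy stand-in: release the underlying storage while keeping the
    # Python objects alive (TE's real helper also handles quantized tensors).
    for t in tensors:
        if t is not None:
            t.data = torch.Tensor()

ln_out = torch.randn(2048, 1024)
ln_out_total = torch.randn(8 * 2048, 1024)   # all-gathered copy

# Buggy order: the names are rebound to None first, so clear_tensor_data
# only sees None and the buffers survive until garbage collection.
#   ln_out = ln_out_total = None
#   clear_tensor_data(ln_out, ln_out_total)

# Fixed order: free the storage first, then drop the references.
clear_tensor_data(ln_out, ln_out_total)
ln_out = ln_out_total = None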
@@ -892,7 +895,22 @@ def wgrad_gemm(
                 del grad_bias_
 
             # Deallocate input tensor if permitted
-            if not ctx.return_layernorm_output:
+            if (
+                not ctx.return_layernorm_output
+                and not ctx.return_layernorm_output_gathered
+            ):
+                # Do not need to return layernorm output
+                clear_tensor_data(ln_out)
+            elif (
+                ctx.return_layernorm_output_gathered
+                and ctx.ln_out_needs_gather
+            ):
+                # ln_out is not the returned tensor
+                clear_tensor_data(ln_out)
+            if (
+                ctx.ln_out_needs_gather
+                and not ctx.ub_bulk_dgrad
+            ):
                 clear_tensor_data(ln_out_total)
 
             # Update grad input if overlapping reduce-scatter with wgrad GEMM
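Reading of this hunk: ln_out may be cleared whenever it is not the tensor handed back to the caller, and the all-gathered ln_out_total is an internal buffer whenever a gather was actually needed and the userbuffers bulk-dgrad overlap does not own it, so it is now freed in that case as well. A small sketch of the decision logic, with plain booleans standing in for the ctx flags used above:

def ln_out_free_plan(return_ln_out, return_ln_out_gathered,
                     ln_out_needs_gather, ub_bulk_dgrad):
    # ln_out can be cleared when it is not returned to the caller: either
    # nothing is returned, or the gathered variant is returned and ln_out
    # itself is only the local shard.
    free_ln_out = (
        (not return_ln_out and not return_ln_out_gathered)
        or (return_ln_out_gathered and ln_out_needs_gather)
    )
    # The gathered copy is internal whenever a gather happened and it was
    # not produced by the userbuffers bulk-dgrad overlap.
    free_ln_out_total = ln_out_needs_gather and not ub_bulk_dgrad
    return free_ln_out, free_ln_out_total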

transformer_engine/pytorch/module/linear.py

Lines changed: 13 additions & 0 deletions
@@ -317,6 +317,11 @@ def forward(
         # Finished forward GEMM...
         # ------------------------------------------------------
 
+        # Deallocate GEMM input tensor if no longer needed
+        if with_input_all_gather_nccl:
+            clear_tensor_data(inputmat_total)
+            inputmat_total = None
+
         # ------------------------------------------------------
         # Prepare output tensor
         # Note: Perform tensor-parallel communication
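The idea of this addition: with sequence parallelism the GEMM input is all-gathered across the tensor-parallel group (via NCCL, per the with_input_all_gather_nccl flag), and once the forward GEMM has consumed it the gathered copy only costs memory, so it is released immediately rather than surviving to the end of forward. A simplified sketch of the gather → GEMM → free pattern (no FP8, no userbuffers; tp_group and the shapes are illustrative assumptions):

import torch
import torch.distributed as dist

def sp_linear_forward(inputmat, weight, tp_group):
    world = dist.get_world_size(tp_group)
    # All-gather the sequence-parallel input shards into one GEMM input.
    inputmat_total = torch.empty(
        world * inputmat.size(0), inputmat.size(1),
        dtype=inputmat.dtype, device=inputmat.device,
    )
    dist.all_gather_into_tensor(inputmat_total, inputmat, group=tp_group)
    out = inputmat_total @ weight.t()
    # The gathered copy is world_size times the shard and is only needed
    # for this GEMM, so release its storage right away.
    inputmat_total.data = torch.Tensor()
    inputmat_total = None
    return out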
@@ -881,6 +886,14 @@ def wgrad_gemm(
             # Deallocate input tensor if permitted
             if ctx.owns_input:
                 clear_tensor_data(inputmat_total)
+            elif ctx.backward_input_needs_gather and not ctx.ub_bulk_dgrad:
+                clear_tensor_data(inputmat_total)
+
+            if (
+                ctx.parallel_mode == "row" and ctx.sequence_parallel
+                and not ctx.ub_overlap_ag
+            ):
+                clear_tensor_data(grad_output)
 
             # Update grad input if overlapping reduce-scatter with wgrad GEMM
             if ctx.ub_bulk_wgrad:
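This hunk extends the backward-side cleanup: the gathered input is also freed when it was re-gathered just for the wgrad GEMM (and not owned by the userbuffers bulk-dgrad overlap), and for a row-parallel sequence-parallel layer the gathered grad_output is freed once both backward GEMMs have consumed it. A rough sketch of that grad_output lifetime (illustrative shapes, no communication/compute overlap):

import torch
import torch.distributed as dist

def sp_row_linear_backward(grad_output_shard, inputmat_total, weight, tp_group):
    world = dist.get_world_size(tp_group)
    # Row-parallel + sequence-parallel: the incoming grad is sharded along
    # the sequence dimension and must be gathered before the GEMMs.
    grad_output = torch.empty(
        world * grad_output_shard.size(0), grad_output_shard.size(1),
        dtype=grad_output_shard.dtype, device=grad_output_shard.device,
    )
    dist.all_gather_into_tensor(grad_output, grad_output_shard, group=tp_group)
    dgrad = grad_output @ weight               # gradient w.r.t. input shard
    wgrad = grad_output.t() @ inputmat_total   # gradient w.r.t. weight shard
    # Both consumers of the gathered grad_output have run; drop its storage
    # now instead of holding world_size times the shard until backward ends.
    grad_output.data = torch.Tensor()
    return dgrad, wgrad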

transformer_engine/pytorch/tensor/_internal/float8_blockwise_tensor_base.py

Lines changed: 3 additions & 0 deletions
@@ -349,9 +349,12 @@ def _create_columnwise(self):
     def _transpose_columnwise_data(self):
         """Plainly transpose the columnwise data and scale inv."""
        if self._columnwise_data is not None:
+            _old_data = self._columnwise_data
             self._columnwise_data = tex.fp8_transpose(
                 self._columnwise_data, self._fp8_dtype, out=None
             )
+            _old_data.data = _empty_tensor()
+            del _old_data
 
     def __repr__(self):
         if self._rowwise_data is not None:
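Context for this change: tex.fp8_transpose with out=None allocates a fresh buffer, so after the attribute is rebound the pre-transpose columnwise data is dead weight; blanking its .data releases that storage promptly even if some other Python object still holds a reference to the old tensor (TE's _empty_tensor() returns an empty tensor for this purpose). A toy illustration of the same pattern with plain torch tensors:

import torch

old = torch.randn(4096, 4096)
alias = old                      # some other holder of the same tensor object

# Rebinding one name to the transposed copy does not free the original
# storage while alias still points at the old tensor object.
new = old.t().contiguous()

# Blanking .data on the old object releases its storage for every holder.
old.data = torch.Tensor()
del old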
