Only cache column-wise input in LayerNormLinear
Signed-off-by: Tim Moon <[email protected]>
timmoon10 committed Feb 25, 2025
1 parent 03d95e5 commit 2099726
Showing 1 changed file with 6 additions and 0 deletions.
6 changes: 6 additions & 0 deletions transformer_engine/pytorch/module/layernorm_linear.py
@@ -310,6 +310,12 @@ def forward(
         clear_tensor_data(ln_out, ln_out_total)

         if is_grad_enabled:
+
+            # Input with column-wise usage is needed for dgrad GEMM
+            if backward_needs_input:
+                if isinstance(ln_out, QuantizedTensor):
+                    ln_out.update_usage(rowwise_usage=False)
+
             if cpu_offloading:
                 if fp8 and weightmat is not None:
                     set_offloading_param(weightmat, "weight_offloading", True)
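For context on the change: Transformer Engine's quantized tensors can hold both a row-wise copy of their data (consumed by the forward GEMM) and a column-wise, transposed copy (consumed by the dgrad GEMM in backward). Once the forward GEMM has run, only the column-wise copy needs to survive in the autograd context, so dropping the row-wise copy before the input is saved for backward reduces what the cached activation pins in memory. The snippet below is a minimal, hypothetical sketch of that pattern; ToyQuantizedTensor and its buffer names are invented for illustration and are not Transformer Engine's actual QuantizedTensor implementation.

import torch

class ToyQuantizedTensor:
    """Hypothetical stand-in for Transformer Engine's QuantizedTensor."""

    def __init__(self, data: torch.Tensor):
        # Row-wise copy: consumed by the forward GEMM.
        self.rowwise_data = data.clone()
        # Column-wise (transposed) copy: consumed by the dgrad GEMM in backward.
        self.columnwise_data = data.t().contiguous()

    def update_usage(self, rowwise_usage: bool = True, columnwise_usage: bool = True):
        # Dropping a usage releases the corresponding buffer so it is not
        # kept alive in the autograd context until backward.
        if not rowwise_usage:
            self.rowwise_data = None
        if not columnwise_usage:
            self.columnwise_data = None

# After the forward GEMM, only the column-wise copy is still needed for
# dgrad, mirroring the update_usage call added in the commit above.
ln_out = ToyQuantizedTensor(torch.randn(8, 16))
ln_out.update_usage(rowwise_usage=False)
assert ln_out.rowwise_data is None and ln_out.columnwise_data is not None

Note that the commit gates the call on backward_needs_input: if the backward pass does not need the input at all, there is nothing to trim, so the usage update is skipped entirely.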
