Use log1p(x) instead of log(1+x) (#1401)
torch.log1p() is more accurate than torch.log() for small input values: https://pytorch.org/docs/stable/generated/torch.log1p.html

Found with TorchFix https://github.com/pytorch-labs/torchfix/
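
As a rough illustration (not part of this change): for a small float32 argument, 1 + x rounds back to 1.0, so torch.log(1 + x) returns 0.0 while torch.log1p(x) keeps the value. The -20.0 below is an arbitrary choice for demonstration.

import torch

# Sketch only, not from the commit: compare log(1 + x) and log1p(x)
# for a tiny x, where float32 cannot represent 1 + x distinctly from 1.0.
x = torch.exp(torch.tensor(-20.0, dtype=torch.float32))  # ~2.06e-9

old = torch.log(1 + x)   # 1 + 2.06e-9 rounds to 1.0 in float32 -> 0.0
new = torch.log1p(x)     # ~2.06e-9, close to the exact result

print(old.item(), new.item())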

Signed-off-by: Sergii Dymchenko <[email protected]>
Co-authored-by: Xiaowei Ren <[email protected]>
Co-authored-by: Tim Moon <[email protected]>
3 people authored Jan 28, 2025
1 parent 2fce82b commit 199e612
Showing 1 changed file with 1 addition and 1 deletion.
transformer_engine/pytorch/attention.py: 1 addition & 1 deletion
@@ -1604,7 +1604,7 @@ def flash_attn_fwd_softmax_lse_correction(
     """Merge softmax stats of each step in Attention with context parallelism"""
     max_scale = torch.max(softmax_lse, softmax_lse_per_step)
     min_scale = torch.min(softmax_lse, softmax_lse_per_step)
-    new_scale = max_scale + torch.log(1 + torch.exp(min_scale - max_scale))
+    new_scale = max_scale + torch.log1p(torch.exp(min_scale - max_scale))
     softmax_lse.copy_(new_scale)
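
For context, a rough standalone check (not from the commit) of what the patched line computes: merging two log-sum-exp values a and b into log(exp(a) + exp(b)) via a max shift, which stays finite even when exp() of the raw values would overflow. The helper name merge_lse is invented for illustration.

import torch

# Sketch only; merge_lse is a made-up name mirroring the patched expression.
def merge_lse(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    max_scale = torch.max(a, b)
    min_scale = torch.min(a, b)
    return max_scale + torch.log1p(torch.exp(min_scale - max_scale))

a = torch.tensor(1000.0)
b = torch.tensor(995.0)
print(merge_lse(a, b))  # ~1000.0067 and finite
# A naive torch.log(torch.exp(a) + torch.exp(b)) returns inf here,
# because exp(1000.0) overflows in float32.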

