@@ -2266,7 +2266,6 @@ def jsd_forward(_input: Tensor, target: Tensor, shift_labels: Tensor | None=None
 from __future__ import annotations
 
 import torch
-import helion
 import triton
 import triton.language as tl
 from torch._inductor.runtime import triton_helpers
@@ -2275,41 +2274,44 @@ from torch._inductor.runtime.triton_compat import libdevice
 from helion.runtime import default_launcher as _default_launcher
 
 @triton.jit
-def _helion_kl_div_forward(y_pred, y_true, kl_loss, loss, kl_loss_stride_0, kl_loss_stride_1, loss_stride_0, y_pred_stride_0, y_pred_stride_1, y_true_stride_0, y_true_stride_1, BT, V, log_target, eps, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_0: tl.constexpr):
+def _helion_kl_div_forward(y_pred, y_true, loss, loss_stride_0, y_pred_stride_0, y_pred_stride_1, y_true_stride_0, y_true_stride_1, BT, V, log_target, eps, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_0: tl.constexpr):
     pid_0 = tl.program_id(0)
     offset_1 = pid_0 * _BLOCK_SIZE_1
     indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
     mask_1 = indices_1 < BT
     loss_sum = tl.full([_BLOCK_SIZE_1, _BLOCK_SIZE_0], 0.0, tl.float32)
-    for offset_0 in tl.range(0, V.to(tl.int32), _BLOCK_SIZE_0):
-        indices_0 = offset_0 + tl.arange(0, _BLOCK_SIZE_0).to(tl.int32)
-        mask_0 = indices_0 < V
+    for offset_0 in tl.range(0, V.to(tl.int32)):
+        indices_0 = offset_0 + tl.arange(0, 1).to(tl.int32)
         loss_sum_copy = loss_sum
         loss_sum_copy_0 = loss_sum_copy
-        y_pred_val = tl.load(y_pred + (indices_1[:, None] * y_pred_stride_0 + indices_0[None, :] * y_pred_stride_1), mask_1[:, None] & mask_0[None, :], other=0)
-        y_true_val = tl.load(y_true + (indices_1[:, None] * y_true_stride_0 + indices_0[None, :] * y_true_stride_1), mask_1[:, None] & mask_0[None, :], other=0)
+        kl_loss = tl.full([_BLOCK_SIZE_1, _BLOCK_SIZE_0], 0.0, tl.float32)
+        y_pred_val = tl.load(y_pred + (indices_1[:, None] * y_pred_stride_0 + indices_0[None, :] * y_pred_stride_1), mask_1[:, None], other=0)
+        y_true_val = tl.load(y_true + (indices_1[:, None] * y_true_stride_0 + indices_0[None, :] * y_true_stride_1), mask_1[:, None], other=0)
         if log_target:
             y_true_val_copy = y_true_val
             y_pred_val_copy = y_pred_val
+            kl_loss_copy = kl_loss
             y_true_val_copy_0 = y_true_val_copy
             y_pred_val_copy_0 = y_pred_val_copy
+            kl_loss_copy_0 = kl_loss_copy
             v_0 = libdevice.exp(y_true_val_copy_0)
             v_1 = y_true_val_copy_0 - y_pred_val_copy_0
             v_2 = v_0 * v_1
-            tl.store(kl_loss + (indices_1[:, None] * kl_loss_stride_0 + indices_0[None, :] * kl_loss_stride_1), v_2, mask_1[:, None] & mask_0[None, :])
+            kl_loss = kl_loss_copy_0 + v_2
         _not = not log_target
         if _not:
             y_true_val_copy_1 = y_true_val
             y_pred_val_copy_1 = y_pred_val
+            kl_loss_copy_1 = kl_loss
             y_true_val_copy_1_0 = y_true_val_copy_1
             y_pred_val_copy_1_0 = y_pred_val_copy_1
-            v_3 = triton_helpers.maximum(y_true_val_copy_1_0, eps)
-            v_4 = tl_math.log(v_3)
-            v_5 = v_4 - y_pred_val_copy_1_0
-            v_6 = y_true_val_copy_1_0 * v_5
-            tl.store(kl_loss + (indices_1[:, None] * kl_loss_stride_0 + indices_0[None, :] * kl_loss_stride_1), v_6, mask_1[:, None] & mask_0[None, :])
-        load_2 = tl.load(kl_loss + (indices_1[:, None] * kl_loss_stride_0 + indices_0[None, :] * kl_loss_stride_1), mask_1[:, None] & mask_0[None, :], other=0)
-        loss_sum = loss_sum_copy_0 + load_2
+            kl_loss_copy_1_0 = kl_loss_copy_1
+            v_4 = triton_helpers.maximum(y_true_val_copy_1_0, eps)
+            v_5 = tl_math.log(v_4)
+            v_6 = v_5 - y_pred_val_copy_1_0
+            v_7 = y_true_val_copy_1_0 * v_6
+            kl_loss = kl_loss_copy_1_0 + v_7
+        loss_sum = loss_sum_copy_0 + kl_loss
     sum_1 = tl.cast(tl.sum(loss_sum, 1), tl.float32)
     tl.store(loss + indices_1 * loss_stride_0, sum_1, mask_1)
 
@@ -2333,11 +2335,8 @@ def kl_div_forward(y_pred: Tensor, y_true: Tensor, log_target: bool=False, reduc
         loss = torch.zeros_like(y_pred)
     else:
         loss = torch.zeros((BT,), dtype=torch.float32, device=y_pred.device)
-    kl_loss = torch.zeros_like(y_pred)
-    BT_SIZE = helion.cdiv(BT, BT)
-    _BLOCK_SIZE_1 = BT_SIZE
-    _BLOCK_SIZE_0 = 4096
-    _launcher(_helion_kl_div_forward, (triton.cdiv(BT, _BLOCK_SIZE_1),), y_pred, y_true, kl_loss, loss, kl_loss.stride(0), kl_loss.stride(1), loss.stride(0), y_pred.stride(0), y_pred.stride(1), y_true.stride(0), y_true.stride(1), BT, V, log_target, eps, _BLOCK_SIZE_1, _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    _BLOCK_SIZE_1 = 4096
+    _launcher(_helion_kl_div_forward, (triton.cdiv(BT, _BLOCK_SIZE_1),), y_pred, y_true, loss, loss.stride(0), y_pred.stride(0), y_pred.stride(1), y_true.stride(0), y_true.stride(1), BT, V, log_target, eps, _BLOCK_SIZE_1, 1, num_warps=4, num_stages=3)
     if reduction == 'batchmean':
         final_loss = torch.sum(loss) / BT
     elif reduction == 'sum':
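For reference, the kernel computes a per-row KL-divergence sum over the vocabulary dimension: with `log_target=True` each term is `exp(y_true) * (y_true - y_pred)`, otherwise `y_true * (log(max(y_true, eps)) - y_pred)`, summed over `V` into `loss[row]`. Below is a minimal PyTorch sketch of that math, assuming `y_pred` holds log-probabilities of shape `(BT, V)`; the helper name `kl_div_reference` and the default `eps` value are illustrative, not part of the generated code.

```python
import torch
from torch import Tensor

def kl_div_reference(y_pred: Tensor, y_true: Tensor, log_target: bool = False,
                     reduction: str = 'batchmean', eps: float = 1e-10) -> Tensor:
    # y_pred: log-probabilities, shape (BT, V).
    # y_true: probabilities, or log-probabilities when log_target=True.
    if log_target:
        # Matches the libdevice.exp branch: exp(y_true) * (y_true - y_pred).
        kl = torch.exp(y_true) * (y_true - y_pred)
    else:
        # Matches the maximum/log branch: y_true * (log(max(y_true, eps)) - y_pred).
        kl = y_true * (torch.log(torch.clamp(y_true, min=eps)) - y_pred)
    loss = kl.sum(dim=1)  # the kernel's per-row loss_sum reduction over V
    if reduction == 'batchmean':
        return loss.sum() / y_pred.shape[0]
    if reduction == 'sum':
        return loss.sum()
    return loss  # per-row losses for the remaining reduction modes
```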