[wip] mx: expose a fast path for casting to fp4x2

vkuzo · vkuzo · commit 015443293c37 · 2025-08-21T07:52:36.000-07:00
Summary: not ready for review yet Test Plan: Reviewers: Subscribers: Tasks: Tags: ghstack-source-id: 2d88961 ghstack-comment-id: 3210931181 Pull-Request: #2832
diff --git a/test/prototype/mx_formats/test_kernels.py b/test/prototype/mx_formats/test_kernels.py
@@ -561,3 +561,45 @@ def test_cuda_mx_dim1_invalid_block_size():
             scale_dim_x=1,
             scale_dim_y=invalid_block_size,
         )
+
+
+def _fp32_to_fp4_reference(
+    data_hp: torch.Tensor,
+) -> torch.Tensor:
+    # works
+    data_hp = data_hp.float()
+    data_lp = f32_to_f4_unpacked(data_hp)
+
+    # does not work
+    # data_lp = f32_to_f4_unpacked(data_hp.float())
+
+    data_lp = pack_uint4(data_lp)
+    return data_lp
+
+
+# TODO add skips
+def test_fp32_cast_to_fp4x2():
+    from torchao.prototype.mx_formats.kernels import triton_fp32_cast_to_fp4x2
+
+    M, K = 16, 16
+    x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda")
+    # make x's range be the representable range of fp4
+    x = x * 6.0
+
+    # this leads to values in `x` being overridden inplace
+    # TODO fix it
+    print(0, x)
+    data = triton_fp32_cast_to_fp4x2(x)
+    print(1, x)
+    return
+
+    data_ref = _fp32_to_fp4_reference(x)
+    # print(2, x[0])
+    data = triton_fp32_cast_to_fp4x2(x)
+    # print(3, x[0])
+    # print(0, x)
+    # print(1, data_ref, data_ref.shape)
+    # print(2, data, data.shape)
+    torch.testing.assert_close(data_ref, data)
+    assert data.shape == (M, K // 2)
+    print("done")
diff --git a/torchao/prototype/mx_formats/kernels.py b/torchao/prototype/mx_formats/kernels.py
@@ -1454,6 +1454,49 @@ def _(scale_tensor):
         padded_cols = n_col_blocks * 4
 
         return scale_tensor.new_empty((padded_rows, padded_cols))
+
+    @triton.jit
+    def fp32_cast_to_fp4x2_triton_kernel(
+        x_ptr,
+        q_ptr,
+        stride_xm,
+        stride_xn,
+        M,
+        N,
+    ):
+        pid_m = tl.program_id(1)
+        pid_n = tl.program_id(0)
+
+        offs_m = pid_m * 128 + tl.arange(0, 128)[:, None]
+        offs_n = pid_n * 64 + tl.arange(0, 64)[None, :]
+        mask = None
+        other = None
+        x = tl.load(
+            x_ptr + offs_m * stride_xm + offs_n * stride_xn, mask=mask, other=other
+        )  # [128, 64]
+        x_blocks = x.to(tl.float32).reshape(128, 4, 16)  # [128, 4, 16]
+
+        # Convert to FP4
+        x_fp4x2 = convert_fp32_to_fp4_packed(x_blocks.reshape(128, 32, 2).split())
+        offs_m = pid_m * 128 + tl.arange(0, 128)[:, None]
+        offs_n = pid_n * 32 + tl.arange(0, 32)[None, :]
+        tl.store(q_ptr + offs_m * (N // 2) + offs_n, x_fp4x2, mask=None)
+
+    def triton_fp32_cast_to_fp4x2(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        M, N = x.shape
+        assert N % 16 == 0, "N must be divisible by 16 for NVFP4 quantization"
+        xq = x.new_empty(M, N // 2, dtype=torch.uint8)
+        grid = (triton.cdiv(N, 64), triton.cdiv(M, 128))
+        fp32_cast_to_fp4x2_triton_kernel[grid](
+            x,
+            xq,
+            x.stride(0),
+            x.stride(1),
+            M,
+            N,
+        )
+
+        return xq.view(torch.uint8)
 else:
 
     def triton_to_mxfp8_dim1(