[sharktank] restore custom matmul kernel #896

Merged · 35 commits · Feb 21, 2025
Changes from 7 commits

Commits
8300bc8
restore custom matmul kernel
dan-garvey Feb 2, 2025
a98a332
not mergeable as-is
dan-garvey Feb 2, 2025
80fee98
Make batch_matmul_transpose_b accept accumulation dtype
sogartar Feb 4, 2025
9ff020e
Merge batch_matmul_transpose_b export tests into 1
sogartar Feb 4, 2025
7b93a6a
Merge branch 'main' into users/dan-garvey/enable_custom_fp8_matmul
dan-garvey Feb 7, 2025
781c8e8
Add exception to qlinear to not use the kernel when unsigned ints
sogartar Feb 8, 2025
9f1c3d4
Small fix
sogartar Feb 8, 2025
82b032a
Add eager execution to circumvent failure to compile for llvm-cpu
sogartar Feb 10, 2025
aa5c7b0
Merge branch 'main' into users/dan-garvey/enable_custom_fp8_matmul
sogartar Feb 11, 2025
de70094
Convert dtype when writing into the cache
sogartar Feb 11, 2025
ae89b55
Fix attention_dtype flag for paged_llm_v1
aviator19941 Feb 13, 2025
53f8cd1
Merge branch 'main' into users/dan-garvey/enable_custom_fp8_matmul
sogartar Feb 13, 2025
5bf4636
KV cache workaround for Torch not supporting torch.Tensor.index_copy_…
sogartar Feb 14, 2025
fe5c881
Fix kv_cache index_put_ issue
archana-ramalingam Feb 14, 2025
b4be2a8
Revert "Fix kv_cache index_put_ issue"
archana-ramalingam Feb 14, 2025
338fe67
Merge branch 'main' into users/dan-garvey/enable_custom_fp8_matmul
archana-ramalingam Feb 14, 2025
462ddc4
Fix KV cache index_copy_ f8 workaround
sogartar Feb 14, 2025
4dc2ac2
In linear for (Tensor, QuantizedTensor) raise if accum_dtype is given
sogartar Feb 14, 2025
13bfc68
Fix KV cache f8
sogartar Feb 20, 2025
8b20445
Remove unused HF dataset
sogartar Feb 20, 2025
9f160f1
Add KV cache dtype different from attention dtype
sogartar Feb 20, 2025
77a8443
Add more KV cache tests for various dtypes
sogartar Feb 20, 2025
1ea608a
Remove some unwanted corner case handling in linear layer
sogartar Feb 20, 2025
6f0c98b
Add more linear layer tests
sogartar Feb 20, 2025
664a847
Refactor quark parity test to use tmp dir
sogartar Feb 20, 2025
9816c35
Fix KV cache dtype CLI arg parsing
sogartar Feb 20, 2025
b8ff8cc
Merge remote-tracking branch 'origin/main' into users/dan-garvey/enab…
sogartar Feb 20, 2025
c17629e
Change doc example to not use the removed Llama dataset
sogartar Feb 20, 2025
9b7dfdf
Add KV cache dtype to benchmark
sogartar Feb 20, 2025
55c8701
Change testBenchmark8B_fp8_Non_Decomposed xfail reason to compilation…
sogartar Feb 21, 2025
fea5204
Merge branch 'main' into users/dan-garvey/enable_custom_fp8_matmul
archana-ramalingam Feb 21, 2025
740bb80
Put back in the llama3_8B_fp16 HF dataset
sogartar Feb 21, 2025
02215d5
Remove left behind comment
sogartar Feb 21, 2025
40f993a
Make quark parity test use f8 KV cache
sogartar Feb 21, 2025
85053ef
Add more bf16 qlinear tests and make ref dtype be f64
sogartar Feb 21, 2025
38 changes: 32 additions & 6 deletions sharktank/sharktank/kernels/batch_matmul_transpose_b.py
@@ -7,16 +7,36 @@
from sharktank.kernels.base import *

import torch
from typing import cast, Optional

from iree.compiler.ir import IntegerType
from iree.compiler.ir import IntegerType, Type
from iree.turbine.support.conversions import (
TORCH_DTYPE_TO_IREE_TYPE_ASM,
IREE_TYPE_ASM_TO_TORCH_DTYPE,
)
from iree.turbine.runtime.op_reg import AttrArg

__all__ = [
"batch_matmul_transpose_b",
]


def batch_matmul_transpose_b(
lhs: torch.Tensor,
rhs: torch.Tensor,
/,
*,
accum_dtype: Optional[torch.dtype] = None,
) -> torch.Tensor:
if accum_dtype is None:
accum_dtype = lhs.dtype
return _batch_matmul_transpose_b(
lhs, rhs, accum_dtype=TORCH_DTYPE_TO_IREE_TYPE_ASM[accum_dtype]
)


@CustomOp.register(library=LIBRARY)
class batch_matmul_transpose_b(CustomOp):
class _batch_matmul_transpose_b(CustomOp):
"""Generic block scaled matmul with transposed RHS.

The LHS is expected to be a 3d tensor of shape [B, M, K]. RHS must be
@@ -25,11 +45,14 @@ class batch_matmul_transpose_b(CustomOp):
The kernel will be specialized for all values of N, K and LHS dtype.
"""

signature = "batch_matmul_transpose_b(Tensor lhs, Tensor rhs) -> (Tensor)"
signature = (
"batch_matmul_transpose_b(Tensor lhs, Tensor rhs, str accum_dtype) -> (Tensor)"
)

def select(self, ksel: KernelSelection):
lhs_desc = ksel.arg_tensor(0) # Shape [B, M, K]
rhs_desc = ksel.arg_tensor(1) # Shape [B, N, K]
accum_type_attr = ksel.attr_str(2)

# Rank check.
torch._check(
@@ -60,7 +83,8 @@ def select(self, ksel: KernelSelection):
)
# Shape batch, m, n
c_desc = ksel.return_new_tensor(
[lhs_batch, lhs_m, rhs_n], dtype=lhs_desc.t.dtype
[lhs_batch, lhs_m, rhs_n],
dtype=IREE_TYPE_ASM_TO_TORCH_DTYPE[accum_type_attr.v],
)
specialize_all_known_dims(lhs_desc)
specialize_all_known_dims(rhs_desc)
@@ -74,12 +98,14 @@ def generate(self, ksel: KernelSelection, kb: KernelBuilder):
def generate(self, ksel: KernelSelection, kb: KernelBuilder):
lhs = kb.arg_value(0)
rhs = kb.arg_value(1)
accum_type_str = cast(AttrArg, ksel.arg_descs[2]).v
result_desc = ksel.result_descs[0]

# Generate specialization signature and types.
a_asm_type, a_ident, accum_type = unpack_tensor_type(lhs.type)
a_asm_type, a_ident, _ = unpack_tensor_type(lhs.type)
b_asm_type, b_ident, _ = unpack_tensor_type(rhs.type)
spec_sig = f"L{a_ident}_R{b_ident}"
accum_type = Type.parse(accum_type_str)
spec_sig = f"L{a_ident}_R{b_ident}_{accum_type_str}"
template_file = "batch_matmul_transpose_b.mlir"
target_function_name = f"sharktank_batch_matmul_transpose_b_{spec_sig}"
cst_zero = "0" if IntegerType.isinstance(accum_type) else "0."
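For reference, a minimal eager usage sketch of the new accum_dtype keyword, mirroring the int8/int32 test added below. Variable names are illustrative, and it assumes a sharktank checkout with iree-turbine and the IREE compiler/runtime installed, as in the test suite. Shapes follow the docstring: LHS [B, M, K], RHS [B, N, K], result [B, M, N].

import torch
from sharktank import kernels

# LHS is [B, M, K]; RHS is [B, N, K] and is transposed inside the kernel.
a = ((torch.rand([2, 3, 5]) - 0.5) * 255).to(dtype=torch.int8)
b = ((torch.rand([2, 4, 5]) - 0.5) * 255).to(dtype=torch.int8)
# accum_dtype selects the accumulation/result dtype; when omitted it defaults to lhs.dtype.
y = kernels.batch_matmul_transpose_b(a, b, accum_dtype=torch.int32)  # -> [2, 3, 4], int32
ref = torch.matmul(a.to(torch.int32), b.transpose(1, 2).to(torch.int32))
torch.testing.assert_close(y, ref, atol=0, rtol=0)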
4 changes: 2 additions & 2 deletions sharktank/sharktank/layers/linear.py
@@ -85,8 +85,8 @@ def forward(self, x):
# We can truncate to fp16 in iree, so we do a cast here
# to account for this in the IR. This may not be the right
# level to do this, but for now it's here.
if not isinstance(y, QuantizedTensor):
if y.dtype == torch.float8_e4m3fnuz:
if not isinstance(y, QuantizedTensor) and isinstance(x, QuantizedTensor):
if x.unpack().qs.dtype == torch.float8_e4m3fnuz:
y = ops.to(y, torch.bfloat16)
return y
if qdq_output is not None:
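A standalone sketch of the updated guard, for readers skimming the hunk: the bf16 cast is now keyed off the quantized input's storage dtype rather than the output dtype (presumably because the output can now carry a wider accumulation dtype). The helper name is hypothetical, and the import paths are assumptions based on how linear.py references these symbols.

import torch
from sharktank import ops
from sharktank.types import QuantizedTensor

def _cast_f8_linear_output(x, y):
    # Hypothetical standalone form of the branch above, not part of the PR.
    # Cast the matmul output to bf16 only when the input was f8-quantized.
    if not isinstance(y, QuantizedTensor) and isinstance(x, QuantizedTensor):
        if x.unpack().qs.dtype == torch.float8_e4m3fnuz:
            return ops.to(y, torch.bfloat16)
    return y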
30 changes: 22 additions & 8 deletions sharktank/sharktank/ops/qlinear_impls.py
@@ -50,10 +50,10 @@ def qlinear_tensor_scaled(

# Handle only integer and fp8 quantizations.
if x_layout.qs.dtype.is_floating_point or weight_layout.qs.dtype.is_floating_point:
if x_layout.qs.dtype == torch.float8_e4m3fnuz:
# assume quark
return matmul(x_layout.qs, weight_layout.qs, transpose_rhs=True)
else:
if (
x_layout.qs.dtype != torch.float8_e4m3fnuz
or weight_layout.qs.dtype != torch.float8_e4m3fnuz
):
return NotImplemented

# Bias.
@@ -170,7 +170,13 @@ def linear_quantized_weight(
linear.override(Tensor, QuantizedTensor, AnyTensor)(linear_quantized_weight)


def _invoke_mmt_kernel(lhs, rhs, *, accum_dtype):
def _is_dtype_unsigned_integer(dtype: torch.dtype):
return not dtype.is_complex and not dtype.is_floating_point and not dtype.is_signed


def _invoke_mmt_kernel(
lhs: torch.Tensor, rhs: torch.Tensor, *, accum_dtype: torch.dtype
):
if debugging.flags.use_custom_iree_kernels:
# The custom kernel requires that the lhs and rhs be the same
# rank. Broadcast the rhs to match.
@@ -187,9 +193,17 @@ def _invoke_mmt_kernel(lhs, rhs, *, accum_dtype):
rhs_size = [lhs.shape[0]] + list(rhs.shape)
rhs = rhs.unsqueeze(0).expand(rhs_size)
rhs_rank = len(rhs.shape)
y_qs = kernels.batch_matmul_transpose_b(
lhs.to(accum_dtype), rhs.to(accum_dtype)
)
if (
_is_dtype_unsigned_integer(lhs.dtype)
or _is_dtype_unsigned_integer(rhs.dtype)
or _is_dtype_unsigned_integer(accum_dtype)
):
# TODO: make the kernel work with unsigned types.
y_qs = kernels.batch_matmul_transpose_b(
lhs.to(dtype=accum_dtype), rhs.to(dtype=accum_dtype)
)
else:
y_qs = kernels.batch_matmul_transpose_b(lhs, rhs, accum_dtype=accum_dtype)
# Squeeze the batch dimension to maintain shape parity with other
# layers.
if len(y_qs.shape) > 2:
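As a quick illustration of the new dispatch in _invoke_mmt_kernel: unsigned-integer arguments or accumulators take the pre-cast fallback (both operands converted to accum_dtype before the kernel call), while signed-integer and f8 arguments pass through unchanged with the accum_dtype attribute. A short sketch, relying only on standard torch.dtype attributes; the assertions are illustrative, not part of the PR.

import torch

def _is_dtype_unsigned_integer(dtype: torch.dtype) -> bool:
    # Unsigned integer means: not complex, not floating point, and not signed.
    return not dtype.is_complex and not dtype.is_floating_point and not dtype.is_signed

assert _is_dtype_unsigned_integer(torch.uint8)                # -> pre-cast fallback path
assert not _is_dtype_unsigned_integer(torch.int8)             # -> custom kernel with accum_dtype
assert not _is_dtype_unsigned_integer(torch.float8_e4m3fnuz)  # -> custom kernel with accum_dtype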
109 changes: 102 additions & 7 deletions sharktank/tests/kernels/batch_matmul_transpose_b_test.py
@@ -10,11 +10,13 @@

import unittest
from parameterized import parameterized

import pytest
import torch

from iree.turbine import aot
from iree.turbine.support.conversions import TORCH_DTYPE_TO_IREE_TYPE_ASM
from sharktank import kernels
from sharktank.utils.testing import skip


class batch_matmul_transpose_b_test(unittest.TestCase):
@@ -40,24 +42,117 @@ def testBS32(self, atol, rtol):
ref = torch.matmul(a, bT)
torch.testing.assert_close(result, ref, atol=atol, rtol=rtol)

def testExportStaticDims(self):
@pytest.mark.xfail(
reason="""Does not compile for llvm-cpu with
<unknown>:0: error: 'llvm.fpext' op operand #0 must be floating point LLVM type or LLVM dialect-compatible vector of floating point LLVM type, but got 'vector<4xi8>'
<unknown>:0: note: see current operation: %120 = "llvm.fpext"(%109) : (vector<4xi8>) -> vector<4xf32>
"""
)
def testArgF8AccumF32(self):
arg_dtype = torch.float8_e4m3fnuz
a = torch.rand([3, 4, 6]).to(arg_dtype)
b = torch.rand([3, 5, 6]).to(arg_dtype)
accum_dtype = torch.float32
result = kernels.batch_matmul_transpose_b(a, b, accum_dtype=accum_dtype)

# Dequantize and test with normal matmul.
# Tolerances are empirical and results are not expected to match exactly.
bT = torch.transpose(b, 1, 2)
ref = torch.matmul(a.to(dtype=accum_dtype), bT.to(dtype=accum_dtype))
torch.testing.assert_close(result, ref, atol=1e-3, rtol=0)

@pytest.mark.xfail(
reason="Does not work with unsigned types. The kernel needs to be adapted."
)
def testArgUi8AccumI32(self):
arg_dtype = torch.uint8
a = ((torch.rand([2, 3, 5]) * 255) + 0.5).to(dtype=arg_dtype)
b = ((torch.rand([2, 4, 5]) * 255) + 0.5).to(dtype=arg_dtype)
accum_dtype = torch.int32
result = kernels.batch_matmul_transpose_b(a, b, accum_dtype=accum_dtype)

bT = torch.transpose(b, 1, 2)
ref = torch.matmul(a.to(dtype=accum_dtype), bT.to(dtype=accum_dtype))
torch.testing.assert_close(result, ref, atol=0, rtol=0)

@pytest.mark.xfail(
reason="Does not work with unsigned types. The kernel needs to be adapted."
)
def testArgLhsI8RhsUi8AccumI32(self):
a = ((torch.rand([2, 3, 5]) - 0.5) * 255).to(dtype=torch.int8)
b = ((torch.rand([2, 4, 5]) * 255) + 0.5).to(dtype=torch.uint8)
accum_dtype = torch.int32
result = kernels.batch_matmul_transpose_b(a, b, accum_dtype=accum_dtype)

bT = torch.transpose(b, 1, 2)
ref = torch.matmul(a.to(dtype=accum_dtype), bT.to(dtype=accum_dtype))
torch.testing.assert_close(result, ref, atol=0, rtol=0)

def testArgI8AccumI32(self):
arg_dtype = torch.int8
a = ((torch.rand([2, 3, 5]) - 0.5) * 255).to(dtype=arg_dtype)
b = ((torch.rand([2, 3, 5]) - 0.5) * 255).to(dtype=arg_dtype)
accum_dtype = torch.int32
result = kernels.batch_matmul_transpose_b(a, b, accum_dtype=accum_dtype)

bT = torch.transpose(b, 1, 2)
ref = torch.matmul(a.to(dtype=accum_dtype), bT.to(dtype=accum_dtype))
torch.testing.assert_close(result, ref, atol=0, rtol=0)

@pytest.mark.xfail(
reason="""No uint32 dtype conversions in IREE Turbine.
Does not work with unsigned types. The kernel needs to be adapted.
The problem is that we reinterpret cast to signless integer types.
Maybe linalg.batch_matmul_transpose_b when promoting from i8 to i32 assumes a
signed type even though i8 is signless."""
)
def testArgUi8AccumUi32(self):
arg_dtype = torch.uint8
a = ((torch.rand([2, 3, 5]) * 255) + 0.5).to(dtype=arg_dtype)
b = ((torch.rand([2, 4, 5]) * 255) + 0.5).to(dtype=arg_dtype)
accum_dtype = torch.uint32
result = kernels.batch_matmul_transpose_b(a, b, accum_dtype=accum_dtype)

bT = torch.transpose(b, 1, 2)
ref = torch.matmul(a.to(dtype=torch.int32), bT.to(dtype=torch.int32))
ref = ref.to(dtype=accum_dtype)
torch.testing.assert_close(result, ref, atol=0, rtol=0)

@parameterized.expand(
[
(torch.int32, None),
(torch.float8_e4m3fnuz, torch.float32),
]
)
def testExportStaticDims(
self, arg_dtype: torch.dtype, accum_dtype: torch.dtype | None
):
class MyModule(torch.nn.Module):
def forward(self, a, b):
return kernels.batch_matmul_transpose_b(a, b)
return kernels.batch_matmul_transpose_b(a, b, accum_dtype=accum_dtype)

mod = MyModule()
dtype = torch.int32
ep = torch.export.export(
mod,
args=(
(torch.rand([4, 16, 2]) * 64).to(dtype),
(torch.rand([4, 8, 2]) * 64).to(dtype),
(torch.rand([4, 16, 2]) * 64).to(arg_dtype),
(torch.rand([4, 8, 2]) * 64).to(arg_dtype),
),
)
output = aot.export(ep)
output.verify()
asm = str(output.mlir_module)
self.assertIn("@sharktank_batch_matmul_transpose_b_L4x16x2xi32_R4x8x2xi32", asm)
arg_dtype_asm = TORCH_DTYPE_TO_IREE_TYPE_ASM[arg_dtype]
accum_dtype_asm = arg_dtype_asm
if accum_dtype is not None:
accum_dtype_asm = TORCH_DTYPE_TO_IREE_TYPE_ASM[accum_dtype]
self.assertIn(
(
"@sharktank_batch_matmul_transpose_b_"
f"L4x16x2x{arg_dtype_asm}_R4x8x2x{arg_dtype_asm}_{accum_dtype_asm}"
),
asm,
)


if __name__ == "__main__":