
Commit 93067f0

add subclass based method for inference
1 parent 2293732 commit 93067f0

4 files changed: +172 -6 lines changed
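
In short: the commit adds MXFPConfig, a quantize_ config that converts a bf16 nn.Linear to MX FP8 inference by wrapping its weight in an MXTensor subclass and quantizing activations at call time. A minimal usage sketch, mirroring the new test_inference_subclass added below (assumes CUDA, an sm100 GPU, and PyTorch 2.8+):

    import torch
    import torch.nn as nn
    from torchao.prototype.mx_formats.mx_subclass import MXFPConfig
    from torchao.quantization import quantize_

    # bf16 linear on CUDA, as required by the new transform
    m = nn.Sequential(nn.Linear(32, 128, bias=False, dtype=torch.bfloat16)).cuda()

    # swap the weight for an MX-quantized subclass; defaults are fp8 e4m3 + CUBLAS
    quantize_(m, config=MXFPConfig())

    # the subclass dispatches the matmul to the MX kernels under torch.compile
    m_mx = torch.compile(m, fullgraph=True)
    y = m_mx(torch.randn(128, 32, device="cuda", dtype=torch.bfloat16))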

test/prototype/mx_formats/test_mx_linear.py (+32)

@@ -25,6 +25,7 @@
     MXInferenceLinear,
     MXLinear,
 )
+from torchao.prototype.mx_formats.mx_subclass import MXFPConfig
 from torchao.quantization import quantize_
 from torchao.quantization.utils import compute_error
 from torchao.utils import (
@@ -372,3 +373,34 @@ def test_inference_print_str():
     s = str(m)
     assert "bl_sz=32" in s
     assert "kernel=emulated" in s
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(
+    not TORCH_VERSION_AT_LEAST_2_8, reason="torch.compile requires PyTorch 2.8+"
+)
+@pytest.mark.skipif(not is_sm_at_least_100(), reason="Reqs sm100")
+@pytest.mark.parametrize("elem_dtype", [torch.float8_e4m3fn])
+def test_inference_subclass(elem_dtype):
+    """
+    Smoke test for inference compile
+    """
+    if elem_dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
+        if not is_sm_at_least_89():
+            pytest.skip("CUDA capability >= 8.9 required for float8 in triton")
+
+    m = nn.Sequential(nn.Linear(32, 128, bias=False, dtype=torch.bfloat16))
+    m = m.cuda()
+    m_mx = copy.deepcopy(m)
+    config = MXFPConfig()
+    quantize_(m_mx, config=config)
+    m_mx = torch.compile(m_mx, fullgraph=True)
+
+    x = torch.randn(128, 32, device="cuda", dtype=torch.bfloat16)
+    y_ref = m(x)
+    y_mx = m_mx(x)
+    sqnr = compute_error(y_ref, y_mx)
+    if elem_dtype is torch.float8_e4m3fn:
+        assert sqnr >= 20.0
+    else:
+        assert sqnr >= 11.5

torchao/__init__.py (+2 -1)

@@ -43,7 +43,7 @@
     quantize_,
 )
 
-from . import dtypes, optim, swizzle, testing
+from . import dtypes, optim, quantization, swizzle, testing
 
 __all__ = [
     "dtypes",
@@ -53,4 +53,5 @@
     "swizzle",
     "testing",
     "ops",
+    "quantization",
 ]

torchao/prototype/mx_formats/mx_ops.py (+21 -5)

@@ -17,7 +17,7 @@
 the underlying data fields to the MX matmul.
 """
 
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 
 import torch
 from torch.utils._pytree import tree_map
@@ -69,13 +69,29 @@ def mx_desugar_op(aten_op, args, kwargs=None):
     return new
 
 
+def _get_gemm_choice(
+    choice_a: Optional[MXGemmKernelChoice], choice_b: Optional[MXGemmKernelChoice]
+) -> MXGemmKernelChoice:
+    if choice_a is not None and choice_b is not None:
+        assert choice_a == choice_b, (
+            "Both MXTensor inputs must have the same gemm config if specified"
+        )
+        return choice_a
+
+    # Assert that at least one is set and return that one
+    assert choice_a is not None or choice_b is not None, (
+        "At least one gemm choice must be specified"
+    )
+    return choice_a if choice_a is not None else choice_b
+
+
 @implements([aten.mm.default, aten.matmul.default])
 def mx_mm(aten_op, args, kwargs=None):
     a = args[0]
     b = args[1]
     assert isinstance(a, MXTensor) and isinstance(b, MXTensor)
-    assert a._gemm_kernel_choice == b._gemm_kernel_choice, "unsupported"
-    if a._gemm_kernel_choice in (MXGemmKernelChoice.CUBLAS, MXGemmKernelChoice.CUTLASS):
+    gemm_choice = _get_gemm_choice(a._gemm_kernel_choice, b._gemm_kernel_choice)
+    if gemm_choice in (MXGemmKernelChoice.CUBLAS, MXGemmKernelChoice.CUTLASS):
         # real MX gemm backed by torchao's CUTLASS kernels
         M, K, N = a.shape[0], a.shape[1], b.shape[1]
         assert a._data.is_contiguous()
@@ -88,7 +104,7 @@ def mx_mm(aten_op, args, kwargs=None):
         b_scale_block = to_blocked(b_scale)
         if a._elem_dtype == torch.float8_e4m3fn:
             assert b._elem_dtype == torch.float8_e4m3fn
-            assert a._gemm_kernel_choice is MXGemmKernelChoice.CUBLAS, (
+            assert gemm_choice is MXGemmKernelChoice.CUBLAS, (
                 "CUBLAS is the only supported kernel choice for MX FP8 operations"
             )
             res = torch._scaled_mm(
@@ -101,7 +117,7 @@ def mx_mm(aten_op, args, kwargs=None):
         else:
             assert a._elem_dtype == DTYPE_FP4
             assert b._elem_dtype == DTYPE_FP4
-            assert a._gemm_kernel_choice is MXGemmKernelChoice.CUTLASS, "unsupported"
+            assert gemm_choice is MXGemmKernelChoice.CUTLASS, "unsupported"
             res = torchao.ops.mx_fp4_bf16(
                 a._data, b._data, a_scale_block, b_scale_block
             )
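
Why the Optional handling: with the new subclass-based inference path (mx_subclass.py below), the activation MXTensor is created with gemm_kernel_choice=None and inherits the kernel choice from the weight, so mx_mm can no longer require both inputs to carry the same non-None value. A small illustration of the resolution rule, using only names from this diff:

    from torchao.prototype.mx_formats import MXGemmKernelChoice
    from torchao.prototype.mx_formats.mx_ops import _get_gemm_choice

    # one side None (the activation), the other set (the weight): the set one wins
    assert _get_gemm_choice(None, MXGemmKernelChoice.CUBLAS) is MXGemmKernelChoice.CUBLAS

    # both set: they must match, and that common choice is returned
    assert _get_gemm_choice(MXGemmKernelChoice.CUTLASS, MXGemmKernelChoice.CUTLASS) is MXGemmKernelChoice.CUTLASS

    # both None, or mismatched values, trip the asserts inside the helper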
torchao/prototype/mx_formats/mx_subclass.py (new file, +117)

@@ -0,0 +1,117 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+import types
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+
+import torchao
+from torchao.core.config import AOBaseConfig
+from torchao.prototype.mx_formats import (
+    MXGemmKernelChoice,
+)
+from torchao.prototype.mx_formats.config import (
+    _validate_elem_dtype,
+    _validate_gemm_kernel_choice,
+)
+from torchao.prototype.mx_formats.mx_tensor import MXTensor
+from torchao.quantization.quant_api import to_linear_activation_quantized
+from torchao.quantization.transform_module import (
+    register_quantize_module_handler,
+)
+from torchao.utils import is_sm_at_least_100
+
+
+@dataclass
+class MXFPConfig(AOBaseConfig):
+    block_size: int = 32
+
+    # Dtypes for Input and Weights
+    activation_dtype: torch.dtype = torch.float8_e4m3fn
+    weight_dtype: torch.dtype = torch.float8_e4m3fn
+
+    # Which kernel to run for mm
+    gemm_kernel_choice: MXGemmKernelChoice = MXGemmKernelChoice.CUBLAS
+
+    # Set recommended inductor perf settings
+    set_inductor_config: bool = True
+
+    def __post_init__(self):
+        assert self.activation_dtype == self.weight_dtype, (
+            "For now - we only support matching input/weight dtypes."
+        )
+        _validate_elem_dtype(self.activation_dtype)
+        _validate_elem_dtype(self.weight_dtype)
+        _validate_gemm_kernel_choice(
+            self.gemm_kernel_choice, self.block_size, self.weight_dtype
+        )
+
+
+def _linear_extra_repr(self):
+    return f"in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]}, weight={repr(self.weight)}"
+
+
+def _input_activation_quant_func_mxfp(
+    x: torch.Tensor,
+    activation_dtype: torch.dtype,
+    block_size: int,
+    scale: Optional[torch.Tensor] = None,
+):
+    """Quantize the input activation to an MXTensor."""
+
+    # TODO scale for static quant
+
+    activation = MXTensor.to_mx(
+        x,
+        activation_dtype,
+        block_size=block_size,
+        gemm_kernel_choice=None,  # Get from weight
+        pack_fp6=False,  # TODO
+    )
+    return activation
+
+
+@register_quantize_module_handler(MXFPConfig)
+def _mx_inference_linear_transform(module: torch.nn.Module, config: MXFPConfig):
+    # TODO Sm120 has slightly more restrictive reqs
+    # TODO handle AMD
+    assert is_sm_at_least_100(), "MXFP is only supported on sm100 machines for now"
+    if config.set_inductor_config:
+        torchao.quantization.utils.recommended_inductor_config_setter()
+
+    activation_dtype = config.activation_dtype
+    weight_dtype = config.weight_dtype
+    weight = module.weight
+
+    assert weight.dtype == torch.bfloat16, (
+        f"Only supporting bf16 out dtype for now, got {weight.dtype}"
+    )
+
+    # Convert weight to MX Tensor
+    quantized_weight = MXTensor.to_mx(
+        weight,
+        weight_dtype,
+        block_size=config.block_size,
+        gemm_kernel_choice=config.gemm_kernel_choice,
+        pack_fp6=False,  # TODO
+    )
+
+    input_quant_func = _input_activation_quant_func_mxfp
+    input_quant_kwargs = {
+        "block_size": config.block_size,
+        "activation_dtype": activation_dtype,
+        "scale": None,
+    }
+
+    quantized_weight = to_linear_activation_quantized(
+        quantized_weight, input_quant_func, quant_kwargs=input_quant_kwargs
+    )
+
+    module.weight = torch.nn.Parameter(quantized_weight, requires_grad=False)
+    module.extra_repr = types.MethodType(_linear_extra_repr, module)
+    return module
