
Commit e506cf3
Add all fbgemm kernel Tensors into Int4WeightOnlyConfig and Float8DynamicActivationInt4WeightConfig
Summary: att, we will deprecate FbgemmConfig since it targets a single kernel; we'd like to categorize configs by derived dtype + packed format.
Test Plan: python test/quantization/quantize_/test_int4_groupwise_preshuffle.py
Reviewers:
Subscribers:
Tasks:
Tags:
stack-info: PR: #2474, branch: jerryzh168/stack/10
1 parent d87d7db commit e506cf3
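Based on the updated test configs in the diffs below, the intended replacement for the removed FbgemmConfig usage looks roughly like this sketch. It mirrors BF16_ACT_CONFIG and FP8_ACT_CONFIG from the updated preshuffle test; the toy model, device, and availability of the fbgemm-gpu genai kernels on sm90+ are assumptions, not part of the commit.

import torch
from torchao.quantization import (
    Float8ActivationInt4WeightConfig,
    Int4WeightOnlyConfig,
    quantize_,
)

# Hypothetical toy model; any module with a bf16 Linear whose in_features is a
# multiple of group_size would do (CUDA + fbgemm-gpu genai kernels assumed).
model = torch.nn.Sequential(
    torch.nn.Linear(256, 512, dtype=torch.bfloat16, device="cuda")
)

# bf16 activation + int4 weight, preshuffled packing (was the bf16 FbgemmConfig)
quantize_(model, Int4WeightOnlyConfig(group_size=128, use_preshuffle=True))

# fp8 activation + int4 weight, preshuffled packing (was the e4m3 FbgemmConfig)
# quantize_(model, Float8ActivationInt4WeightConfig(group_size=128, use_preshuffle=True))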

File tree: 8 files changed, +97 -62 lines changed

test/integration/test_serialization_bc.py

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@

 _MODEL_NAMES = [
     "torchao-testing/opt-125m-float8dq-row-fbgemm",
+    "torchao-testing/opt-125m-int4wo-preshuffle",
 ]

test/quantization/quantize_/int4/test_int4_groupwise_preshuffle.py

Lines changed: 9 additions & 30 deletions
@@ -17,7 +17,8 @@

 from torchao.float8.config import e4m3_dtype
 from torchao.quantization import (
-    FbgemmConfig,
+    Float8ActivationInt4WeightConfig,
+    Int4WeightOnlyConfig,
     quantize_,
 )
 from torchao.quantization.utils import compute_error
@@ -27,36 +28,14 @@
     is_sm_at_least_90,
 )

-BF16_ACT_CONFIG = FbgemmConfig(
-    input_dtype=torch.bfloat16,
-    weight_dtype=torch.int4,
-    output_dtype=torch.bfloat16,
-    block_size=[1, 128],
-    preshuffle=True,
+BF16_ACT_CONFIG = Int4WeightOnlyConfig(
+    group_size=128,
+    use_preshuffle=True,
 )

-BF16_ACT_BMM_CONFIG = FbgemmConfig(
-    input_dtype=torch.bfloat16,
-    weight_dtype=torch.int4,
-    output_dtype=torch.bfloat16,
-    block_size=[1, 1, 128],
-    preshuffle=True,
-)
-
-FP8_ACT_CONFIG = FbgemmConfig(
-    input_dtype=e4m3_dtype,
-    weight_dtype=torch.int4,
-    output_dtype=torch.bfloat16,
-    block_size=[1, 128],
-    preshuffle=True,
-)
-
-FP8_ACT_BMM_CONFIG = FbgemmConfig(
-    input_dtype=e4m3_dtype,
-    weight_dtype=torch.int4,
-    output_dtype=torch.bfloat16,
-    block_size=[1, 1, 128],
-    preshuffle=True,
+FP8_ACT_CONFIG = Float8ActivationInt4WeightConfig(
+    group_size=128,
+    use_preshuffle=True,
 )

@@ -83,7 +62,7 @@ def test_linear(self, config):

     # Note: this order will error out: `Got bad cuda status: an illegal memory access was encountered at line: 449`
     # @parametrize("bmm_config", [BF16_ACT_BMM_CONFIG, FP8_ACT_BMM_CONFIG])
-    @parametrize("bmm_config", [FP8_ACT_BMM_CONFIG, BF16_ACT_BMM_CONFIG])
+    @parametrize("bmm_config", [FP8_ACT_CONFIG, BF16_ACT_CONFIG])
     def test_bmm(self, bmm_config):
         class M(torch.nn.Module):
             def __init__(self, weight):

test/dtypes/test_fbgemm_int4.py renamed to test/quantization/quantize_/int4/test_int4_groupwise_tensor.py

Lines changed: 7 additions & 14 deletions
@@ -13,7 +13,7 @@
 )

 from torchao.quantization import (
-    FbgemmConfig,
+    Int4WeightOnlyConfig,
     quantize_,
 )
 from torchao.quantization.utils import compute_error
@@ -26,19 +26,12 @@
 @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_8, "Need pytorch 2.8+")
 @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
 @unittest.skipIf(not is_sm_at_least_90(), "Nedd sm90+")
-class TestFbgemmInt4Tensor(TestCase):
+class TestInt4GroupwiseTensor(TestCase):
     def setUp(self):
-        self.config = FbgemmConfig(
-            input_dtype=torch.bfloat16,
-            weight_dtype=torch.int4,
-            output_dtype=torch.bfloat16,
-            block_size=[1, 128],
-        )
-        self.bmm_config = FbgemmConfig(
-            input_dtype=torch.bfloat16,
-            weight_dtype=torch.int4,
-            output_dtype=torch.bfloat16,
-            block_size=[1, 1, 128],
+        self.config = Int4WeightOnlyConfig(
+            group_size=128,
+            use_preshuffle=False,
+            gemm_kernel_choice="fbgemm",
         )
         self.GPU_DEVICES = ["cuda"] if torch.cuda.is_available() else []

@@ -135,7 +128,7 @@ def forward(self, x):
         original = m(input)
         # we need to transpose the weight first for bmm
         m.weight = torch.nn.Parameter(m.weight.transpose(1, 2).contiguous())
-        quantize_(m, self.bmm_config, filter_fn=lambda x, fqn: True)
+        quantize_(m, self.config, filter_fn=lambda x, fqn: True)
         quantized = m(input)
         self.assertTrue(compute_error(original, quantized) > 18)


torchao/quantization/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -44,6 +44,7 @@
 from .quant_api import (
     CutlassInt4PackedLayout,
     FbgemmConfig,
+    Float8ActivationInt4WeightConfig,
     Float8DynamicActivationFloat8SemiSparseWeightConfig,
     Float8DynamicActivationFloat8WeightConfig,
     Float8MMConfig,
@@ -141,6 +142,7 @@
     "Int8DynamicActivationInt8WeightConfig",
     "Int8DynamicActivationIntxWeightConfig",
     "Int4WeightOnlyConfig",
+    "Float8ActivationInt4WeightConfig",
     "Int8WeightOnlyConfig",
     "Float8WeightOnlyConfig",
     "Float8DynamicActivationFloat8WeightConfig",

torchao/quantization/quant_api.py

Lines changed: 61 additions & 3 deletions
@@ -50,7 +50,6 @@
     to_affine_quantized_floatx_static,
     to_affine_quantized_intx,
     to_fbgemm_fp8,
-    to_fbgemm_int4,
     to_marlinqqq_quantized_intx,
 )
 from torchao.dtypes.uintx.packed_linear_int8_dynamic_activation_intx_weight_layout import (
@@ -73,6 +72,7 @@
 from torchao.quantization.quantize_ import (
     Float8Tensor,
     Int4GroupwisePreshuffleTensor,
+    Int4GroupwiseTensor,
 )
 from torchao.quantization.transform_module import (
     _QUANTIZE_CONFIG_HANDLER,
@@ -1117,6 +1117,8 @@ class Int4WeightOnlyConfig(AOBaseConfig):
     zero_point_domain: Optional[ZeroPointDomain] = ZeroPointDomain.NONE
     set_inductor_config: bool = True
     preserve_zero: Optional[bool] = None
+    use_preshuffle: bool = False
+    gemm_kernel_choice: GemmKernelChoice = GemmKernelChoice.ATEN


 # for BC
@@ -1134,15 +1136,38 @@ def _int4_weight_only_quantize_tensor(weight, config):
     layout = config.layout
     use_hqq = config.use_hqq
     zero_point_domain = config.zero_point_domain
+    use_preshuffle = config.use_preshuffle
+    gemm_kernel_choice = config.gemm_kernel_choice

     if weight.shape[-1] % group_size != 0:
         logger.info(
             f"Skipping quantizing weight with int4 weight only quantization because the shape of weight {weight.shape} is not compatible with group_size {group_size}"
         )
         return weight

+    if use_preshuffle and gemm_kernel_choice != GemmKernelChoice.FBGEMM:
+        raise NotImplementedError(
+            f"use_preshuffle is only supported for fbgemm kernel, got: {gemm_kernel_choice}"
+        )
+
+    block_size = tuple([1 for _ in range(weight.ndim - 1)] + [group_size])
+
+    if gemm_kernel_choice == GemmKernelChoice.FBGEMM:
+        if use_preshuffle:
+            new_weight = Int4GroupwisePreshuffleTensor.from_float(
+                weight,
+                block_size,
+                activation_dtype="bf16",
+            )
+            return new_weight
+        else:
+            new_weight = Int4GroupwiseTensor.from_float(
+                weight,
+                block_size,
+            )
+            return new_weight
+
     mapping_type = MappingType.ASYMMETRIC
-    block_size = tuple([1 for _ in range(weight.dim() - 1)] + [group_size])
     target_dtype = torch.int32
     quant_min = 0
     quant_max = 15
@@ -1214,6 +1239,39 @@ def _int4_weight_only_transform(
     return module


+@dataclass
+class Float8ActivationInt4WeightConfig(AOBaseConfig):
+    group_size: int = 128
+    use_preshuffle: bool = False
+    kernel: str = "fbgemm"
+
+
+@register_quantize_module_handler(Float8ActivationInt4WeightConfig)
+def _(module: torch.nn.Module, config: Int4WeightOnlyConfig) -> torch.nn.Module:
+    assert hasattr(module, "weight"), (
+        "applying int8 weight only quant requires module to have weight attribute"
+        + " but {module} does not have one"
+    )
+    group_size = config.group_size
+    use_preshuffle = config.use_preshuffle
+    kernel = config.kernel
+
+    assert use_preshuffle, (
+        f"only use_preshuffle == True is supported right now, got: {use_preshuffle}"
+    )
+    assert kernel == "fbgemm", f"only fbgemm kernel is supported, got: {kernel}"
+    weight = module.weight
+    block_size = tuple([1 for _ in range(weight.ndim - 1)] + [group_size])
+    new_weight = Int4GroupwisePreshuffleTensor.from_float(
+        module.weight,
+        block_size,
+        activation_dtype="fp8",
+    )
+    module.weight = torch.nn.Parameter(new_weight, requires_grad=False)
+    module.extra_repr = types.MethodType(_linear_extra_repr, module)
+    return module
+
+
 @dataclass
 class Int8WeightOnlyConfig(AOBaseConfig):
     """
@@ -2078,7 +2136,7 @@ def _(module: torch.nn.Module, config: FbgemmConfig) -> torch.nn.Module:
             activation_dtype="bf16",
         )
     else:
-        weight = to_fbgemm_int4(
+        weight = Int4GroupwiseTensor.from_float(
             module.weight,
             config.block_size,
         )
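Because block_size is now derived inside the transform as tuple([1] * (weight.ndim - 1) + [group_size]), a single group_size covers both 2D linear weights and 3D bmm weights, which is why the separate *_BMM_CONFIG entries could be dropped from the test above. A minimal sketch of that derivation (the helper name is illustrative, not from the PR):

import torch

def infer_block_size(weight: torch.Tensor, group_size: int = 128) -> tuple:
    # Every leading dimension gets block size 1; only the last (reduction)
    # dimension is grouped, matching the logic added in quant_api.py above.
    return tuple([1 for _ in range(weight.ndim - 1)] + [group_size])

linear_weight = torch.empty(512, 256)   # [out_features, in_features]
bmm_weight = torch.empty(8, 512, 256)   # [batch, out_features, in_features]

print(infer_block_size(linear_weight))  # (1, 128)
print(infer_block_size(bmm_weight))     # (1, 1, 128)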

torchao/quantization/quantize_/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -3,9 +3,11 @@
 )
 from .int4 import (
     Int4GroupwisePreshuffleTensor,
+    Int4GroupwiseTensor,
 )

 __all__ = [
     "Int4GroupwisePreshuffleTensor",
+    "Int4GroupwiseTensor",
     "Float8Tensor",
 ]
torchao/quantization/quantize_/int4/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -1,7 +1,11 @@
 from .int4_groupwise_preshuffle_tensor import (
     Int4GroupwisePreshuffleTensor,
 )
+from .int4_groupwise_tensor import (
+    Int4GroupwiseTensor,
+)

 __all__ = [
     "Int4GroupwisePreshuffleTensor",
+    "Int4GroupwiseTensor",
 ]

torchao/dtypes/fbgemm_int4_tensor.py renamed to torchao/quantization/quantize_/int4/int4_groupwise_tensor.py

Lines changed: 11 additions & 15 deletions
@@ -17,8 +17,7 @@
 )

 __all__ = [
-    "to_fbgemm_int4",
-    "FbgemmInt4Tensor",
+    "Int4GroupwiseTensor",
 ]

 aten = torch.ops.aten
@@ -31,7 +30,7 @@
     pack_int4 = None


-class FbgemmInt4Tensor(TorchAOBaseTensor):
+class Int4GroupwiseTensor(TorchAOBaseTensor):
     tensor_data_attrs = ["packed_weight", "scale", "zero_point"]
     tensor_attributes = ["group_size", "shape"]

@@ -118,7 +117,7 @@ def from_float(
         zero_point = zero_point.to(w.dtype)

         del w
-        return FbgemmInt4Tensor(
+        return Int4GroupwiseTensor(
             packed_weight=wq,
             scale=scale,
             zero_point=zero_point,
@@ -127,7 +126,7 @@
         )


-implements = FbgemmInt4Tensor.implements
+implements = Int4GroupwiseTensor.implements


 @implements([torch.nn.functional.linear, aten.linear.default])
@@ -143,8 +142,8 @@ def _(func, types, args, kwargs):
     res = torch.ops.fbgemm.bf16i4bf16_rowwise(
         input_tensor,
         weight_tensor.packed_weight.contiguous(),
-        weight_tensor.scale,
-        weight_tensor.zero_point,
+        weight_tensor.scale.contiguous(),
+        weight_tensor.zero_point.contiguous(),
     )
     res = res.reshape(*orig_act_size[:-1], orig_out_features)
     if bias is not None:
@@ -185,10 +184,10 @@ def _(func, types, args, kwargs):
     )


-def _same_metadata(self: "FbgemmInt4Tensor", src: "FbgemmInt4Tensor") -> bool:
+def _same_metadata(self: "Int4GroupwiseTensor", src: "Int4GroupwiseTensor") -> bool:
     return (
-        isinstance(self, FbgemmInt4Tensor)
-        and isinstance(src, FbgemmInt4Tensor)
+        isinstance(self, Int4GroupwiseTensor)
+        and isinstance(src, Int4GroupwiseTensor)
         and self.shape == src.shape
         and self.packed_weight.shape == src.packed_weight.shape
         and self.scale.shape == src.scale.shape
@@ -287,9 +286,6 @@ def _(func, types, args, kwargs):
     return return_and_correct_aliasing(func, args, kwargs, new)


-to_fbgemm_int4 = FbgemmInt4Tensor.from_float
-
-
 if TORCH_VERSION_AT_LEAST_2_5:
-    # Allow a model with FbgemmInt4Tensor weights to be loaded with `weights_only=True`
-    torch.serialization.add_safe_globals([FbgemmInt4Tensor])
+    # Allow a model with Int4GroupwiseTensor weights to be loaded with `weights_only=True`
+    torch.serialization.add_safe_globals([Int4GroupwiseTensor])
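The add_safe_globals registration above is what lets checkpoints containing Int4GroupwiseTensor weights load with weights_only=True (the behavior exercised by test_serialization_bc.py). A hedged sketch of that round trip, reusing the config from the renamed test's setUp; the toy model, file path, and environment are assumptions, not part of the commit:

import torch
from torchao.quantization import Int4WeightOnlyConfig, quantize_

model = torch.nn.Sequential(
    torch.nn.Linear(256, 512, dtype=torch.bfloat16, device="cuda")
)
# Same config as the renamed test's setUp (non-preshuffled fbgemm int4 groupwise weights).
quantize_(
    model,
    Int4WeightOnlyConfig(group_size=128, use_preshuffle=False, gemm_kernel_choice="fbgemm"),
)

torch.save(model.state_dict(), "int4_groupwise_checkpoint.pt")
# weights_only=True works because Int4GroupwiseTensor was registered via add_safe_globals.
state_dict = torch.load("int4_groupwise_checkpoint.pt", weights_only=True)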
