Commit ff4682e
Add fbgemm preshuffle kernel into Int4WeightOnlyConfig and Float8DynamicActivationInt4WeightConfig
Summary: att, we will deprecate FbgemmConfig since it's a single kernel; we'd like to categorize things by derived dtype + packed format.

Test Plan: python test/quantization/quantize_/test_int4_groupwise_preshuffle.py

Reviewers:

Subscribers:

Tasks:

Tags:

stack-info: PR: #2474, branch: jerryzh168/stack/10
1 parent 7e9f224 commit ff4682e
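The new configs replace FbgemmConfig's per-dtype arguments with the derived-dtype + packed-format split described above. A minimal usage sketch for the bf16-activation path, assuming the API added in this commit (the model and shapes are illustrative, not part of the commit):

import torch
from torchao.quantization import Int4WeightOnlyConfig, quantize_

# illustrative bf16 linear layer; any module whose weight's last dim is
# divisible by group_size would work
model = torch.nn.Sequential(
    torch.nn.Linear(256, 512, dtype=torch.bfloat16, device="cuda")
)

# int4 weight, bf16 activation, fbgemm preshuffle packing (added in this commit)
quantize_(model, Int4WeightOnlyConfig(group_size=128, use_preshuffle=True))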

File tree

3 files changed: +56 −35 lines

test/quantization/quantize_/test_int4_groupwise_preshuffle.py

Lines changed: 9 additions & 34 deletions
@@ -16,7 +16,8 @@
 )

 from torchao.quantization import (
-    FbgemmConfig,
+    Float8ActivationInt4WeightConfig,
+    Int4WeightOnlyConfig,
     quantize_,
 )
 from torchao.quantization.utils import compute_error
@@ -26,40 +27,14 @@
     is_sm_at_least_90,
 )

-BF16_ACT_CONFIG = FbgemmConfig(
-    input_dtype=torch.bfloat16,
-    weight_dtype=torch.int4,
-    output_dtype=torch.bfloat16,
-    block_size=[1, 128],
-    preshuffle=True,
-    activation_dtype_for_int4="bf16",
+BF16_ACT_CONFIG = Int4WeightOnlyConfig(
+    group_size=128,
+    use_preshuffle=True,
 )

-BF16_ACT_BMM_CONFIG = FbgemmConfig(
-    input_dtype=torch.bfloat16,
-    weight_dtype=torch.int4,
-    output_dtype=torch.bfloat16,
-    block_size=[1, 1, 128],
-    preshuffle=True,
-    activation_dtype_for_int4="bf16",
-)
-
-FP8_ACT_CONFIG = FbgemmConfig(
-    input_dtype=torch.bfloat16,
-    weight_dtype=torch.int4,
-    output_dtype=torch.bfloat16,
-    block_size=[1, 128],
-    preshuffle=True,
-    activation_dtype_for_int4="fp8",
-)
-
-FP8_ACT_BMM_CONFIG = FbgemmConfig(
-    input_dtype=torch.bfloat16,
-    weight_dtype=torch.int4,
-    output_dtype=torch.bfloat16,
-    block_size=[1, 1, 128],
-    preshuffle=True,
-    activation_dtype_for_int4="fp8",
+FP8_ACT_CONFIG = Float8ActivationInt4WeightConfig(
+    group_size=128,
+    use_preshuffle=True,
 )


@@ -86,7 +61,7 @@ def test_linear(self, config):

     # Note: this order will error out: `Got bad cuda status: an illegal memory access was encountered at line: 449`
     # @parametrize("bmm_config", [BF16_ACT_BMM_CONFIG, FP8_ACT_BMM_CONFIG])
-    @parametrize("bmm_config", [FP8_ACT_BMM_CONFIG, BF16_ACT_BMM_CONFIG])
+    @parametrize("bmm_config", [FP8_ACT_CONFIG, BF16_ACT_CONFIG])
     def test_bmm(self, bmm_config):
         class M(torch.nn.Module):
             def __init__(self, weight):
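The separate *_BMM_CONFIG variants could be dropped because, in the quant_api.py change below, block_size is derived from group_size at quantize time, so 2D linear weights and 3D bmm weights share one config. A small illustration of that derivation (variable names are ours, not from the commit):

import torch

group_size = 128
weight_2d = torch.empty(512, 256)     # linear weight
weight_3d = torch.empty(8, 512, 256)  # bmm weight

# same derivation as _int4_weight_only_quantize_tensor in this commit
block_size_2d = tuple([1] * (weight_2d.ndim - 1) + [group_size])  # (1, 128)
block_size_3d = tuple([1] * (weight_3d.ndim - 1) + [group_size])  # (1, 1, 128)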

torchao/quantization/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -44,6 +44,7 @@
 from .quant_api import (
     CutlassInt4PackedLayout,
     FbgemmConfig,
+    Float8ActivationInt4WeightConfig,
     Float8DynamicActivationFloat8SemiSparseWeightConfig,
     Float8DynamicActivationFloat8WeightConfig,
     Float8MMConfig,
@@ -141,6 +142,7 @@
     "Int8DynamicActivationInt8WeightConfig",
     "Int8DynamicActivationIntxWeightConfig",
     "Int4WeightOnlyConfig",
+    "Float8ActivationInt4WeightConfig",
     "Int8WeightOnlyConfig",
     "Float8WeightOnlyConfig",
     "Float8DynamicActivationFloat8WeightConfig",

torchao/quantization/quant_api.py

Lines changed: 45 additions & 1 deletion
@@ -1115,6 +1115,7 @@ class Int4WeightOnlyConfig(AOBaseConfig):
     zero_point_domain: Optional[ZeroPointDomain] = ZeroPointDomain.NONE
     set_inductor_config: bool = True
     preserve_zero: Optional[bool] = None
+    use_preshuffle: bool = False


 # for BC
@@ -1132,15 +1133,25 @@ def _int4_weight_only_quantize_tensor(weight, config):
     layout = config.layout
     use_hqq = config.use_hqq
     zero_point_domain = config.zero_point_domain
+    use_preshuffle = config.use_preshuffle

     if weight.shape[-1] % group_size != 0:
         logger.info(
             f"Skipping quantizing weight with int4 weight only quantization because the shape of weight {weight.shape} is not compatible with group_size {group_size}"
         )
         return weight

+    block_size = tuple([1 for _ in range(weight.ndim - 1)] + [group_size])
+
+    if use_preshuffle:
+        new_weight = Int4GroupwisePreshuffleTensor.from_float(
+            weight,
+            block_size,
+            activation_dtype="bf16",
+        )
+        return new_weight
+
     mapping_type = MappingType.ASYMMETRIC
-    block_size = tuple([1 for _ in range(weight.dim() - 1)] + [group_size])
     target_dtype = torch.int32
     quant_min = 0
     quant_max = 15
@@ -1212,6 +1223,39 @@ def _int4_weight_only_transform(
     return module


+@dataclass
+class Float8ActivationInt4WeightConfig(AOBaseConfig):
+    group_size: int = 128
+    use_preshuffle: bool = False
+    kernel: str = "fbgemm"
+
+
+@register_quantize_module_handler(Float8ActivationInt4WeightConfig)
+def _(module: torch.nn.Module, config: Int4WeightOnlyConfig) -> torch.nn.Module:
+    assert hasattr(module, "weight"), (
+        "applying int8 weight only quant requires module to have weight attribute"
+        + " but {module} does not have one"
+    )
+    group_size = config.group_size
+    use_preshuffle = config.use_preshuffle
+    kernel = config.kernel
+
+    assert use_preshuffle, (
+        f"only use_preshuffle == True is supported right now, got: {use_preshuffle}"
+    )
+    assert kernel == "fbgemm", f"only fbgemm kernel is supported, got: {kernel}"
+    weight = module.weight
+    block_size = tuple([1 for _ in range(weight.ndim - 1)] + [group_size])
+    new_weight = Int4GroupwisePreshuffleTensor.from_float(
+        module.weight,
+        block_size,
+        activation_dtype="fp8",
+    )
+    module.weight = torch.nn.Parameter(new_weight, requires_grad=False)
+    module.extra_repr = types.MethodType(_linear_extra_repr, module)
+    return module
+
+
 @dataclass
 class Int8WeightOnlyConfig(AOBaseConfig):
     """
