Commit cc359e6

Add support for float8 activation for Int4GroupwisePreshuffleTensor

Summary:
Added basic op support (linear and bmm). Both the float8 and bf16 activation variants live in the same Tensor subclass, since the weight dtype is the same; the only difference is whether the activation is quantized. The quantization parameters do differ between the two implementations:

bf16 activation:
* group_scale
* group_zero

fp8 activation:
* group_scale
* row_scale

Test Plan:
python test/dtypes/test_float8_activation_int4_groupwise_preshuffle.py

Reviewers:

Subscribers:

Tasks:

Tags:

stack-info: PR: #2437, branch: jerryzh168/stack/4
1 parent 5971b02 commit cc359e6
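
To make the scale-layout difference in the summary concrete, here is a small illustrative sketch (not the actual Int4GroupwisePreshuffleTensor internals): it assumes an [N, K] bfloat16 weight with group size 128, and all names, shapes, and dtypes below are assumptions for illustration only.

import torch

# Illustrative only: dict names, shapes, and dtypes are assumptions,
# not the real tensor subclass fields.
N, K, group_size = 256, 128, 128

# bf16 activation path: int4 weight carries a per-group scale and zero point.
bf16_act_params = {
    "group_scale": torch.ones(N, K // group_size, dtype=torch.bfloat16),
    "group_zero": torch.zeros(N, K // group_size, dtype=torch.bfloat16),
}

# fp8 activation path: the activation is quantized to float8, so the weight
# carries a per-group scale plus a per-row scale instead of a group zero point.
fp8_act_params = {
    "group_scale": torch.ones(N, K // group_size, dtype=torch.bfloat16),
    "row_scale": torch.ones(N, dtype=torch.bfloat16),
}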

4 files changed, +225 -76 lines changed


test/quantization/quantize_/test_int4_groupwise_preshuffle.py

Lines changed: 67 additions & 24 deletions
@@ -4,11 +4,14 @@
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.

+import tempfile
 import unittest

 import torch
 from torch.testing._internal.common_utils import (
     TestCase,
+    instantiate_parametrized_tests,
+    parametrize,
     run_tests,
 )

@@ -23,6 +26,42 @@
     is_sm_at_least_90,
 )

+BF16_ACT_CONFIG = FbgemmConfig(
+    input_dtype=torch.bfloat16,
+    weight_dtype=torch.int4,
+    output_dtype=torch.bfloat16,
+    block_size=[1, 128],
+    preshuffle=True,
+    activation_dtype_for_int4="bf16",
+)
+
+BF16_ACT_BMM_CONFIG = FbgemmConfig(
+    input_dtype=torch.bfloat16,
+    weight_dtype=torch.int4,
+    output_dtype=torch.bfloat16,
+    block_size=[1, 1, 128],
+    preshuffle=True,
+    activation_dtype_for_int4="bf16",
+)
+
+FP8_ACT_CONFIG = FbgemmConfig(
+    input_dtype=torch.bfloat16,
+    weight_dtype=torch.int4,
+    output_dtype=torch.bfloat16,
+    block_size=[1, 128],
+    preshuffle=True,
+    activation_dtype_for_int4="fp8",
+)
+
+FP8_ACT_BMM_CONFIG = FbgemmConfig(
+    input_dtype=torch.bfloat16,
+    weight_dtype=torch.int4,
+    output_dtype=torch.bfloat16,
+    block_size=[1, 1, 128],
+    preshuffle=True,
+    activation_dtype_for_int4="fp8",
+)
+

 @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_8, "Need pytorch 2.8+")
 @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
@@ -32,33 +71,23 @@
 )
 class TestInt4GroupwisePreshuffleTensor(TestCase):
     def setUp(self):
-        self.config = FbgemmConfig(
-            input_dtype=torch.bfloat16,
-            weight_dtype=torch.int4,
-            output_dtype=torch.bfloat16,
-            block_size=[1, 128],
-            preshuffle=True,
-        )
-        self.bmm_config = FbgemmConfig(
-            input_dtype=torch.bfloat16,
-            weight_dtype=torch.int4,
-            output_dtype=torch.bfloat16,
-            block_size=[1, 1, 128],
-            preshuffle=True,
-        )
         self.GPU_DEVICES = ["cuda"] if torch.cuda.is_available() else []

-    def test_linear(self):
+    @parametrize("config", [BF16_ACT_CONFIG, FP8_ACT_CONFIG])
+    def test_linear(self, config):
         dtype = torch.bfloat16
         device = "cuda"
         input = torch.randn(1, 128, dtype=dtype, device=device)
         linear = torch.nn.Linear(128, 256, dtype=dtype, device=device)
         original = linear(input)
-        quantize_(linear, self.config)
+        quantize_(linear, config)
         quantized = linear(input)
         self.assertTrue(compute_error(original, quantized) > 20)

-    def test_bmm(self):
+    # Note: this order will error out: `Got bad cuda status: an illegal memory access was encountered at line: 449`
+    # @parametrize("bmm_config", [BF16_ACT_BMM_CONFIG, FP8_ACT_BMM_CONFIG])
+    @parametrize("bmm_config", [FP8_ACT_BMM_CONFIG, BF16_ACT_BMM_CONFIG])
+    def test_bmm(self, bmm_config):
         class M(torch.nn.Module):
             def __init__(self, weight):
                 super().__init__()
@@ -74,32 +103,46 @@ def forward(self, x):
         m = M(weight).eval()
         original = m(input)
         m.weight = torch.nn.Parameter(m.weight.transpose(1, 2).contiguous())
-        quantize_(m, self.bmm_config, filter_fn=lambda x, fqn: True)
+        quantize_(m, bmm_config, filter_fn=lambda x, fqn: True)
         quantized = m(input)
         self.assertTrue(compute_error(original, quantized) > 18)

-    def test_to_device(self):
+    @parametrize("config", [BF16_ACT_CONFIG, FP8_ACT_CONFIG])
+    def test_to_device(self, config):
         for device in self.GPU_DEVICES:
             linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
-            quantize_(linear, self.config)
+            quantize_(linear, config)
             linear.to(device)

             linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
-            quantize_(linear, self.config)
+            quantize_(linear, config)
             linear.to(device=device)

             linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
-            quantize_(linear, self.config)
+            quantize_(linear, config)
             linear.to(device)

-    def test_module_path(self):
+    @parametrize("config", [BF16_ACT_CONFIG, FP8_ACT_CONFIG])
+    def test_module_path(self, config):
         linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
-        quantize_(linear, self.config)
+        quantize_(linear, config)
         self.assertEqual(
             str(type(linear.weight)),
             "<class 'torchao.quantization.Int4GroupwisePreshuffleTensor'>",
         )

+        with tempfile.NamedTemporaryFile() as f:
+            torch.save(linear.state_dict(), f)
+            f.seek(0)
+            state_dict = torch.load(f)
+            self.assertEqual(
+                str(type(state_dict["weight"])),
+                "<class 'torchao.quantization.Int4GroupwisePreshuffleTensor'>",
+            )
+
+
+instantiate_parametrized_tests(TestInt4GroupwisePreshuffleTensor)
+

 if __name__ == "__main__":
     run_tests()
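
For readers unfamiliar with the parametrization helpers the test now uses, here is a minimal standalone sketch of the pattern: parametrize attaches a list of values to a test method and instantiate_parametrized_tests expands it into one generated test per value. ExampleTest and its parameter values are made up for illustration and are not part of torchao.

import torch
from torch.testing._internal.common_utils import (
    TestCase,
    instantiate_parametrized_tests,
    parametrize,
    run_tests,
)


class ExampleTest(TestCase):
    @parametrize("activation_dtype", ["bf16", "fp8"])
    def test_roundtrip(self, activation_dtype):
        # One generated test per parameter value, so a failure points at the
        # exact configuration that broke.
        x = torch.randn(4, 4)
        self.assertEqual(x.dtype, torch.float32)


instantiate_parametrized_tests(ExampleTest)

if __name__ == "__main__":
    run_tests()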

torchao/dtypes/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -10,6 +10,9 @@
 )
 from .fbgemm_fp8_tensor import FbgemmFp8Tensor, to_fbgemm_fp8
 from .fbgemm_int4_tensor import FbgemmInt4Tensor, to_fbgemm_int4
+from .float8_activation_int4_groupwise_preshuffle_tensor import (
+    Float8ActivationInt4GroupwisePreshuffleTensor,
+)
 from .floatx import (
     CutlassSemiSparseLayout,
     Float8Layout,
@@ -70,4 +73,5 @@
     "FbgemmFp8Tensor",
     "Int8DynamicActInt4WeightCPULayout",
     "Int4GroupwisePreshuffleTensor",
+    "Float8ActivationInt4GroupwisePreshuffleTensor",
 ]

torchao/quantization/quant_api.py

Lines changed: 6 additions & 1 deletion
@@ -2040,6 +2040,8 @@ class FbgemmConfig(AOBaseConfig):
         weight_dtype (torch.dtype): weight dtype of the kernel
         output_dtype (torch.dtype): output dtype of the kernel
         group_size (int): The group size for weight
+        preshuffle (bool): whether preshuffle the weights or not
+        activation_dtype_for_int4 (str): the dtype for activation for int4 weight, either bf16 or fp8
     """

     input_dtype: torch.dtype
@@ -2048,6 +2050,7 @@
     block_size: Optional[List[int]] = None
     activation_scale_ub: Optional[float] = None
     preshuffle: bool = False
+    activation_dtype_for_int4: str = "bf16"


 @register_quantize_module_handler(FbgemmConfig)
@@ -2067,7 +2070,9 @@ def _(module: torch.nn.Module, config: FbgemmConfig) -> torch.nn.Module:
     ):
         if config.preshuffle:
             weight = Int4GroupwisePreshuffleTensor.from_float(
-                module.weight, config.block_size
+                module.weight,
+                config.block_size,
+                activation_dtype=config.activation_dtype_for_int4,
             )
         else:
             weight = to_fbgemm_int4(
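
Putting the config change together with the tests above, a minimal end-user sketch follows; it assumes FbgemmConfig and quantize_ are importable from torchao.quantization as the test file does, and mirrors FP8_ACT_CONFIG from the test.

import torch
from torchao.quantization import FbgemmConfig, quantize_

# Same shape as FP8_ACT_CONFIG in the test file; activation_dtype_for_int4 is
# the new field added in this commit and defaults to "bf16".
fp8_act_config = FbgemmConfig(
    input_dtype=torch.bfloat16,
    weight_dtype=torch.int4,
    output_dtype=torch.bfloat16,
    block_size=[1, 128],
    preshuffle=True,
    activation_dtype_for_int4="fp8",
)

linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
quantize_(linear, fp8_act_config)  # weight becomes Int4GroupwisePreshuffleTensor
out = linear(torch.randn(1, 128, dtype=torch.bfloat16, device="cuda"))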
