Per-row IntxWeightOnlyConfig test

lisjin · lisjin · commit bea111cab3d3 · 2025-04-22T07:13:07.000-07:00
diff --git a/test/prototype/test_parq.py b/test/prototype/test_parq.py
@@ -122,9 +122,11 @@ def test_ternarybit_lsbq_parq_prox(self):
 
 
 class TestUnifTorchaoQuantizer(common_utils.TestCase):
-    def setUp(self):
+    def __init__(self, methodName, group_size: int = 32):
+        super(TestUnifTorchaoQuantizer, self).__init__(methodName)
         torch.manual_seed(123)
-        self.group_size = 32
+        self.group_size = group_size
+        self.out_dim = 512
 
     def compare_quantized_models(
         self,
@@ -152,14 +154,13 @@ def compare_quantized_models(
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Test only enabled for 2.4+")
     @unittest.skipIf(_DEVICE == "cpu", "Need GPU available")
     def test_int4_weight_only(self):
-        model = M(n=1024, k=1024).to(torch.bfloat16).to(_DEVICE)
+        model = M(m=self.out_dim, n=self.out_dim).to(torch.bfloat16).to(_DEVICE)
         model.reset_parameters()
         m_ref = copy.deepcopy(model)
 
-        config = Int4WeightOnlyConfig(group_size=self.group_size)
-        quantize_(m_ref, config, device=_DEVICE)
+        quantize_(m_ref, Int4WeightOnlyConfig(group_size=self.group_size))
 
-        # copied from torchao.quantization.quant_api._int4_weight_only_transform
+        # based on torchao.quantization.quant_api._int4_weight_only_transform
         b = 4
         quantizer = UnifTorchaoQuantizer(
             symmetric=False,
@@ -169,20 +170,19 @@ def test_int4_weight_only(self):
             eps=1e-6,
             preserve_zero=False,
         )
-        self.assertTrue(
-            quantizer.get_quant_size(b) == quantizer.quant_max - quantizer.quant_min + 1
-        )
         self.compare_quantized_models(model, m_ref, quantizer, b)
 
-    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_6, "Test only enabled for 2.4+")
+    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_6, "Test only enabled for 2.6+")
     @unittest.skipIf(_DEVICE == "cpu", "Need GPU available")
     def test_intx_weight_only(self):
-        model = M(n=512, k=512).to(_DEVICE)
+        model = M(m=self.out_dim, n=self.out_dim).to(_DEVICE)
         model.reset_parameters()
         m_ref = copy.deepcopy(model)
 
         config = IntxWeightOnlyConfig(granularity=PerGroup(self.group_size))
-        quantize_(m_ref, config, device=_DEVICE)
+        quantize_(m_ref, config)
+
+        # based on torchao.quantization.quant_api._intx_weight_only_transform
         b = 8
         q_dtype = torch.int8
         quant_min, quant_max = _DTYPE_TO_QVALUE_BOUNDS[q_dtype]
@@ -195,11 +195,18 @@ def test_intx_weight_only(self):
             preserve_zero=True,
             zero_point_domain=ZeroPointDomain.INT,
         )
-        self.assertTrue(
-            quantizer.get_quant_size(b) == max(abs(quant_min), quant_max) + 1
-        )
         self.compare_quantized_models(model, m_ref, quantizer, b)
 
 
+def load_tests(loader, tests, pattern):  # load_tests hook; adds a per-row intx run
+    suite = unittest.TestSuite()
+    suite.addTests(loader.loadTestsFromTestCase(TestPARQuantization))
+    suite.addTests(loader.loadTestsFromTestCase(TestUnifTorchaoQuantizer))
+
+    group_size = suite._tests[-1].out_dim  # row-wise grouping: one group spans out_dim
+    suite.addTest(TestUnifTorchaoQuantizer("test_intx_weight_only", group_size))
+    return suite  # `tests` and `pattern` are unused; this suite is what gets returned
+
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/torchao/prototype/parq/quant/uniform_torchao.py b/torchao/prototype/parq/quant/uniform_torchao.py
@@ -55,7 +55,7 @@ def q_kwargs(self) -> dict[str, Union[int, float]]:
         }
 
     def get_quant_size(self, b: int) -> int:
-        return 2 ** (b - 1) + 1 if self.mapping_type == MappingType.SYMMETRIC else 2**b
+        return self.quant_max - self.quant_min + 1
 
     def quantize(
         self, p: Tensor, b: int, dim: Optional[int] = None
@@ -73,15 +73,13 @@ def quantize(
             self.mapping_type,
             block_size,
             self.target_dtype,
-            quant_min=self.quant_min,
-            quant_max=self.quant_max,
             eps=self.eps,
             preserve_zero=self.preserve_zero,
-            zero_point_domain=self.zero_point_domain,
+            **self.q_kwargs,
         )
         q_args = (block_size, s, zero_point, self.target_dtype)
         q = quantize_affine(p, *q_args, **self.q_kwargs)
-        q = dequantize_affine(q, *q_args, **self.q_kwargs, output_dtype=p.dtype)
+        q = dequantize_affine(q, *q_args, output_dtype=p.dtype, **self.q_kwargs)
 
         Q = torch.arange(
             self.quant_min, self.quant_max + 1, dtype=self.target_dtype, device=p.device
diff --git a/torchao/swizzle/swizzle_ops.py b/torchao/swizzle/swizzle_ops.py
@@ -30,7 +30,12 @@ def swizzle_mm(aten_op, args, kwargs=None):
     a = args[0]
     b = args[1]
 
-    if torch.is_floating_point(a) and torch.is_floating_point(b) and a.ndim == 2 and b.ndim == 2:
+    if (
+        torch.is_floating_point(a)
+        and torch.is_floating_point(b)
+        and a.ndim == 2
+        and b.ndim == 2
+    ):
         a_is_swizzled = False
         b_is_swizzled = False
         if isinstance(a, SwizzleTensor):