Pass explicit layout to int4_weight_only

lisjin · lisjin · commit fb7f521eaa85 · 2025-04-23T19:08:16.000-07:00
diff --git a/test/prototype/test_parq.py b/test/prototype/test_parq.py
@@ -11,6 +11,7 @@
 from torch.testing._internal import common_utils
 
 from torchao import quantize_
+from torchao.dtypes import Int4CPULayout, TensorCoreTiledLayout
 from torchao.prototype.parq.optim import (
     ProxHardQuant,
     ProxPARQ,
@@ -24,9 +25,10 @@
 from torchao.prototype.parq.quant.uniform_torchao import _BIT_WIDTH_TO_DTYPE
 from torchao.quantization.granularity import PerGroup
 from torchao.quantization.quant_api import (
-    Int4WeightOnlyConfig,
+    LAYOUT_TO_ZERO_POINT_DOMAIN,
     IntxWeightOnlyConfig,
     _is_linear,
+    int4_weight_only,
 )
 from torchao.quantization.quant_primitives import ZeroPointDomain
 from torchao.utils import TORCH_VERSION_AT_LEAST_2_4, TORCH_VERSION_AT_LEAST_2_6
@@ -66,8 +68,8 @@ def reset_parameters(self):
             if module.bias is not None:
                 nn.init.zeros_(module.bias)
 
-    def example_inputs(self):
-        return torch.randint(1, 10, (1, 256))
+    def example_inputs(self, device=None):
+        return torch.randint(1, 10, (1, 256), device=device)
 
     def forward(self, x):
         x = self.embedding(x)
@@ -78,20 +80,19 @@ def forward(self, x):
         return x
 
 
-def train_loop(model, optimizer, update=True, steps=1):
-    for _ in range(steps):
-        x = model.example_inputs().to(_DEVICE)
-        out = model(x)
-        out.sum().backward()
-        optimizer.step()
-
-
 class TestPARQuantization(common_utils.TestCase):
     def setUp(self):
         torch.manual_seed(123)
         self.model = M(bias=True).to(_DEVICE)
         self.params_quant, self.params_no_quant = split_param_groups(self.model)
 
+    def train_loop(self, optimizer, steps=1):
+        for _ in range(steps):
+            x = self.model.example_inputs(device=_DEVICE)
+            out = self.model(x)
+            out.sum().backward()
+            optimizer.step()
+
     def test_2bit_unif_quantizer_hard_prox(self):
         b = 2
         self.model.reset_parameters()
@@ -102,7 +103,7 @@ def test_2bit_unif_quantizer_hard_prox(self):
         base_optimizer = torch.optim.AdamW(param_groups)
         quantizer = UnifQuantizer()
         optimizer = QuantOptimizer(base_optimizer, quantizer, ProxHardQuant())
-        train_loop(self.model, optimizer)
+        self.train_loop(optimizer)
 
         for child in self.model.children():
             if isinstance(child, nn.Linear):
@@ -122,7 +123,7 @@ def test_ternarybit_lsbq_parq_prox(self):
         optimizer = QuantOptimizer(
             base_optimizer, quantizer, ProxPARQ(anneal_start=0, anneal_end=2)
         )
-        train_loop(self.model, optimizer, steps=3)
+        self.train_loop(optimizer, steps=3)
 
         for child in self.model.children():
             if isinstance(child, nn.Linear):
@@ -136,7 +137,7 @@ def setUp(self):
         torch.manual_seed(123)
 
     @staticmethod
-    def int4_torchao_quantizer(b: int = 4, config=None):
+    def int4_torchao_quantizer(config, b: int = 4):
         # based off torchao.quantization.quant_api._int4_weight_only_transform
         return UnifTorchaoQuantizer(
             symmetric=False,
@@ -145,6 +146,7 @@ def int4_torchao_quantizer(b: int = 4, config=None):
             quant_max=2**b - 1,
             eps=1e-6,
             preserve_zero=False,
+            zero_point_domain=LAYOUT_TO_ZERO_POINT_DOMAIN[type(config.layout)][0],
             config=config,
         )
 
@@ -172,28 +174,31 @@ def compare_quantized_models(
             ref = getattr(m_ref, n).weight.dequantize()
             self.assertTrue(q.equal(ref))
 
-    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Test only enabled for 2.4+")
-    @unittest.skipIf(_DEVICE == "cpu", "Need GPU available")
     @common_utils.parametrize("group_size", [32, 256])
+    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Test only enabled for 2.4+")
     def test_int4_weight_only(self, group_size: int = 32):
         model = M(m=512, n=512).to(torch.bfloat16).to(_DEVICE)
         model.reset_parameters()
 
-        m_ref = copy.deepcopy(model)
-        quantize_(m_ref, Int4WeightOnlyConfig(group_size))
+        m_ref = copy.deepcopy(model).eval()
+        config = int4_weight_only(
+            group_size=group_size,
+            layout=Int4CPULayout() if _DEVICE == "cpu" else TensorCoreTiledLayout(8),
+        )
+        quantize_(m_ref, config)
 
         b = 4
-        quantizer = self.int4_torchao_quantizer()
+        quantizer = self.int4_torchao_quantizer(config)
         self.compare_quantized_models(model, m_ref, quantizer, b, group_size)
 
-    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_6, "Test only enabled for 2.6+")
     @common_utils.parametrize("b", [2, 3, 4, 8])
     @common_utils.parametrize("group_size", [32, 512])
+    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_6, "Test only enabled for 2.6+")
     def test_intx_weight_only(self, b: int = 2, group_size: int = 32):
         model = M(m=512, n=512).to(_DEVICE)
         model.reset_parameters()
 
-        m_ref = copy.deepcopy(model)
+        m_ref = copy.deepcopy(model).eval()
         quantize_(
             m_ref,
             IntxWeightOnlyConfig(
@@ -214,8 +219,11 @@ def test_int4_weight_only_e2e(self, group_size: int = 32):
         model = M(m=512, n=512).to(torch.bfloat16).to(_DEVICE)
         model.reset_parameters()
 
-        m_ref = copy.deepcopy(model)
-        config = Int4WeightOnlyConfig(group_size)
+        m_ref = copy.deepcopy(model).eval()
+        config = int4_weight_only(
+            group_size=group_size,
+            layout=TensorCoreTiledLayout(8),
+        )
         quantize_(m_ref, config)
 
         b = 4
@@ -226,7 +234,7 @@ def test_int4_weight_only_e2e(self, group_size: int = 32):
         ]
         base_optimizer = torch.optim.AdamW(param_groups)
 
-        quantizer = self.int4_torchao_quantizer(config=config)
+        quantizer = self.int4_torchao_quantizer(config)
         optimizer = QuantOptimizer(
             base_optimizer, quantizer, ProxHardQuant(), quant_per_channel=True
         )
@@ -238,6 +246,7 @@ def test_int4_weight_only_e2e(self, group_size: int = 32):
         orig_model = copy.deepcopy(model)  # save copy of PARQ quantized model
 
         # equivalent to torchao's convert step
+        model.eval()
         with torch.no_grad():
             optimizer.restore_latent_params()
         quantize_(model, quantizer.config)
diff --git a/torchao/prototype/parq/quant/uniform_torchao.py b/torchao/prototype/parq/quant/uniform_torchao.py
@@ -36,7 +36,7 @@ def __init__(
         quant_max: Optional[Union[int, float]] = None,
         eps: Optional[float] = None,
         preserve_zero: bool = True,
-        zero_point_domain: ZeroPointDomain = ZeroPointDomain.FLOAT,
+        zero_point_domain: Optional[ZeroPointDomain] = ZeroPointDomain.NONE,
         config: Optional[AOBaseConfig] = None,
     ) -> None:
         super().__init__(center=False)