This library applies QAT without modifying model-level code. It instead interfaces with the optimizer, separating QAT into two components:

* quantization method: computing the best set of discrete, quantized values
* proximal mapping: projection of weights onto quantized values

## PARQ vs. torchao

There are two main QAT interfaces in torchao:

- Modules (e.g., `torch.nn.Linear`) are swapped with their quantized counterparts (e.g., `Int4WeightOnlyQATLinear`). See [Quantizer API (legacy)](torchao/quantization/qat#quantizer-api-legacy) for details; a rough sketch follows this list.
- The tensor subclass approach instead operates at a finer level of granularity. It replaces `torch.Tensor` instances with quantized `AffineQuantizedTensor` ones. The [`quantize_` API](quantization/qat#quantize_-api-recommended) uses this method by default.
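
For comparison, a minimal sketch of the module-swap flow, assuming the `Int8DynActInt4WeightQATQuantizer` class from the linked QAT docs (the class choice and its defaults here are illustrative, not prescriptive):

```python
import torch.nn as nn
from torchao.quantization.qat import Int8DynActInt4WeightQATQuantizer

model = nn.Sequential(nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 64))

# prepare: swap nn.Linear modules for fake-quantized QAT counterparts
qat_quantizer = Int8DynActInt4WeightQATQuantizer()
model = qat_quantizer.prepare(model)

# ... train with fake-quantized forward passes ...

# convert: swap back and materialize the quantized weights
model = qat_quantizer.convert(model)
```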
PARQ is conceptually similar to the tensor subclass interface. It quantizes tensors through the optimizer (i.e., `optimizer.param_groups[i]["params"]`) without modifying the model.
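
As a short illustration of that mechanism (the `quant_bits` group key is an assumption for this sketch; the optimizer-only interface below shows the intended setup), the weights to be quantized simply live in a dedicated parameter group of a standard optimizer:

```python
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(128, 128), nn.ReLU(), nn.Linear(128, 10))

# weight matrices go in a group marked for quantization; biases stay full precision
params_quant = [p for p in model.parameters() if p.dim() > 1]
params_no_quant = [p for p in model.parameters() if p.dim() == 1]

base_optimizer = torch.optim.SGD(
    [{"params": params_quant, "quant_bits": 2}, {"params": params_no_quant}],
    lr=0.1,
)

# PARQ reads and updates these tensors through base_optimizer.param_groups[0]["params"];
# the model's modules are never replaced.
```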
An example PARQ flow and its torchao equivalent are shown below. The prepare stage takes place before training, while the convert stage runs after training to produce a quantized model.
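
As a rough sketch of the prepare and train stages (import paths and constructor arguments below are assumptions and may differ from the actual flow):

```python
import torch
import torch.nn as nn
from torchao.prototype.parq.optim import ProxHardQuant, QuantOptimizer
from torchao.prototype.parq.quant import UnifTorchaoQuantizer

model = nn.Linear(64, 64)

# prepare (before training): wrap a standard optimizer; the model is untouched
base_optimizer = torch.optim.AdamW(
    [{"params": model.parameters(), "quant_bits": 4}], lr=1e-3
)
optimizer = QuantOptimizer(base_optimizer, UnifTorchaoQuantizer(), ProxHardQuant())

# train as usual; optimizer.step() projects weights onto their quantized values
for _ in range(3):
    loss = model(torch.randn(8, 64)).pow(2).mean()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

# convert (after training): the torchao convert step in the flow shown in this
# section turns the trained weights into a quantized model
```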
> `UnifTorchaoQuantizer` calls exactly the same quantization primitives as in torchao's tensor subclass interface (see [Affine Quantization Details](torchao/quantization#affine-quantization-details)).
## QAT arguments

| argument | description | choices |
| --- | --- | --- |
|`quant-bits`| bit-width for quantized weights | 0 (ternary), 1-4 |
|`quant-method`| method for determining quantized values |`lsbq`, `uniform`|
|`anneal-start`| start epoch for QAT annealing period | (0, `total_steps` - 1) |
|`anneal-end`| end epoch for QAT annealing period | (`anneal_start`, `total_steps`) |
|`anneal-steepness`| sigmoid steepness for PARQ inverse slope schedule |1-20|

## Optimizer-only interface

The `QuantOptimizer` wrapper takes any `torch.optim.Optimizer` object. It is also initialized with a `Quantizer` and `ProxMap` object. Integration into new training pipelines is simple:
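
The snippet below is a sketch of such an integration; the import paths, class names (`LSBQuantizer`, `ProxPARQ`), and constructor arguments are assumptions that mirror the QAT arguments table above and may not match the current module layout:

```python
import torch
import torch.nn as nn
from torchao.prototype.parq.optim import ProxPARQ, QuantOptimizer
from torchao.prototype.parq.quant import LSBQuantizer

model = nn.Sequential(nn.Linear(128, 128), nn.ReLU(), nn.Linear(128, 10))

# quant-bits: mark the parameters to quantize via a dedicated param group
params_quant = [p for p in model.parameters() if p.dim() > 1]
params_no_quant = [p for p in model.parameters() if p.dim() == 1]
base_optimizer = torch.optim.SGD(
    [{"params": params_quant, "quant_bits": 2}, {"params": params_no_quant}],
    lr=0.1,
)

# quant-method selects the Quantizer; anneal-* configure the proximal map
quantizer = LSBQuantizer()
prox_map = ProxPARQ(anneal_start=0, anneal_end=10, steepness=10)

# the wrapped optimizer is then used exactly like the base optimizer in training
optimizer = QuantOptimizer(base_optimizer, quantizer, prox_map)
```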