Add lut quantized embedding.

szyszyzys · liangel-02 · commit 959c29b13b97 · 2025-08-25T11:15:09.000-07:00
Differential Revision: D79750002 Pull Request resolved: #2824
diff --git a/torchao/prototype/quantization/codebook_groupwise/__init__.py b/torchao/prototype/quantization/codebook_groupwise/__init__.py
@@ -1,4 +1,14 @@
-from .api import GroupwiseLutWeightConfig
+from .api import (
+    EmbeddingLutQuantizer,
+    GroupwiseLutWeightConfig,
+    QuantizedLutEmbedding,
+)
 from .codebook_quantized_tensor import CodebookQuantizedPackedTensor
 
-__all__ = ["CodebookQuantizedPackedTensor", "GroupwiseLutWeightConfig"]
+# Every name listed here must be imported above, otherwise `import *` fails.
+__all__ = [
+    "CodebookQuantizedPackedTensor",
+    "GroupwiseLutWeightConfig",
+    "QuantizedLutEmbedding",
+    "EmbeddingLutQuantizer",
+]
diff --git a/torchao/prototype/quantization/codebook_groupwise/api.py b/torchao/prototype/quantization/codebook_groupwise/api.py
@@ -8,6 +8,7 @@
 from typing import List, Optional
 
 import torch
+import torch.nn as nn
 
 from torchao.core.config import AOBaseConfig
 from torchao.prototype.quantization.codebook_coreml.codebook_quantized_tensor import (
@@ -16,6 +17,12 @@
 from torchao.prototype.quantization.codebook_groupwise.codebook_quantized_tensor import (
     CodebookQuantizedPackedTensor,
 )
+from torchao.prototype.quantization.codebook_utils.codebook_utils import (
+    block_shape_to_group_size,
+)
+from torchao.quantization.quant_primitives import (
+    _DTYPE_TO_BIT_WIDTH,
+)
 from torchao.quantization.transform_module import register_quantize_module_handler
 
 
@@ -98,9 +105,11 @@ def __post_init__(self):
             raise ValueError(
                 "`lut_block_shape` must contain exactly one '-1' to specify the grouping dimension."
             )
+        if self.has_scale == True:
+            raise ValueError("currently only support lut quantization without scale")
 
         # 3. Validate scale_block_shape if it exists
-        if self.scale_block_shape is not None:
+        if self.has_scale and self.scale_block_shape is not None:
             if not (
                 isinstance(self.scale_block_shape, list)
                 and len(self.scale_block_shape) == 2
@@ -142,3 +151,170 @@ def _groupwise_lut_weight_transform(
         module.weight.data.copy_(dequantized_weight)
 
     return module
+
+
+class QuantizedLutEmbedding(nn.Module):
+    """
+    A PyTorch module that holds a LUT-based quantized embedding layer and
+    performs the forward pass via the `torch.ops.torchao._embedding_lut_{N}bit` op.
+
+    This module should be created from a floating-point nn.Embedding module
+    using the `from_float` classmethod.
+    """
+
+    def __init__(
+        self, config: GroupwiseLutWeightConfig, num_embeddings: int, embedding_dim: int
+    ):
+        super().__init__()
+        # Store config and metadata needed for the forward pass
+        self.config = config
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        self.bit_width = _DTYPE_TO_BIT_WIDTH[config.code_dtype]
+
+        # Empty placeholder; `from_float` re-registers it with the packed data
+        self.register_buffer("packed_weights", torch.empty(0, dtype=torch.uint8))
+
+    @classmethod
+    def from_float(
+        cls, float_embedding: nn.Embedding, config: GroupwiseLutWeightConfig
+    ) -> "QuantizedLutEmbedding":
+        """
+        Creates a quantized embedding module from a floating-point nn.Embedding.
+
+        Args:
+            float_embedding (nn.Embedding): The original, trained embedding module.
+            config (GroupwiseLutWeightConfig): The configuration for quantization.
+
+        Returns:
+            QuantizedLutEmbedding: A new module with quantized and packed weights.
+
+        Raises:
+            TypeError: If `float_embedding` is not an nn.Embedding module.
+        """
+        if not isinstance(float_embedding, nn.Embedding):
+            raise TypeError("Input must be an nn.Embedding module.")
+
+        weight = float_embedding.weight.data
+        num_embeddings, embedding_dim = weight.shape
+
+        # Quantize the float weight into integer codes plus a codebook (the LUT)
+        quantized_tensor = CodebookQuantizedTensor.from_float(
+            weight, code_dtype=config.code_dtype, block_size=config.lut_block_shape
+        )
+        codes = quantized_tensor.codes
+        codebook = quantized_tensor.codebook.to(torch.float32)
+        # Currently only support lut quantization without scale. Update this when we support scale.
+        scales = None
+
+        # Pack the quantized data; a scale group size of -1 means "no scale block shape"
+        bit_width = _DTYPE_TO_BIT_WIDTH[config.code_dtype]
+        packer_op = getattr(torch.ops.torchao, f"_pack_embedding_lut_{bit_width}bit")
+        packed_weights = packer_op(
+            codes,
+            codebook,
+            block_shape_to_group_size(
+                config.scale_block_shape, (num_embeddings, embedding_dim)
+            )
+            if config.scale_block_shape
+            else -1,
+            block_shape_to_group_size(
+                config.lut_block_shape, (num_embeddings, embedding_dim)
+            ),
+            scales,
+        )
+
+        quantized_module = cls(config, num_embeddings, embedding_dim)
+        quantized_module.register_buffer("packed_weights", packed_weights)
+
+        return quantized_module
+
+    def forward(self, indices: torch.Tensor) -> torch.Tensor:
+        """
+        Performs the embedding lookup; returns shape (*indices.shape, embedding_dim).
+        """
+        forward_op = getattr(torch.ops.torchao, f"_embedding_lut_{self.bit_width}bit")
+
+        # Indices are flattened for the op; group sizes are derived from the config
+        result = forward_op(
+            self.packed_weights,
+            indices.reshape(-1),
+            self.num_embeddings,
+            self.embedding_dim,
+            block_shape_to_group_size(
+                self.config.scale_block_shape, (self.num_embeddings, self.embedding_dim)
+            )
+            if self.config.scale_block_shape
+            else -1,
+            block_shape_to_group_size(
+                self.config.lut_block_shape, (self.num_embeddings, self.embedding_dim)
+            ),
+            self.config.has_scale,
+        )
+        return result.reshape(*indices.shape, self.embedding_dim).to(
+            self.config.weight_dtype
+        )
+
+    def __repr__(self):
+        return (
+            f"QuantizedLutEmbedding(num_embeddings={self.num_embeddings}, "
+            f"embedding_dim={self.embedding_dim}, bit_width={self.bit_width}, "
+            f"lut_block_shape={self.config.lut_block_shape})"
+        )
+
+
+class EmbeddingLutQuantizer:
+    """
+    Replaces every nn.Embedding child in a model with a QuantizedLutEmbedding, or with
+    a frozen nn.Embedding of dequantized weights when `config.use_qdq_reference` is set.
+    """
+
+    def __init__(self, config: GroupwiseLutWeightConfig):
+        """
+        Initializes the quantizer with a single, comprehensive configuration object.
+
+        Args:
+            config (GroupwiseLutWeightConfig): The configuration that defines
+                how all embeddings should be quantized.
+        """
+        # One config is shared by every embedding that `quantize` replaces.
+        self.config = config
+
+    def quantize(self, model: nn.Module) -> nn.Module:
+        """
+        Recursively replaces all nn.Embedding children of `model` (not `model` itself).
+
+        Args:
+            model (nn.Module): The model to be quantized.
+
+        Returns:
+            nn.Module: The same `model` object, with embedding layers replaced in place.
+        """
+        self._replace_embedding(model)
+        return model
+
+    def _replace_embedding(self, module: nn.Module):
+        for name, child in module.named_children():
+            if isinstance(child, nn.Embedding):
+                if self.config.use_qdq_reference:
+                    weight = child.weight.data
+
+                    # 1. Quantize, then dequantize back to `config.weight_dtype` in Python
+                    quantized_tensor = CodebookQuantizedTensor.from_float(
+                        weight,
+                        code_dtype=self.config.code_dtype,
+                        block_size=self.config.lut_block_shape,
+                    )
+                    ref_weight = quantized_tensor.dequantize(self.config.weight_dtype)
+
+                    # 2. Frozen nn.Embedding; NOTE(review): only the weight is carried over from `child`
+                    ref_embedding = nn.Embedding.from_pretrained(
+                        ref_weight, freeze=True
+                    )
+                    setattr(module, name, ref_embedding)
+
+                else:
+                    q_embedding = QuantizedLutEmbedding.from_float(child, self.config)
+                    setattr(module, name, q_embedding)
+            else:  # not an embedding: recurse into its children
+                self._replace_embedding(child)