Add lut quantized embedding. (#2824)

szyszyzys · facebook-github-bot · commit f93433f3c196 · 2025-08-20T15:37:45.000-07:00
Summary: Pull Request resolved: #2824 Reviewed By: metascroy Differential Revision: D79750002
diff --git a/torchao/prototype/quantization/codebook_groupwise/__init__.py b/torchao/prototype/quantization/codebook_groupwise/__init__.py
@@ -1,4 +1,14 @@
-from .api import GroupwiseLutWeightConfig
+from .api import (
+    EmbeddingLutQuantizer,
+    GroupwiseLutWeightConfig,
+    QuantizedLutEmbedding,
+)
 from .codebook_quantized_tensor import CodebookQuantizedPackedTensor
 
-__all__ = ["CodebookQuantizedPackedTensor", "GroupwiseLutWeightConfig"]
+# Every name listed here must be imported above; otherwise a star-import of this package raises AttributeError.
+__all__ = [
+    "CodebookQuantizedPackedTensor",
+    "EmbeddingLutQuantizer",
+    "GroupwiseLutWeightConfig",
+    "QuantizedLutEmbedding",
+]
diff --git a/torchao/prototype/quantization/codebook_groupwise/api.py b/torchao/prototype/quantization/codebook_groupwise/api.py
@@ -3,13 +3,12 @@
 #
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
-import hashlib
-import os
 import types
 from dataclasses import dataclass, field
 from typing import List, Optional
 
 import torch
+import torch.nn as nn
 
 from torchao.core.config import AOBaseConfig
 from torchao.prototype.quantization.codebook_coreml.codebook_quantized_tensor import (
@@ -18,7 +17,13 @@
 from torchao.prototype.quantization.codebook_groupwise.codebook_quantized_tensor import (
     CodebookQuantizedPackedTensor,
 )
+from torchao.prototype.quantization.codebook_utils.codebook_utils import (
+    block_shape_to_group_size,
+)
 from torchao.quantization.transform_module import register_quantize_module_handler
+from torchao.quantization.quant_primitives import (
+    _DTYPE_TO_BIT_WIDTH,
+)
 
 
 def _get_linear_extra_repr_for_lut(self) -> str:
@@ -100,9 +105,11 @@ def __post_init__(self):
             raise ValueError(
                 "`lut_block_shape` must contain exactly one '-1' to specify the grouping dimension."
             )
+        if self.has_scale:  # LUT quantization with scales is not supported yet
+            raise ValueError("currently only support lut quantization without scale")
 
         # 3. Validate scale_block_shape if it exists
-        if self.scale_block_shape is not None:
+        if self.has_scale and self.scale_block_shape is not None:
             if not (
                 isinstance(self.scale_block_shape, list)
                 and len(self.scale_block_shape) == 2
@@ -144,3 +151,194 @@ def _groupwise_lut_weight_transform(
         module.weight.data.copy_(dequantized_weight)
 
     return module
+
+
+class QuantizedLutEmbedding(nn.Module):
+    """
+    A PyTorch module that holds a LUT-based quantized embedding layer.
+
+    The forward pass dispatches to `torch.ops.torchao._embedding_lut_{bit_width}bit`,
+    where `bit_width` is looked up from `config.code_dtype`.
+
+    This module should be created from a floating-point nn.Embedding module
+    using the `from_float` classmethod.
+    """
+
+    def __init__(
+        self, config: GroupwiseLutWeightConfig, num_embeddings: int, embedding_dim: int
+    ):
+        """
+        Args:
+            config (GroupwiseLutWeightConfig): The configuration for quantization.
+            num_embeddings (int): First dimension of the original float weight.
+            embedding_dim (int): Second dimension of the original float weight.
+        """
+        super().__init__()
+        # Store config and metadata needed for the forward pass
+        self.config = config
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        self.bit_width = _DTYPE_TO_BIT_WIDTH[config.code_dtype]
+
+        # Empty placeholder; from_float re-registers it with the packed weights.
+        self.register_buffer("packed_weights", torch.empty(0, dtype=torch.uint8))
+
+    @staticmethod
+    def _group_sizes(
+        config: GroupwiseLutWeightConfig, num_embeddings: int, embedding_dim: int
+    ) -> tuple:
+        """
+        Computes the group sizes passed to the pack and lookup operators.
+
+        Returns:
+            tuple: `(scale_group_size, lut_group_size)`. The scale group size is
+                -1 when `config.scale_block_shape` is unset or empty.
+        """
+        weight_shape = (num_embeddings, embedding_dim)
+        if config.scale_block_shape:
+            scale_group_size = block_shape_to_group_size(
+                config.scale_block_shape, weight_shape
+            )
+        else:
+            scale_group_size = -1
+        lut_group_size = block_shape_to_group_size(config.lut_block_shape, weight_shape)
+        return scale_group_size, lut_group_size
+
+    @classmethod
+    def from_float(
+        cls, float_embedding: nn.Embedding, config: GroupwiseLutWeightConfig
+    ) -> "QuantizedLutEmbedding":
+        """
+        Creates a quantized embedding module from a floating-point nn.Embedding.
+
+        Args:
+            float_embedding (nn.Embedding): The original, trained embedding module.
+            config (GroupwiseLutWeightConfig): The configuration for quantization.
+
+        Returns:
+            QuantizedLutEmbedding: A new module with quantized and packed weights.
+        """
+        assert isinstance(float_embedding, nn.Embedding), (
+            "Input must be an nn.Embedding module."
+        )
+
+        weight = float_embedding.weight.data
+        num_embeddings, embedding_dim = weight.shape
+
+        # Quantize the float weight into codes and a codebook
+        quantized_tensor = CodebookQuantizedTensor.from_float(
+            weight, code_dtype=config.code_dtype, block_size=config.lut_block_shape
+        )
+        codes = quantized_tensor.codes
+        codebook = quantized_tensor.codebook.to(torch.float32)
+        # Currently only support lut quantization without scale. Update this when we support scale.
+        scales = None
+
+        # Pack the quantized data
+        bit_width = _DTYPE_TO_BIT_WIDTH[config.code_dtype]
+        packer_op = getattr(torch.ops.torchao, f"_pack_embedding_lut_{bit_width}bit")
+        scale_group_size, lut_group_size = cls._group_sizes(
+            config, num_embeddings, embedding_dim
+        )
+        packed_weights = packer_op(
+            codes, codebook, scale_group_size, lut_group_size, scales
+        )
+
+        # Create and populate the new quantized module
+        quantized_module = cls(config, num_embeddings, embedding_dim)
+        quantized_module.register_buffer("packed_weights", packed_weights)
+
+        return quantized_module
+
+    def forward(self, indices: torch.Tensor) -> torch.Tensor:
+        """
+        Performs the embedding lookup using the packed weights.
+
+        Args:
+            indices (torch.Tensor): Indices to look up; flattened before the op call.
+
+        Returns:
+            torch.Tensor: Tensor of shape `(*indices.shape, embedding_dim)` cast to
+                `config.weight_dtype`.
+        """
+        forward_op = getattr(torch.ops.torchao, f"_embedding_lut_{self.bit_width}bit")
+        scale_group_size, lut_group_size = self._group_sizes(
+            self.config, self.num_embeddings, self.embedding_dim
+        )
+
+        result = forward_op(
+            self.packed_weights,
+            indices.reshape(-1),
+            self.num_embeddings,
+            self.embedding_dim,
+            scale_group_size,
+            lut_group_size,
+            self.config.has_scale,
+        )
+        return result.reshape(*indices.shape, self.embedding_dim).to(
+            self.config.weight_dtype
+        )
+
+    def __repr__(self):
+        return (
+            f"QuantizedLutEmbedding(num_embeddings={self.num_embeddings}, "
+            f"embedding_dim={self.embedding_dim}, bit_width={self.bit_width}, "
+            f"lut_block_shape={self.config.lut_block_shape})"
+        )
+
+
+class EmbeddingLutQuantizer:
+    """
+    Swaps every nn.Embedding child found in a model for a LUT-quantized
+    replacement described by a single GroupwiseLutWeightConfig.
+    """
+
+    def __init__(self, config: GroupwiseLutWeightConfig):
+        """
+        Args:
+            config (GroupwiseLutWeightConfig): The configuration applied to
+                every embedding that gets replaced.
+        """
+        self.config = config
+
+    def quantize(self, model: nn.Module) -> nn.Module:
+        """
+        Replaces the nn.Embedding submodules of `model` in place.
+
+        Args:
+            model (nn.Module): The model to be quantized.
+
+        Returns:
+            nn.Module: The same `model` object that was passed in.
+        """
+        self._replace_embedding(model)
+        return model
+
+    def _replace_embedding(self, module: nn.Module):
+        """
+        Recursively walks `module`, swapping each nn.Embedding child in place.
+
+        With `config.use_qdq_reference` set, the replacement is a frozen
+        nn.Embedding holding the quantize -> dequantize weight; otherwise it
+        is a QuantizedLutEmbedding.
+        """
+        for child_name, submodule in module.named_children():
+            if not isinstance(submodule, nn.Embedding):
+                self._replace_embedding(submodule)
+                continue
+
+            if not self.config.use_qdq_reference:
+                replacement = QuantizedLutEmbedding.from_float(submodule, self.config)
+            else:
+                # Run the full quantize -> dequantize pipeline in Python and
+                # wrap the result in a standard nn.Embedding.
+                dequantized = CodebookQuantizedTensor.from_float(
+                    submodule.weight.data,
+                    code_dtype=self.config.code_dtype,
+                    block_size=self.config.lut_block_shape,
+                ).dequantize(self.config.weight_dtype)
+                replacement = nn.Embedding.from_pretrained(
+                    dequantized, freeze=True
+                )
+
+            setattr(module, child_name, replacement)
diff --git a/torchao/prototype/quantization/codebook_utils/codebook_utils.py b/torchao/prototype/quantization/codebook_utils/codebook_utils.py
@@ -22,6 +22,66 @@
 )
 from torchao.quantization.quant_primitives import _DTYPE_TO_BIT_WIDTH
 
+
+def group_size_to_block_shapes(
+    lut_group_size: int,
+    tensor_shape: Tuple[int, int],
+    scale_group_size: Optional[int] = None,
+) -> Tuple[List[int], Optional[List[int]]]:
+    """
+    Translates legacy integer-based group sizes into the new block_shape list format.
+
+    This function encodes the implicit assumptions of the old system:
+    - LUTs were always grouped by rows.
+    - Scales were always grouped by columns.
+
+    Args:
+        lut_group_size (int): The total number of elements that shared a single LUT.
+            Must be a positive multiple of the number of columns K.
+        tensor_shape (Tuple[int, int]): The shape of the weight tensor (N, K).
+            Only K is used, to convert `lut_group_size` into a row count.
+        scale_group_size (Optional[int]): The number of elements (columns) that
+            shared a single scale factor. None or a non-positive value means
+            scales are not used.
+
+    Returns:
+        A tuple containing:
+        - lut_block_shape (List[int]): `[lut_group_size // K, -1]`.
+        - scale_block_shape (Optional[List[int]]): `[1, scale_group_size]`, or
+          None when scales are not used.
+
+    Raises:
+        ValueError: If `lut_group_size` is not a positive multiple of K, or if
+            K is not divisible by a positive `scale_group_size`.
+    """
+    _, k_cols = tensor_shape
+
+    # --- 1. Translate LUT Group Size ---
+    # Zero and negative multiples of K pass the divisibility test below, so
+    # they are rejected explicitly instead of yielding a meaningless shape.
+    if lut_group_size <= 0:
+        raise ValueError(
+            f"lut_group_size ({lut_group_size}) must be a positive integer."
+        )
+    if lut_group_size % k_cols != 0:
+        raise ValueError(
+            f"lut_group_size ({lut_group_size}) must be divisible by the number "
+            f"of columns ({k_cols}) for legacy row-grouping."
+        )
+    lut_block_shape = [lut_group_size // k_cols, -1]
+
+    # --- 2. Translate Scale Group Size ---
+    scale_block_shape = None
+    if scale_group_size is not None and scale_group_size > 0:
+        if k_cols % scale_group_size != 0:
+            raise ValueError(
+                f"Number of columns ({k_cols}) must be divisible by "
+                f"scale_group_size ({scale_group_size}) for legacy column-grouping."
+            )
+        scale_block_shape = [1, scale_group_size]
+
+    return lut_block_shape, scale_block_shape
+
 
 def block_shape_to_group_size(block_shape, tensor_shape):
     """Calculates the total number of elements in a group from a block_shape."""