Commit 28f38c4

szyszyzys authored and facebook-github-bot committed

Add the ops for groupwise LUT quantization for embedding (#2823)

Summary: Pull Request resolved: #2823
Reviewed By: metascroy
Differential Revision: D79749992
1 parent bdbdc5e commit 28f38c4

6 files changed: +394 −1 lines changed

torchao/experimental/CMakeLists.txt

Lines changed: 3 additions & 1 deletion

```diff
@@ -134,6 +134,7 @@ if(TORCHAO_BUILD_ATEN_OPS)
     ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_aten.cpp
     ops/groupwise_lowbit_weight_lut/groupwise_lowbit_weight_lut.cpp
     ops/groupwise_lowbit_weight_lut/op_groupwise_lowbit_weight_lut_aten.cpp
+    ops/embedding_lut/op_embedding_groupwise_lowbit_lut_aten.cpp
   )
   list(TRANSFORM _torchao_op_srcs_aten PREPEND "${CMAKE_CURRENT_SOURCE_DIR}/")

@@ -194,7 +195,8 @@ if(TORCHAO_BUILD_EXECUTORCH_OPS)
     ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.cpp
     ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_executorch.cpp
     ops/groupwise_lowbit_weight_lut/groupwise_lowbit_weight_lut.cpp
-    ops/groupwise_lowbit_weight_lut/op_groupwise_lowbit_weight_lut_executorch.cpp)
+    ops/groupwise_lowbit_weight_lut/op_groupwise_lowbit_weight_lut_executorch.cpp
+    ops/embedding_lut/op_embedding_groupwise_lowbit_lut_executorch.cpp)

   list(TRANSFORM _torchao_op_srcs_executorch PREPEND "${CMAKE_CURRENT_SOURCE_DIR}/")
   add_library(torchao_ops_executorch STATIC ${_torchao_op_srcs_executorch})
```
torchao/experimental/ops/embedding_lut/op_embedding_groupwise_lowbit_lut-impl.h

Lines changed: 241 additions & 0 deletions

```cpp
// Copyright (c) Meta Platforms, Inc. and affiliates.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#if defined(TORCHAO_BUILD_CPU_AARCH64)
#include <torchao/experimental/kernels/cpu/aarch64/embedding/embedding_lut.h>
#endif // TORCHAO_BUILD_CPU_AARCH64

#include <torchao/experimental/ops/embedding_lut/packed_weights_header.h>
#include <torchao/experimental/ops/library.h>
#include <torchao/experimental/ops/parallel.h>

template <int weight_nbit>
void check_embedding_lut_inputs(
    const Tensor& packed_weight_indices,
    const Tensor& indices,
    int64_t num_embeddings,
    int64_t embedding_dim,
    int64_t scale_group_size,
    int64_t lut_group_size,
    bool has_scales) {
  // Check packed weights header
  TORCHAO_CHECK(
      packed_weight_indices.dim() == 1, "packed_weight_indices must be 1D");
#ifdef USE_ATEN
  TORCHAO_CHECK(
      packed_weight_indices.dtype() == torch::kInt8,
      "packed_weight_indices must be byte");
#endif // USE_ATEN
  TORCHAO_CHECK(
      packed_weight_indices.size(0) >=
          torchao::ops::PackedWeightsHeader::size(),
      "packed_weight_indices is not large enough to contain a header");

  // Check indices tensor
  TORCHAO_CHECK(indices.dim() == 1, "indices must be 1D");
  TORCHAO_CHECK(
      (indices.dtype() == Tensor_dtype_kInt32) ||
          (indices.dtype() == Tensor_dtype_kInt64),
      "indices must be int32 or int64");

  // Check header
  auto header = torchao::ops::PackedWeightsHeader::read(
      packed_weight_indices.const_data_ptr());
  TORCHAO_CHECK(
      header ==
          torchao::ops::embedding_lut::get_packed_weights_header(
              /*version=*/1,
              weight_nbit,
              num_embeddings,
              embedding_dim,
              scale_group_size,
              lut_group_size,
              has_scales),
      "packed_weights are not compatible with the kernel");
}

#if defined(USE_ATEN) || defined(USE_EXECUTORCH)
template <int weight_nbit>
Tensor embedding_out_cpu(
    const Tensor& packed_weights,
    const Tensor& indices,
    int64_t num_embeddings,
    int64_t embedding_dim,
    int64_t scale_group_size,
    int64_t lut_group_size,
    bool has_scales,
    Tensor& out) {
  check_embedding_lut_inputs<weight_nbit>(
      packed_weights,
      indices,
      num_embeddings,
      embedding_dim,
      scale_group_size,
      lut_group_size,
      has_scales);

  const int num_out = indices.size(0);
  TORCHAO_RESIZE_TENSOR(out, {(int)num_out, (int)embedding_dim});

  const int32_t* index32_ptr = nullptr;
  const int64_t* index64_ptr = nullptr;
  if (indices.dtype() == Tensor_dtype_kInt32) {
    index32_ptr = indices.const_data_ptr<int32_t>();
  } else {
    index64_ptr = indices.const_data_ptr<int64_t>();
  }

  // The actual packed data starts after the header
  const void* packed_data_ptr = packed_weights.const_data_ptr<int8_t>() +
      torchao::ops::PackedWeightsHeader::size();

  torchao::parallel_1d(0, num_out, [&](int64_t idx) {
    int index = (index32_ptr != nullptr) ? index32_ptr[idx] : index64_ptr[idx];
    TORCHAO_CHECK(index >= 0 && index < num_embeddings, "Index out of bounds");

#if defined(TORCHAO_BUILD_CPU_AARCH64)
    torchao::kernels::cpu::aarch64::embedding::
        dequantize_embedding_row_at_idx_lut<weight_nbit>(
            out.mutable_data_ptr<float>() + idx * embedding_dim,
            packed_data_ptr,
            index,
            num_embeddings,
            embedding_dim,
            scale_group_size,
            lut_group_size,
            has_scales);
#else
    TORCHAO_CHECK(false, "Unsupported platform for embedding_lut kernel");
#endif // TORCHAO_BUILD_CPU_AARCH64
  });

  return out;
}
#endif // defined(USE_ATEN) || defined(USE_EXECUTORCH)

#ifdef USE_ATEN
template <int weight_nbit>
Tensor embedding_cpu(
    const Tensor& packed_weights,
    const Tensor& indices,
    int64_t num_embeddings,
    int64_t embedding_dim,
    int64_t scale_group_size,
    int64_t lut_group_size,
    bool has_scales) {
  Tensor output_tensor = torch::empty({0}, torch::kFloat32);
  embedding_out_cpu<weight_nbit>(
      packed_weights,
      indices,
      num_embeddings,
      embedding_dim,
      scale_group_size,
      lut_group_size,
      has_scales,
      output_tensor);
  return output_tensor;
}

template <int weight_nbit>
Tensor pack_embedding_cpu(
    const Tensor& weight_qval_idxs,
    const Tensor& luts,
    int64_t scale_group_size,
    int64_t lut_group_size,
    const std::optional<Tensor>& weight_scales) {
  const bool has_scales = weight_scales.has_value();
  TORCHAO_CHECK(weight_qval_idxs.dim() == 2, "weight_qval_idxs must be 2D");
  const int64_t num_embeddings = weight_qval_idxs.size(0);
  const int64_t embedding_dim = weight_qval_idxs.size(1);

  TORCHAO_CHECK(
      (embedding_dim * weight_nbit) % 8 == 0,
      "Total bits must be a multiple of 8.");

  const size_t packed_embedding_size =
      torchao::kernels::cpu::aarch64::embedding::packed_embedding_size(
          weight_nbit,
          num_embeddings,
          embedding_dim,
          scale_group_size,
          lut_group_size,
          has_scales);
  const size_t total_packed_size =
      torchao::ops::PackedWeightsHeader::size() + packed_embedding_size;

  // Allocate and pack
  auto out = torch::empty({(long)total_packed_size}, torch::kInt8);

  // Write header
  auto header = torchao::ops::embedding_lut::get_packed_weights_header(
      /*version=*/1,
      weight_nbit,
      num_embeddings,
      embedding_dim,
      scale_group_size,
      lut_group_size,
      has_scales);
  header.write(out.mutable_data_ptr());

  void* packed_table_ptr = out.mutable_data_ptr<int8_t>() +
      torchao::ops::PackedWeightsHeader::size();

  // Pack each row
  torchao::parallel_1d(0, num_embeddings, [&](int64_t i) {
#if defined(TORCHAO_BUILD_CPU_AARCH64)
    torchao::kernels::cpu::aarch64::embedding::pack_embedding_row_at_index_lut<
        weight_nbit>(
        packed_table_ptr,
        i,
        weight_qval_idxs.const_data_ptr<uint8_t>(),
        has_scales ? weight_scales->const_data_ptr<float>() : nullptr,
        luts.const_data_ptr<float>(),
        num_embeddings,
        embedding_dim,
        scale_group_size,
        lut_group_size,
        has_scales);
#else
    TORCHAO_CHECK(false, "Unsupported platform for pack_embedding kernel");
#endif // defined(TORCHAO_BUILD_CPU_AARCH64)
  });

  return out;
}

template <int weight_nbit>
Tensor pack_embedding_meta(
    const Tensor& weight_qval_idxs,
    const Tensor& luts,
    int64_t scale_group_size,
    int64_t lut_group_size,
    const std::optional<Tensor>& weight_scales) {
  const int64_t num_embeddings = weight_qval_idxs.size(0);
  const int64_t embedding_dim = weight_qval_idxs.size(1);
  const bool has_scales = weight_scales.has_value();

  TORCHAO_CHECK(
      (embedding_dim * weight_nbit) % 8 == 0,
      "Total bits must be a multiple of 8 for meta function.");

  const size_t packed_embedding_size =
      torchao::kernels::cpu::aarch64::embedding::packed_embedding_size(
          weight_nbit,
          num_embeddings,
          embedding_dim,
          scale_group_size,
          lut_group_size,
          has_scales);
  const size_t total_packed_size =
      torchao::ops::PackedWeightsHeader::size() + packed_embedding_size;

  auto options =
      torch::TensorOptions().device(c10::DeviceType::Meta).dtype(torch::kInt8);
  return torch::empty({(long)total_packed_size}, options);
}
#endif // USE_ATEN
```
torchao/experimental/ops/embedding_lut/op_embedding_groupwise_lowbit_lut_aten.cpp

Lines changed: 71 additions & 0 deletions

```cpp
// Copyright (c) Meta Platforms, Inc. and affiliates.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include <torchao/experimental/ops/embedding_lut/op_embedding_groupwise_lowbit_lut-impl.h>

// This macro defines the operator signatures.
// The signatures match the C++ implementations.
#define DEFINE_LUT_OP(weight_nbit)                                           \
  m.def(                                                                     \
      "_pack_embedding_lut_" #weight_nbit                                    \
      "bit(Tensor weight_qval_idxs, Tensor luts, int scale_group_size, "     \
      "int lut_group_size, Tensor? weight_scales) -> Tensor");               \
  m.def(                                                                     \
      "_embedding_lut_" #weight_nbit                                         \
      "bit(Tensor packed_weights, Tensor indices, int num_embeddings, "      \
      "int embedding_dim, int scale_group_size, int lut_group_size, "        \
      "bool has_scales) -> Tensor");                                         \
  m.def(                                                                     \
      "_embedding_lut_" #weight_nbit                                         \
      "bit.out(Tensor packed_weights, Tensor indices, int num_embeddings, "  \
      "int embedding_dim, int scale_group_size, int lut_group_size, "        \
      "bool has_scales, *, Tensor(a!) out) -> Tensor(a!)");

// This macro registers the CPU implementations for the LUT-based operators.
#define DEFINE_CPU_IMPL(weight_nbit)                                 \
  m.impl(                                                            \
      "_pack_embedding_lut_" #weight_nbit "bit",                     \
      torch::dispatch(                                               \
          c10::DispatchKey::CPU, &pack_embedding_cpu<weight_nbit>)); \
  m.impl(                                                            \
      "_embedding_lut_" #weight_nbit "bit",                          \
      torch::dispatch(                                               \
          c10::DispatchKey::CPU, &embedding_cpu<weight_nbit>));      \
  m.impl(                                                            \
      "_embedding_lut_" #weight_nbit "bit.out",                      \
      torch::dispatch(                                               \
          c10::DispatchKey::CPU, &embedding_out_cpu<weight_nbit>));

// This macro registers the Meta (device-agnostic) implementation for packing.
#define DEFINE_META_IMPL(weight_nbit)                                 \
  m.impl(                                                             \
      "_pack_embedding_lut_" #weight_nbit "bit",                      \
      torch::dispatch(                                                \
          c10::DispatchKey::Meta, &pack_embedding_meta<weight_nbit>));

// Operator definitions
TORCH_LIBRARY_FRAGMENT(torchao, m) {
  DEFINE_LUT_OP(1);
  DEFINE_LUT_OP(2);
  DEFINE_LUT_OP(3);
  DEFINE_LUT_OP(4);
}

// CPU implementations
TORCH_LIBRARY_IMPL(torchao, CPU, m) {
  DEFINE_CPU_IMPL(1);
  DEFINE_CPU_IMPL(2);
  DEFINE_CPU_IMPL(3);
  DEFINE_CPU_IMPL(4);
}

// Meta implementations
TORCH_LIBRARY_IMPL(torchao, Meta, m) {
  DEFINE_META_IMPL(1);
  DEFINE_META_IMPL(2);
  DEFINE_META_IMPL(3);
  DEFINE_META_IMPL(4);
}
```
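With the ATen library built and loaded, these ops are callable from Python under torch.ops.torchao. The sketch below exercises the 4-bit pack and lookup ops using the schemas registered above; the shapes chosen for luts and weight_scales (one 2**nbit-entry table per lut_group_size weights, one scale per scale_group_size weights) and the library path are assumptions, since the diff defines the schemas but not the expected tensor shapes.

```python
import torch

# Assumes torchao was built with TORCHAO_BUILD_ATEN_OPS (and, for the kernel
# path, TORCHAO_BUILD_CPU_AARCH64) and the shared library has been loaded:
# torch.ops.load_library("libtorchao_ops_aten.so")  # path is an assumption

num_embeddings, embedding_dim, nbit = 128, 64, 4
scale_group_size, lut_group_size = 32, 64  # illustrative group sizes

# Assumed shapes: one (2**nbit)-entry LUT per lut_group_size weights and one
# scale per scale_group_size weights, counted over the flattened table.
qidx = torch.randint(0, 2**nbit, (num_embeddings, embedding_dim), dtype=torch.uint8)
luts = torch.randn(num_embeddings * embedding_dim // lut_group_size, 2**nbit)
scales = torch.rand(num_embeddings * embedding_dim // scale_group_size)

packed = torch.ops.torchao._pack_embedding_lut_4bit(
    qidx, luts, scale_group_size, lut_group_size, scales)

indices = torch.tensor([0, 5, 17], dtype=torch.int32)
rows = torch.ops.torchao._embedding_lut_4bit(
    packed, indices, num_embeddings, embedding_dim,
    scale_group_size, lut_group_size, True)  # has_scales
print(rows.shape)  # torch.Size([3, 64])
```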
torchao/experimental/ops/embedding_lut/op_embedding_groupwise_lowbit_lut_executorch.cpp

Lines changed: 44 additions & 0 deletions

```cpp
// Copyright (c) Meta Platforms, Inc. and affiliates.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include <torchao/experimental/ops/embedding_lut/op_embedding_groupwise_lowbit_lut-impl.h>

#define DEFINE_LUT_OP(weight_nbit)                \
  Tensor _op_lut_out_##weight_nbit(               \
      RuntimeContext& ctx,                        \
      const Tensor& packed_weights,               \
      const Tensor& indices,                      \
      const int64_t& num_embeddings,              \
      const int64_t& embedding_dim,               \
      const int64_t& scale_group_size,            \
      const int64_t& lut_group_size,              \
      const bool& has_scales,                     \
      Tensor& out) {                              \
    (void)ctx;                                    \
    embedding_out_cpu<weight_nbit>(               \
        packed_weights,                           \
        indices,                                  \
        num_embeddings,                           \
        embedding_dim,                            \
        scale_group_size,                         \
        lut_group_size,                           \
        has_scales,                               \
        out);                                     \
    return out;                                   \
  }                                               \
  EXECUTORCH_LIBRARY(                             \
      torchao,                                    \
      "_embedding_lut_" #weight_nbit "bit.out",   \
      _op_lut_out_##weight_nbit)

DEFINE_LUT_OP(1);
DEFINE_LUT_OP(2);
DEFINE_LUT_OP(3);
DEFINE_LUT_OP(4);
DEFINE_LUT_OP(5);
DEFINE_LUT_OP(6);
DEFINE_LUT_OP(7);
DEFINE_LUT_OP(8);
```
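The ExecuTorch side registers only the .out variants (for 1 through 8 bits), each a thin wrapper over the same embedding_out_cpu used by ATen. Through the ATen registrations above (which cover 1 through 4 bits), the analogous out-variant call would look like this sketch, continuing the previous example:

```python
# Continues the previous sketch. The .out variant writes into a caller-
# provided tensor, which embedding_out_cpu resizes to (len(indices), dim).
out = torch.empty(0, dtype=torch.float32)
torch.ops.torchao._embedding_lut_4bit.out(
    packed, indices, num_embeddings, embedding_dim,
    scale_group_size, lut_group_size, True, out=out)
assert torch.equal(out, rows)
```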
