Commit fce1e18
Enabling MoE Quantization using linear decomposition [WIP]

Summary: This PR is a first step toward optimizing MoE inference using torchAO. The goal for this step is to enable existing quantization kernels and workflows to work for MoE quantization by decomposing the grouped gemm into a sequence of unbalanced linear ops that can use the existing quantized kernels. To enable this, we had to add support for quantizing these 3D tensors, as well as for slicing and indexing them. The current tests run locally and will be added to the repo once they are working. Currently int8wo and int8dq work for both multi-token and single-token MoE inference, while int4wo is being finished up. TODO: move the test set into ao, move the quantizable MoE module code into ao, test on a HF model definition.
1 parent a81322e commit fce1e18
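To make the decomposition concrete, here is a minimal sketch (not code from this commit; all names are illustrative, not torchao APIs): the grouped gemm over experts becomes a loop of ordinary, unbalanced F.linear calls, one per expert, each of which can use the existing quantized linear kernels.

import torch
import torch.nn.functional as F

def moe_mm_decomposed(x, w, expert_indices):
    # x: (num_tokens, hidden_dim) activations
    # w: (num_experts, out_dim, hidden_dim) stacked expert weights; may be
    #    a quantized tensor subclass once 3D quantization + indexing works
    # expert_indices: (num_tokens,) expert chosen by the router per token
    out = x.new_zeros(x.shape[0], w.shape[1])
    for e in range(w.shape[0]):
        mask = expert_indices == e
        if mask.any():
            # w[e] uses the select path added below; each expert sees a
            # different (unbalanced) number of tokens
            out[mask] = F.linear(x[mask], w[e])
    return out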

File tree

6 files changed: +133 -39 lines changed

torchao/dtypes/affine_quantized_tensor_ops.py (+40)

@@ -477,6 +477,46 @@ def _(func, types, args, kwargs):
     )
     return return_and_correct_aliasing(func, args, kwargs, new)
 
+@implements(aten.index.Tensor)
+def _(func, types, args, kwargs):
+    self, indices = args
+    assert len(indices) == 1, f"op {func} currently only implemented for single dimensional indexing but got indices: {indices}"
+
+    new_tensor_impl = aten.index.Tensor(self.tensor_impl, indices)
+    shape = tuple([indices[0].numel(), *self.shape[1:]])
+
+    block_size = self.block_size
+    new = self.__class__(
+        new_tensor_impl,
+        block_size,
+        shape,
+        self.quant_min,
+        self.quant_max,
+        self.zero_point_domain,
+        dtype=self.dtype,
+    )
+    return return_and_correct_aliasing(func, args, kwargs, new)
+
+@implements(aten.select.int)
+def _(func, types, args, kwargs):
+    self, dim, index = fill_defaults(args, 3, [0, 0])
+    assert dim == 0, f"op {func} currently only implemented for dim=0 but got dim={dim}"
+    assert self.dim() == 3, f"op {func} currently only implemented for 3 dimensional tensors but got shape={self.shape}"
+
+    new_tensor_impl = aten.select.int(self.tensor_impl, dim, index)
+
+    shape = self.shape[1:]
+    block_size = self.block_size[1:]
+    new = self.__class__(
+        new_tensor_impl,
+        block_size,
+        shape,
+        self.quant_min,
+        self.quant_max,
+        self.zero_point_domain,
+        dtype=self.dtype,
+    )
+    return return_and_correct_aliasing(func, args, kwargs, new)
 
 # this is needed for DTensor.from_local() and for flattening tensor
 @implements(aten.view.default)
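These two overloads let routing code index a stacked quantized weight of shape (num_experts, n, k) directly. A plain-tensor stand-in showing the indexing semantics the overloads mirror (shapes are illustrative):

import torch

# stand-in for a stacked expert weight: (num_experts, n, k); the new
# overloads give an AffineQuantizedTensor the same indexing semantics
w = torch.randn(8, 256, 128)

# aten.select.int with dim=0: the leading expert dim is dropped from the
# shape (and, for the subclass, from block_size)
w0 = w[0]
assert w0.shape == (256, 128)

# aten.index.Tensor with a single index tensor: the leading dim becomes
# indices.numel(), e.g. the two experts the router picked for a token
picked = torch.tensor([3, 1])
w_pair = w[picked]
assert w_pair.shape == (2, 256, 128)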

torchao/dtypes/uintx/plain_layout.py (+11)

@@ -154,6 +154,17 @@ def __torch_dispatch__(cls, func, types, args, kwargs):
             )
             return return_and_correct_aliasing(func, args, kwargs, new)
 
+
+        elif func in [aten.select.int, aten.index.Tensor]:
+            return return_and_correct_aliasing(
+                func,
+                args,
+                kwargs,
+                args[0]._apply_fn_to_data(
+                    lambda x: func(x, *args[1:], **kwargs)
+                ),
+            )
+
         elif func is aten.slice.Tensor:
             self, dim, start, end, step = fill_defaults(args, 5, [0, None, None, 1])
             if dim == 0:
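For the plain layout, int_data, scale, and zero_point all carry the expert dim in front, so select and index can be forwarded wholesale to each constituent tensor via _apply_fn_to_data. A toy stand-in for that pattern (PlainData is illustrative, not torchao code):

import torch

class PlainData:
    # toy stand-in for a plain layout impl holding int_data/scale/zero_point
    def __init__(self, int_data, scale, zero_point):
        self.int_data, self.scale, self.zero_point = int_data, scale, zero_point

    def _apply_fn_to_data(self, fn):
        # rebuild the impl with fn applied to each constituent tensor
        return PlainData(fn(self.int_data), fn(self.scale), fn(self.zero_point))

impl = PlainData(
    torch.zeros(8, 64, 32, dtype=torch.int8),  # (experts, n, k) int weights
    torch.ones(8, 64, 1),                      # per-row scales
    torch.zeros(8, 64, 1),                     # per-row zero points
)
# select expert 3 on all three tensors at once, as the new branch does
sub = impl._apply_fn_to_data(lambda x: torch.ops.aten.select.int(x, 0, 3))
assert sub.int_data.shape == (64, 32)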

torchao/dtypes/uintx/tensor_core_tiled_layout.py (+71 -31)

@@ -75,7 +75,6 @@ def _linear_bf16_act_uint4_weight_impl(input_tensor, weight_tensor, bias):
         f"need input_tensor shape: {input_tensor.shape} final"
         f"dim to match weight_tensor shape: {weight_tensor.shape} second dim "
     )
-
     # TODO: check groupsize quantization
     # avoid circular dep, TODO: move this to a common util.py
     act_mat = input_tensor
@@ -97,7 +96,6 @@ def _linear_bf16_act_uint4_weight_impl(input_tensor, weight_tensor, bias):
     y = torch.ops.aten._weight_int4pack_mm(
         act_mat.contiguous(), packed_weight, groupsize, scale_and_zero
     )
-
     # remove out_feature padding
     orig_out_features = weight_tensor.shape[-2]
     y = y[:, :orig_out_features]
@@ -119,7 +117,7 @@ class TensorCoreTiledLayout(Layout):
     inner_k_tiles: int = 8
 
     def pre_process(self, input: torch.Tensor) -> torch.Tensor:
-        orig_out_features, orig_in_features = input.shape
+        orig_out_features, orig_in_features = input.shape[-2:]
         in_features = find_multiple(orig_in_features, 1024)
         out_features = find_multiple(orig_out_features, 8)
         input = torch.nn.functional.pad(
@@ -160,7 +158,7 @@ def post_process(
         zero_point: torch.Tensor,
         block_size: Tuple[int, ...],
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        orig_out_features, orig_in_features = input.shape
+        orig_out_features, orig_in_features = input.shape[-2:]
         in_features = find_multiple(orig_in_features, 1024)
         out_features = find_multiple(orig_out_features, 8)
         input = torch.nn.functional.pad(
@@ -272,14 +270,28 @@ def from_plain(
         assert (
             int_data.dtype == torch.int32
         ), "torch.ops.aten._convert_weight_to_int4pack in torch 2.4 expects `int32` dtype"
-        packed_weight = torch.ops.aten._convert_weight_to_int4pack(
-            int_data, _layout.inner_k_tiles
-        )
-        scale = scale.reshape(int_data.shape[0], -1)
-        zero_point = zero_point.reshape(int_data.shape[0], -1)
+        def quant_2d(int_data):
+            return torch.ops.aten._convert_weight_to_int4pack(
+                int_data, _layout.inner_k_tiles
+            )
+        if int_data.shape[1] == 14336:
+            import fbvscode; fbvscode.set_trace()
+        if int_data.dim() == 3:  # for moe quant
+            num_experts = int_data.shape[0]
+            packed_weight_list = []
+            for expert in range(num_experts):
+                packed_weight_list.append(quant_2d(int_data[expert]).unsqueeze(0))
+            packed_weight = torch.cat(packed_weight_list, dim=0)
+            scale = scale.reshape(int_data.shape[0], int_data.shape[-2], -1)
+            zero_point = zero_point.reshape(int_data.shape[0], int_data.shape[-2], -1)
+        else:
+            packed_weight = quant_2d(int_data)
+            scale = scale.reshape(int_data.shape[0], -1)
+            zero_point = zero_point.reshape(int_data.shape[0], -1)
         from torchao.quantization.utils import pack_tinygemm_scales_and_zeros
 
         scale_and_zero = pack_tinygemm_scales_and_zeros(scale, zero_point, scale.dtype)
+        import fbvscode; fbvscode.set_trace()
         return cls(packed_weight, scale_and_zero, False, _layout)
 
     def to(self, *args, **kwargs):
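The 3D branch in from_plain loops over experts because torch.ops.aten._convert_weight_to_int4pack packs a single 2D weight at a time; each expert is packed separately and the results are stacked along a new leading dim. The same logic in isolation (pack_experts_int4 is a hypothetical helper mirroring the loop above, with int_data dtype as required by the assert above):

import torch

def pack_experts_int4(int_data, inner_k_tiles):
    # int_data: (num_experts, n, k) integer weight tensor
    assert int_data.dim() == 3
    packed = [
        torch.ops.aten._convert_weight_to_int4pack(
            int_data[e], inner_k_tiles
        ).unsqueeze(0)
        for e in range(int_data.shape[0])
    ]
    # (num_experts, *packed_2d_shape)
    return torch.cat(packed, dim=0)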
@@ -336,6 +348,18 @@ def __torch_dispatch__(cls, func, types, args, kwargs):
                 f"Not supported args for copy_ due to metadata mistach: {args[0], args[1]}"
             )
 
+        if func in [aten.select.int, aten.index.Tensor]:
+            assert not (func is aten.select.int and args[1] != 0), "aten.select.int currently only has support for dim=0"
+            return return_and_correct_aliasing(
+                func,
+                args,
+                kwargs,
+                args[0]._apply_fn_to_data(
+                    lambda x: func(x, *args[1:], **kwargs)
+                ),
+            )
+
+
         if func is aten.t.default:
             """we don't need to repack the weight and just rely on external
             shape being changed and record the status of transpose/no-transpose
@@ -399,29 +423,45 @@ def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         )
         from torchao.quantization.utils import unpack_tinygemm_scales_and_zeros
 
-        scale, zero = unpack_tinygemm_scales_and_zeros(self.scale_and_zero)
-
+        def dequant_4d(self):
+            cur_shape = self.shape
+            assert len(cur_shape) == 4
+            inner_k_tiles = cur_shape[-1] * 2
+            original_shape = (cur_shape[0] * 8, cur_shape[1] * (inner_k_tiles * 16))
+            eye_shape = original_shape[1]
+            groupsize = int(original_shape[1] / scale.shape[-2])
+            block_size = (1, groupsize)
+            device = self.device
+            original_dtype = torch.bfloat16
+            target_dtype = torch.int32
+            quant_min = 0
+            quant_max = 15
+            zero_point_domain = ZeroPointDomain.FLOAT
+            assert len(block_size) == 2 and block_size[0] == 1
+            dequantized = torch.ops.aten._weight_int4pack_mm(
+                torch.eye(eye_shape, device=device, dtype=original_dtype),
+                self.packed_weight,
+                groupsize,
+                self.scale_and_zero,
+            )
+            dequantized = dequantized.t().contiguous()
+            return dequantized
+
         cur_shape = self.shape
-        assert len(cur_shape) == 4
-        inner_k_tiles = cur_shape[-1] * 2
-        original_shape = (cur_shape[0] * 8, cur_shape[1] * (inner_k_tiles * 16))
-        eye_shape = original_shape[1]
-        groupsize = int(original_shape[1] / scale.shape[-2])
-        block_size = (1, groupsize)
-        device = self.device
-        original_dtype = torch.bfloat16
-        target_dtype = torch.int32
-        quant_min = 0
-        quant_max = 15
-        zero_point_domain = ZeroPointDomain.FLOAT
-        assert len(block_size) == 2 and block_size[0] == 1
-        dequantized = torch.ops.aten._weight_int4pack_mm(
-            torch.eye(eye_shape, device=device, dtype=original_dtype),
-            self.packed_weight,
-            groupsize,
-            self.scale_and_zero,
-        )
-        dequantized = dequantized.t().contiguous()
+
+        if len(cur_shape) == 4:
+            dequantized = dequant_4d(self)
+        else:
+
+            assert len(cur_shape) == 5
+            num_experts = cur_shape[0]
+            dequantized_list = []
+            import fbvscode; fbvscode.set_trace()
+            for expert in range(num_experts):
+                dequantized_list.append(dequant_4d(self[expert]).unsqueeze(0))
+            dequantized = torch.cat(dequantized_list, dim=0)
+
+        scale, zero = unpack_tinygemm_scales_and_zeros(self.scale_and_zero)
         # TODO: move this to `unpack_tinygemm_scales_and_zeros`?
         scale = scale.reshape(scale.shape[:-1]).contiguous()
         zero = zero.reshape(zero.shape[:-1]).contiguous()
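dequant_4d recovers the plain weight without a dedicated unpacking kernel: since _weight_int4pack_mm computes act @ W.t() against the packed weight, feeding a (k, k) identity matrix as the activation returns W.t(), and a transpose yields the dequantized (n, k) weight. The trick in isolation, as a sketch (assumes a torch build where the int4 tinygemm op is available):

import torch

def dequant_via_eye(packed_weight, scale_and_zero, k, groupsize):
    # identity activation: I @ W.t() == W.t(), dequantized by the mm kernel
    eye = torch.eye(k, device=packed_weight.device, dtype=torch.bfloat16)
    w_t = torch.ops.aten._weight_int4pack_mm(
        eye, packed_weight, groupsize, scale_and_zero
    )
    return w_t.t().contiguous()  # dequantized (n, k) bfloat16 weight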

torchao/quantization/quant_api.py (+1 -1)

@@ -300,7 +300,7 @@ def _replace_with_custom_fn_if_matches_filter(
                 device,
                 extra_args,
             )
-            if new_child is not child:
+            if new_child is not child and new_child is not None:
                 setattr(model, name, new_child)
         if device is not None:
             model.to(device=device)  # move parent module to device
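With the added None check, a replacement_fn can now return None to mean "leave this child as it is" (for example after adjusting it in place), instead of having None assigned over the module. A hypothetical replacement_fn relying on that behavior:

from typing import Optional
import torch.nn as nn

def replacement_fn(module: nn.Module) -> Optional[nn.Module]:
    # hypothetical: modify qualifying modules in place and return None;
    # the new guard then skips setattr and keeps the original child mounted
    if isinstance(module, nn.Linear):
        module.weight.requires_grad_(False)
        return None
    return module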

torchao/quantization/utils.py (+7 -6)

@@ -366,22 +366,23 @@ def get_groupwise_affine_qparams(
 def pack_tinygemm_scales_and_zeros(scales, zeros, dtype=torch.bfloat16):
     guard_dtype_size(scales, "scales", dtype=dtype, size=zeros.size())
     guard_dtype_size(zeros, "zeros", dtype=dtype)
+    dim = scales.dim()
     return (
         torch.cat(
             [
-                scales.reshape(scales.size(0), scales.size(1), 1),
-                zeros.reshape(zeros.size(0), zeros.size(1), 1),
+                scales.unsqueeze(-1),
+                zeros.unsqueeze(-1),
             ],
-            2,
+            dim,
         )
-        .transpose(0, 1)
+        .transpose(-3, -2)
         .contiguous()
     )
 
 
 def unpack_tinygemm_scales_and_zeros(scales_and_zeros):
-    assert len(scales_and_zeros.shape) == 3 and scales_and_zeros.shape[2] == 2
-    return torch.split(scales_and_zeros.transpose(0, 1), 1, 2)
+    assert scales_and_zeros.shape[-1] == 2
+    return torch.split(scales_and_zeros.transpose(-3, -2), 1, -1)
 
 
 def convert_weight_to_int4pack_xpu(weight, zero_point_domain_is_int=False):
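With this patch applied, a quick shape check shows why the unsqueeze/negative-dim rewrite preserves the original 2D behavior while extending to stacked 3D expert scales (sizes are illustrative):

import torch
from torchao.quantization.utils import (
    pack_tinygemm_scales_and_zeros,
    unpack_tinygemm_scales_and_zeros,
)

# 2D, one linear: (n, n_groups) -> (n_groups, n, 2), unchanged behavior
s2 = torch.ones(64, 8, dtype=torch.bfloat16)
assert pack_tinygemm_scales_and_zeros(s2, s2).shape == (8, 64, 2)

# 3D, stacked experts: (E, n, n_groups) -> (E, n_groups, n, 2)
s3 = torch.ones(4, 64, 8, dtype=torch.bfloat16)
packed = pack_tinygemm_scales_and_zeros(s3, s3)
assert packed.shape == (4, 8, 64, 2)

# unpack reverses the transpose and splits the trailing pair
scale, zero = unpack_tinygemm_scales_and_zeros(packed)
assert scale.shape == (4, 64, 8, 1)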

torchao/utils.py (+3 -1)

@@ -422,7 +422,8 @@ class MyTensor(torch.Tensor):
         return cls._ATEN_OP_OR_TORCH_FN_TABLE[func](func, types, args, kwargs)
 
     with torch._C.DisableTorchFunctionSubclass():
-        return func(*args, **kwargs)
+        out = func(*args, **kwargs)
+        return out
 
 
 def _dispatch__torch_dispatch__(cls, func, types, args, kwargs):
@@ -441,6 +442,7 @@ class MyTensor(torch.Tensor):
 
     arg_types = tuple(type(arg) for arg in args)
    kwarg_types = {k: type(arg) for k, arg in kwargs.items()}
+    # import fbvscode; fbvscode.set_trace()
     raise NotImplementedError(
         f"{cls.__name__} dispatch: attempting to run unimplemented operator/function: {func=}, {types=}, {arg_types=}, {kwarg_types=}"
     )
