
Commit ea4d843

refine inference backend/code step 1 (#486)
1 parent 709b6d3 commit ea4d843


63 files changed: +4227 -4961 lines changed

auto_round/__init__.py

+1 -1

@@ -15,5 +15,5 @@
 from .mllm import AutoRoundMLLM
 from auto_round.utils import LazyImport

-from .auto_quantizer import AutoHfQuantizer,AutoRoundConfig
+from auto_round.inference.auto_quantizer import AutoHfQuantizer,AutoRoundConfig
 from .version import __version__
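Note: this is only an internal relocation of the module; the package root still re-exports both symbols, so downstream imports such as the line below keep working (assuming the package is installed):

from auto_round import AutoRoundConfig, AutoHfQuantizer  # still re-exported from auto_round/__init__.py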

auto_round/auto_quantizer.py

-837
This file was deleted.

auto_round/autoround.py

+9 -9

@@ -53,7 +53,8 @@
 compile_func,
 find_matching_blocks, is_debug_mode,
 TORCH_VERSION_AT_LEAST_2_6,
-supported_layer_types
+supported_layer_types,
+get_layer_features,
 )
 from .low_cpu_mem.utils import get_layers_before_block

@@ -448,7 +449,7 @@ def quantize_and_save(self, output_dir: str = "tmp_autoround", format: str = "au
 f"Currently only support to export auto_round format quantized model"
 " with fp8 dtype activation for activation quantization."
 " Change format to fake and save."
-)
+)
 formats = ["fake"]
 else:
 if len(formats) > 1 or "auto_round" not in formats:

@@ -478,11 +479,6 @@ def quantize_and_save(self, output_dir: str = "tmp_autoround", format: str = "au
 format = format.replace('auto_round', 'auto_round:gptq')
 formats[index] = format

-if not any(f in format for f in ["triton", "exllamav2", "awq", "gptq"]):
-logger.info(f"AutoRound format does not support {format}, attempting to use AutoGPTQ")
-format = format.replace("auto_round", "auto_gptq")
-formats[index] = format
-
 # Remove duplicates from formats list
 def remove_duplicates(lst):
 seen = set()

@@ -693,6 +689,10 @@ def set_layerwise_config(self, layer_config):
 if n not in layers_in_blocks and check_to_quantized(layer_config[n]):
 has_qlayer_outside_block = True

+in_features, out_features = get_layer_features(m)
+if in_features <= layer_config[n]["group_size"]:
+layer_config[n]["group_size"] = -1
+
 # Apply the configuration to the corresponding layer in the model
 for key in keys:
 setattr(m, key, layer_config[n][key])

@@ -1478,7 +1478,7 @@ def quant_blocks(
 m.name = n

 for i in range(0, len(block_names), nblocks):
-if i!=0:
+if i != 0:
 pbar.update(1)
 if nblocks == 1:
 n = block_names[i]

@@ -1542,7 +1542,7 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k
 f"Currently only support to export auto_round format quantized model"
 " with fp8 dtype activation for activation quantization."
 " Change format to fake and save."
-)
+)
 format = "fake"
 else:
 if format != "auto_round":
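Reviewer note on the new group_size clamp in set_layerwise_config: when a layer's input dimension is no larger than the configured group size, a single group already spans the whole row, so switching that layer to group_size = -1 (one group over the full input dimension) describes the same grouping and avoids a partial group. A minimal sketch of the idea, with a hypothetical helper name:

def clamp_group_size(in_features: int, group_size: int) -> int:
    # Hypothetical illustration, not the repository code: if one group would
    # already cover the entire input dimension, -1 ("whole row as one group")
    # is an equivalent configuration.
    if group_size != -1 and in_features <= group_size:
        return -1
    return group_size

print(clamp_group_size(in_features=64, group_size=128))   # -1
print(clamp_group_size(in_features=4096, group_size=128)) # 128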

auto_round/eval/evaluation.py

+2 -6

@@ -12,12 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import logging
-import random
-import time
-from typing import TYPE_CHECKING, List, Optional, Union
+from typing import Optional, Union

-import lm_eval
 from lm_eval import simple_evaluate as lm_simple_evaluate
 import os

@@ -52,7 +48,7 @@ def simple_evaluate(
 try:
 from auto_round import AutoRoundConfig
 except:
-from auto_round.auto_quantizer import AutoHfQuantizer
+from auto_round.inference.auto_quantizer import AutoHfQuantizer

 return lm_simple_evaluate(
 model=model,

auto_round/export/export_to_autogptq/export.py

+3 -1

@@ -105,7 +105,7 @@ def pack_layer(name, model, backend):
 ##force to float32 to be compatible with torch 2.0
 if sym and isinstance(new_layer, auto_round.export.export_to_autogptq.qlinear_triton.QuantLinear):
 layer, scale = layer.to("cpu"), scale.to("cpu")
-zero = 2 ** (bits - 1)
+zero = int(zero.flatten()[0])
 else:
 layer, scale, zero = layer.to("cpu"), scale.to("cpu"), zero.to("cpu").to(torch.float32)
 sig = inspect.signature(qlayer.pack)

@@ -126,6 +126,8 @@ def save_quantized_as_autogptq(output_dir, inplace=True, backend="auto_gptq:exll
 quant_block_list = kwargs.get("quant_block_list", get_block_names(model))
 tokenizer = kwargs.get("tokenizer", None)
 processor = kwargs.get("processor", None)
+if os.path.exists(output_dir):
+logger.warning(f"{output_dir} already exists, this may cause model conflict")
 if tokenizer is not None:
 tokenizer.save_pretrained(output_dir)
 if processor is not None:
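Reviewer note (my reading, not part of the diff): under symmetric quantization every group shares one zero point, conventionally 2 ** (bits - 1) (8 for 4-bit), which is why the old code could hard-code it and the updated code can safely collapse the zero tensor to a single scalar. A tiny sketch of that assumption:

import torch

bits = 4
# Symmetric quantization: all groups share the same zero point, typically 2 ** (bits - 1).
zero = torch.full((16, 8), 2 ** (bits - 1))   # toy zero-point tensor, shape (out_features, n_groups)
zero_scalar = int(zero.flatten()[0])          # what the updated pack path reads
assert zero_scalar == 2 ** (bits - 1) == 8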

auto_round/export/export_to_autogptq/qlinear_triton.py

+7 -27

@@ -18,30 +18,6 @@
 import torch
 import torch.nn as nn
 import transformers
-import numba
-
-
-##TODO different bits
-# @numba.jit(nopython=True, parallel=True)
-# def pack_array_with_numba_b4_c32(
-#     raw_array: np.ndarray, packed_array: np.ndarray
-# ) -> np.ndarray:
-#     """Pack the array with numba when bits=4 and compress_bits=32."""
-#     bits = 4
-#     n_pack = 32 // bits
-#
-#     for row in range(packed_array.shape[0]):
-#         packed_array[row] = ((((raw_array[row * n_pack + 7]) << 28)
-#             | ((raw_array[row * n_pack + 6]) << 24)
-#             | ((raw_array[row * n_pack + 5]) << 20)
-#             | ((raw_array[row * n_pack + 4]) << 16)
-#             | ((raw_array[row * n_pack + 3]) << 12)
-#             | (raw_array[row * n_pack + 2]) << 8)
-#             | ((raw_array[row * n_pack + 1]) << 4)
-#             | ((raw_array[row * n_pack]) << 0))
-#
-#     return packed_array
-

 class TritonModuleMixin:
 @classmethod

@@ -89,6 +65,7 @@ def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=Fa
 "g_idx",
 torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32),
 )
+
 if bias:
 self.register_buffer("bias", torch.zeros((outfeatures), dtype=torch.float16))
 else:

@@ -108,6 +85,8 @@ def pack(self, linear, scales, zeros, g_idx=None):
 device = "cpu"
 if torch.cuda.is_available():
 device = "cuda:0"
+elif torch.xpu.is_available():
+device = "xpu:0"

 W = linear.weight.data.to(device).clone()
 if isinstance(linear, nn.Conv2d):

@@ -118,11 +97,12 @@ def pack(self, linear, scales, zeros, g_idx=None):
 repeat_scales = scales.to(device).repeat_interleave(self.group_size, 1)
 if isinstance(zeros, torch.Tensor):
 repeat_zeros = zeros.to(device).repeat_interleave(self.group_size, 1)
+intweight = torch.round(W.to(device) / repeat_scales[:, :W.shape[1]] + repeat_zeros[:, :W.shape[1]]).to(
+torch.int32)
 else:
 repeat_zeros = zeros
-
-intweight = torch.round(W.to(device) / repeat_scales + repeat_zeros).to(
-torch.int32)
+intweight = torch.round(W.to(device) / repeat_scales[:, :W.shape[1]] + repeat_zeros).to(
+torch.int32)

 del repeat_scales
 intweight = intweight.reshape(-1, intweight.shape[1] // 32 * self.bits, 32 // self.bits)
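On the reshuffled intweight computation: pack() quantizes with q = round(W / scale + zero), after expanding the per-group scales and zero points to per-column tensors via repeat_interleave; when in_features is not a multiple of group_size the expanded tensors come out wider than W, which is what the new [:, :W.shape[1]] slice guards against. A standalone toy version of that step (my sketch, not the repository code):

import torch

out_features, in_features, group_size, bits = 4, 70, 32, 4
n_groups = -(-in_features // group_size)              # ceil division -> 3; last group is partial
W = torch.randn(out_features, in_features)
scales = torch.rand(out_features, n_groups) + 0.1     # toy per-group scales, strictly positive
zeros = torch.full((out_features, n_groups), 2 ** (bits - 1))

# repeat_interleave expands each group to group_size columns (3 * 32 = 96 > 70),
# so slicing back to in_features keeps the shapes aligned with W.
repeat_scales = scales.repeat_interleave(group_size, dim=1)[:, :in_features]
repeat_zeros = zeros.repeat_interleave(group_size, dim=1)[:, :in_features]
intweight = torch.round(W / repeat_scales + repeat_zeros).to(torch.int32)
print(intweight.shape)  # torch.Size([4, 70])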

auto_round/export/export_to_autoround/export.py

+15 -10

@@ -22,6 +22,7 @@
 import transformers

 import auto_round.export.export_to_autoround.qlinear_triton_act
+import auto_round_extension.cuda.qlinear_tritonv2
 from auto_round.utils import get_layer_names_in_block, get_module, logger, set_module, supported_layer_types
 import threadpoolctl as tctl
 import inspect

@@ -71,17 +72,18 @@ def dynamic_import_quant_linear_for_packing(backend, bits, group_size, sym, act_
 if "auto_round" in backend and "awq" not in backend and "gptq" not in backend:
 if act_bits <= 8: ##easily have bug for other configuration, need to refine code later
 return auto_round.export.export_to_autoround.qlinear_triton_act.QuantLinear
-##only support triton and exllamav2
-if not ("triton" in backend or "exllamav2" in backend):
-logger.warning_once(f"auto_round format does not support {backend}, try to pack each layer with auto_gptq")
-return get_autogptq_packing_qlinear(backend, bits, group_size, sym)

-from auto_round_extension.cuda.qlinear_triton import QuantLinear
+from auto_round_extension.cuda.qlinear_tritonv2 import QuantLinear
 return QuantLinear
+elif "auto_round" in backend and "gptq" in backend:
+from auto_round.export.export_to_autoround.qlinear_triton import QuantLinear ##no g_idx
+return QuantLinear
 elif "awq" in backend:
 from ..export_to_awq.utils import WQLinear_GEMM
 return WQLinear_GEMM
-elif "gptq" in backend:
+elif "gptqmodel" in backend:
+return auto_round_extension.cuda.qlinear_tritonv2.QuantLinear
+elif "gptq" in backend and not "gptqmodel" in backend: ## have g_idx
 return get_autogptq_packing_qlinear(backend, bits, group_size, sym)
 else:
 assert False, f"only support auto_gptq, auto_awq and auto_round backend"

@@ -188,6 +190,8 @@ def pack_layer(layer_name, model, backend):
 new_layer.device = device
 set_module(model, layer_name, new_layer)
 qlayer = new_layer
+if sym:
+zp = int(zp.flatten()[0])

 qlayer.to("cpu")
 ##force to float32 to be compatible with torch 2.0

@@ -202,6 +206,9 @@ def pack_layer(layer_name, model, backend):
 scale, zp = scale.to(torch.float32), zp.to(torch.float32)
 scale = scale.t().contiguous()
 zp = zp.t().contiguous()
+if sym:
+zp = int(zp.flatten()[0])
+
 if bits != 4:
 logger.error("AutoAWQ format only supports 4-bits quantization.")
 qlayer = QuantLinear.from_linear(

@@ -243,10 +250,6 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
 if (kwargs.get("sym") is None or kwargs.get("sym") == True) and ("gptq" not in backend and "awq" not in backend):
 backend = backend.replace('auto_round', 'auto_round:gptq')

-if not ("triton" in backend or "exllamav2" in backend or "awq" in backend or "gptq" in backend):
-logger.info(f"AutoRound format does not support {backend}, try to pack each layer with AutoGPTQ")
-backend = backend.replace("auto_round", "auto_gptq")
-
 model = kwargs["model"]
 safe_serialization = True if 'safe_serialization' not in kwargs.keys() else kwargs["safe_serialization"]
 if not inplace:

@@ -306,6 +309,8 @@ def wrapper(name):
 if output_dir is None:
 model.tokenizer = tokenizer
 return model
+if os.path.exists(output_dir):
+logger.warning(f"{output_dir} already exists, this may cause model conflict")
 if tokenizer is not None:
 tokenizer.save_pretrained(output_dir)
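For reviewers, a hedged summary of the revised backend routing in dynamic_import_quant_linear_for_packing (a toy mirror only: plain strings stand in for the QuantLinear classes the real function imports and returns):

def pick_packing_linear(backend: str, act_bits: int = 16) -> str:
    # Mirrors the branch order after this commit; not the repository code.
    if "auto_round" in backend and "awq" not in backend and "gptq" not in backend:
        if act_bits <= 8:
            return "qlinear_triton_act.QuantLinear"
        return "auto_round_extension.cuda.qlinear_tritonv2.QuantLinear"
    elif "auto_round" in backend and "gptq" in backend:
        return "export_to_autoround.qlinear_triton.QuantLinear"    # no g_idx
    elif "awq" in backend:
        return "export_to_awq.utils.WQLinear_GEMM"
    elif "gptqmodel" in backend:
        return "auto_round_extension.cuda.qlinear_tritonv2.QuantLinear"
    elif "gptq" in backend:
        return "auto_gptq packing QuantLinear"                      # has g_idx
    raise ValueError("only auto_gptq, auto_awq and auto_round backends are supported")

print(pick_packing_linear("auto_round:tritonv2"))
print(pick_packing_linear("auto_round:gptq"))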

New file (+137 lines)

@@ -0,0 +1,137 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math

import numpy as np
import torch
import torch.nn as nn
import transformers

class TritonModuleMixin:
    @classmethod
    def warmup(cls, model, transpose=False, seqlen=2048):
        pass


class QuantLinear(nn.Module, TritonModuleMixin):
    QUANT_TYPE = "triton"

    def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, **kwargs):
        super().__init__()
        if bits not in [2, 4, 8]:
            raise NotImplementedError("Only 2,4,8 bits are supported.")
        if infeatures % 32 != 0 or outfeatures % 32 != 0:
            raise NotImplementedError("in_feature and out_feature must be divisible by 32.")
        self.infeatures = infeatures
        self.outfeatures = outfeatures
        self.bits = bits
        self.group_size = group_size if group_size != -1 else infeatures
        self.maxq = 2 ** self.bits - 1

        self.register_buffer(
            "qweight",
            torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32),
        )
        self.register_buffer(
            "qzeros",
            torch.zeros(
                (
                    math.ceil(infeatures / self.group_size),
                    outfeatures // 32 * self.bits,
                ),
                dtype=torch.int32,
            ),
        )
        self.register_buffer(
            "scales",
            torch.zeros(
                (math.ceil(infeatures / self.group_size), outfeatures),
                dtype=torch.float16,
            ),
        )

        if bias:
            self.register_buffer("bias", torch.zeros((outfeatures), dtype=torch.float16))
        else:
            self.bias = None

        self.trainable = trainable

    def post_init(self):
        pass

    def pack(self, linear, scales, zeros, g_idx=None):
        scales_t = scales.t().contiguous()
        if linear.bias is not None:
            self.bias = linear.bias.clone().half()
        self.scales = scales_t.clone().half()
        device = "cpu"
        if torch.cuda.is_available():
            device = "cuda:0"
        elif torch.xpu.is_available():
            device = "xpu:0"

        W = linear.weight.data.to(device).clone()
        if isinstance(linear, nn.Conv2d):
            W = W.flatten(1)
        if isinstance(linear, transformers.pytorch_utils.Conv1D):
            W = W.t()

        repeat_scales = scales.to(device).repeat_interleave(self.group_size, 1)
        if isinstance(zeros, torch.Tensor):
            repeat_zeros = zeros.to(device).repeat_interleave(self.group_size, 1)
            intweight = torch.round(W.to(device) / repeat_scales[:, :W.shape[1]] + repeat_zeros[:, :W.shape[1]]).to(
                torch.int32)
        else:
            repeat_zeros = zeros
            intweight = torch.round(W.to(device) / repeat_scales[:, :W.shape[1]] + repeat_zeros).to(
                torch.int32)

        del repeat_scales
        intweight = intweight.reshape(-1, intweight.shape[1] // 32 * self.bits, 32 // self.bits)
        order_map = torch.arange(0, 32 // self.bits, device=device) * self.bits
        intweight = intweight << order_map
        intweight = torch.sum(intweight, dim=-1)

        intweight = intweight.t().contiguous().to(torch.int32)
        self.qweight = intweight.to("cpu")

        if isinstance(zeros, torch.Tensor):
            zeros = zeros.t().contiguous()
            zeros -= 1
            zeros = zeros.numpy().astype(np.uint32)
            qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
            i = 0
            col = 0
            while col < qzeros.shape[1]:
                for j in range(i, i + (32 // self.bits)):
                    qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
                i += 32 // self.bits
                col += 1

            qzeros = qzeros.astype(np.int32)
            self.qzeros = torch.from_numpy(qzeros)
        else:
            zeros -= 1
            shape = scales_t.shape
            value = 0
            for j in range(0, (32 // self.bits)):
                value |= zeros << (self.bits * j)
            qzeros = np.ones((shape[0], shape[1] // 32 * self.bits), dtype=np.uint32) * value
            qzeros = qzeros.astype(np.int32)
            self.qzeros = torch.from_numpy(qzeros)


__all__ = ["QuantLinear"]
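The pack() method above stores 32 // bits quantized values per int32 by shifting each value into its slot and summing. A small self-contained illustration of that shift-and-sum idea for 4-bit values (toy code of my own, using int64 arithmetic so the example does not have to deal with int32 wrap-around):

import torch

bits = 4
vals = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.int32)  # 32 // 4 = 8 values per int32
order_map = torch.arange(0, 32 // bits) * bits                     # shift amounts 0, 4, ..., 28
packed = int((vals << order_map).sum())                            # value i lands in bit positions [4*i, 4*i + 4)
print(hex(packed))                                                 # 0x87654321
unpacked = [(packed >> s) & 0xF for s in range(0, 32, bits)]       # round-trip check
print(unpacked)                                                    # [1, 2, 3, 4, 5, 6, 7, 8]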
