intel · SinpackKonmakan · Apr 22, 2025 · Apr 22, 2025 · Apr 22, 2025 · Apr 22, 2025
diff --git a/auto_round/autoround.py b/auto_round/autoround.py
@@ -1351,11 +1351,13 @@ def quant_block(self, block, input_ids, input_others, q_input=None, device=torch
         for n, m in block.named_modules():
             if hasattr(m, "orig_layer"):
                 for key in m.params.keys():
+                    # breakpoint()
                     if "min" in key or "max" in key:
                         minmax_params.append(m.params[key])
                     else:
                         round_params.append(m.params[key])
 
+
         if self.enable_minmax_tuning:
             optimizer = self.optimizer(
                 [{"params": round_params}, {"params": minmax_params, "lr": self.minmax_lr}], lr=self.lr, weight_decay=0

diff --git a/auto_round/data_type/int.py b/auto_round/data_type/int.py
@@ -64,17 +64,17 @@ def quant_tensor_sym(tensor, bits=4, group_size=-1, v=0, min_scale=1.0, max_scal
 
 
 ## the values should be positive
-def double_quant_tensor(tensor, bits, q_scale_thresh):
+def double_quant_tensor(tensor, bits, q_scale_thresh, coeef=1.0):  # default 1.0 leaves scale unchanged, so 3-arg callers still work
     maxq = 2 ** bits - 1
     wmax = torch.clamp(tensor.max(-1)[0], min=0)
-    scale = torch.clamp(wmax / maxq, q_scale_thresh)
+    scale = torch.clamp(wmax / maxq, q_scale_thresh) * coeef  # coeef is applied after the clamp
     scale = scale.view(-1, 1)
     qdq_tensor = torch.clamp(round_ste(tensor / scale), max=maxq) * scale
     return qdq_tensor, scale
 
 
 @register_dtype("int_asym_dq")
-def quant_tensor_asym_dq(tensor, bits=4, group_size=-1, v=0, min_scale=1.0, max_scale=1.0, scale_dtype=torch.float16,
+def quant_tensor_asym_dq(tensor, bits=4, group_size=-1, v=0, min_scale=1.0, max_scale=1.0, k_wm=1.0, k_scale=1.0, scale_dtype=torch.float16,
                          tensor_min=None, tensor_max=None, q_scale_thresh=1e-5, super_group_size=8, super_bits=6,
                          **kwargs):
     """Quantize and de-quantize tensor asymmetrically.
@@ -104,8 +104,8 @@ def quant_tensor_asym_dq(tensor, bits=4, group_size=-1, v=0, min_scale=1.0, max_
         wmin_tmp = tensor_min
         wmax_tmp = tensor_max
     if isinstance(min_scale, torch.Tensor):
-        wmin = wmin_tmp * min_scale
-        wmax = wmax_tmp * max_scale
+        wmin = wmin_tmp * min_scale #* k_wm
+        wmax = wmax_tmp * max_scale #* k_scale
     else:
         wmin = wmin_tmp
         wmax = wmax_tmp
@@ -114,20 +114,19 @@ def quant_tensor_asym_dq(tensor, bits=4, group_size=-1, v=0, min_scale=1.0, max_
     scale = scale.view(-1, super_group_size)
     wmin_m = -wmin  # pylint: disable=E1130
     wmin_m = wmin_m.view(-1, super_group_size)
-
     ##conduct double quant
-    scale, d_scale = double_quant_tensor(scale, super_bits, q_scale_thresh)
-    wmin_m, d_wmin_m = double_quant_tensor(wmin_m, super_bits, q_scale_thresh)
+    scale, d_scale = double_quant_tensor(scale, super_bits, q_scale_thresh, k_scale)
+    wmin_m, d_wmin_m = double_quant_tensor(wmin_m, super_bits, q_scale_thresh, k_wm)
 
     scale = scale.view(-1, 1)
     scale = torch.clamp(scale, q_scale_thresh)
     wmin_m = wmin_m.view(-1, 1)
 
-    int_w = round_ste(tensor / scale + v)
-    q = torch.clamp(int_w + round_ste(wmin_m / scale), 0, maxq)
+    int_w = round_ste((tensor + wmin_m) / scale + v)
+    q = torch.clamp(int_w, 0, maxq)
     qdq_result = (scale * q - wmin_m).to(tensor.dtype)
     qdq_result = revert_tensor_by_pad(qdq_result, orig_shape=orig_shape, pad_len=pad_len)
-    zp = round_ste(wmin_m / scale)  # remove this later
+    #zp = round_ste(wmin_m / scale)  # remove this later
     return qdq_result, {"scale": scale, "d_scale": d_scale}, {"wmin_m": wmin_m, "d_wmin_m": d_wmin_m}