Commit d6d6dad

fix triton multiple gpus and some other issues (#539)
1 parent 88e6e3b commit d6d6dad

File tree

5 files changed: +55, -52 lines


auto_round/inference/auto_quantizer.py (+2)

@@ -281,6 +281,8 @@ def from_dict(cls, config_dict, return_unused_kwargs=False, **kwargs):

         if "auto-round" not in quant_method:
             config_dict["packing_format"] = f"auto_round:{quant_method}"
+
+
         return super().from_dict(config_dict, return_unused_kwargs=return_unused_kwargs, **kwargs)
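For context on the hunk above: a checkpoint whose quant_method is not auto-round (e.g. plain GPTQ or AWQ) gets its method folded into packing_format as auto_round:<method> before the parent from_dict runs. A minimal standalone sketch of that behavior, using a plain dict as a stand-in for the real config object (the key names come straight from the diff; the helper name is invented here):

```python
# Sketch only: a plain dict stands in for the quantization config that the
# from_dict override above receives.
def tag_packing_format(config_dict: dict) -> dict:
    quant_method = config_dict.get("quant_method", "")
    if "auto-round" not in quant_method:
        # e.g. quant_method "gptq" -> packing_format "auto_round:gptq"
        config_dict["packing_format"] = f"auto_round:{quant_method}"
    return config_dict


print(tag_packing_format({"quant_method": "gptq"}))
# {'quant_method': 'gptq', 'packing_format': 'auto_round:gptq'}
```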

auto_round/inference/convert_model.py (+6, -5)

@@ -383,11 +383,12 @@ def _import_exllamav2_kernels():
     """Attempts to import ExLlamaV2 kernels for performance optimization."""
     try:
         from exllamav2_kernels import gemm_half_q_half, make_q_matrix  # pylint: disable=E0611, E0401
-    except ImportError:
-        raise ImportError(
+    except:
+        logger.warning_once(
             "AutoGPTQ ExLlamaV2 has not been installed, Please install it using the following command: "
             "`pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@b8b4127`"
         )
+        logger.warning_once("try to fallback to other autogptq backends for now")


 def _create_quant_layer(layer, layer_backend, config, in_features, out_features):
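The hunk above turns a missing or broken ExLlamaV2 kernel build from a hard ImportError into a one-time warning, so conversion can continue on another AutoGPTQ backend. The bare except in the diff also covers failures other than ImportError (ABI mismatch, missing CUDA libraries). A minimal sketch of that soft-import pattern, with the standard logging module standing in for auto_round's logger.warning_once:

```python
import logging

logger = logging.getLogger("auto_round.sketch")  # stand-in, not the real module logger


def exllamav2_kernels_available() -> bool:
    """Return True only if the optional ExLlamaV2 kernels import cleanly."""
    try:
        from exllamav2_kernels import gemm_half_q_half, make_q_matrix  # noqa: F401
        return True
    except Exception:
        # Warn and fall back instead of aborting model conversion.
        logger.warning(
            "AutoGPTQ ExLlamaV2 kernels unavailable; falling back to other autogptq backends."
        )
        return False
```
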
@@ -520,19 +521,19 @@ def convert_hf_model(model: nn.Module, target_device="cpu"):
     else:
         backend = "auto"

-
     ##target_backend could be None
     _, backend = parse_target_device_and_backend(backend)

-    if hasattr(quantization_config, "packing_format"):  # pragma: no cover
+    if hasattr(quantization_config,
+               "packing_format") and "auto-round" in quantization_config.quant_method:  # pragma: no cover
         packing_format = quantization_config.packing_format
     elif 'gptq' in quantization_config.quant_method:  # pragma: no cover
         packing_format = "auto_gptq"
     elif "awq" in quantization_config.quant_method:
         packing_format = "auto_awq"
     else:  # pragma: no cover
         packing_format = "auto_gptq"
-        logger.warning("Quantization backend must be specified. Set it to 'auto_gptq' by default.")
+        logger.warning("quantization backend must be specified. Set it to 'auto_gptq' by default.")
     if packing_format == "auto":
         packing_format = "auto_gptq"
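The hunk above also tightens packing-format selection: a packing_format field on the config is only trusted when quant_method contains auto-round; otherwise the format is derived from quant_method, with auto_gptq as the fallback. A standalone sketch of that resolution order, using SimpleNamespace as a stand-in for the Hugging Face quantization config object:

```python
from types import SimpleNamespace


def resolve_packing_format(quantization_config) -> str:
    # Only auto-round configs carry a packing_format worth trusting directly.
    if hasattr(quantization_config, "packing_format") and \
            "auto-round" in quantization_config.quant_method:
        packing_format = quantization_config.packing_format
    elif "gptq" in quantization_config.quant_method:
        packing_format = "auto_gptq"
    elif "awq" in quantization_config.quant_method:
        packing_format = "auto_awq"
    else:
        packing_format = "auto_gptq"  # default when nothing matches
    return "auto_gptq" if packing_format == "auto" else packing_format


print(resolve_packing_format(SimpleNamespace(quant_method="awq")))  # auto_awq
print(resolve_packing_format(
    SimpleNamespace(quant_method="auto-round", packing_format="auto_round:gptq")))  # auto_round:gptq
```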

auto_round/version.py (+1, -1)

@@ -14,4 +14,4 @@
 """Intel® auto-round: An open-source Python library
 supporting popular model weight only compression based on signround."""

-__version__ = "0.5.0"
+__version__ = "0.5.1"

auto_round_extension/cuda/triton_utils/dequant.py (+23, -23)

@@ -123,29 +123,29 @@ def dequant248(qweight, scales, qzeros, g_idx, bits, maxq=None, input_dtype=torc
     """
     Launcher for triton dequant kernel. Only valid for bits = 2, 4, 8
     """
-
-    num_groups = scales.shape[0]
-    outfeatures = scales.shape[1]
-    infeatures = g_idx.shape[0]
-
-    out = torch.empty((infeatures, outfeatures), device="cuda", dtype=input_dtype)
-    numels = out.numel()
-    maxq = 2 ** bits - 1 if maxq is None else maxq
-    grid = lambda meta: (triton.cdiv(numels, meta["X_BLOCK"]),)  # noqa: E731
-
-    dequant_kernel_248[grid](
-        g_idx,
-        scales,
-        qweight,
-        qzeros,
-        out,
-        numels,
-        maxq=maxq,
-        bits=bits,
-        outfeatures=outfeatures,
-        num_groups=num_groups,
-    )
-    return out
+    with torch.cuda.device(qweight.device):
+        num_groups = scales.shape[0]
+        outfeatures = scales.shape[1]
+        infeatures = g_idx.shape[0]
+
+        out = torch.empty((infeatures, outfeatures), device=qweight.device, dtype=input_dtype)
+        numels = out.numel()
+        maxq = 2 ** bits - 1 if maxq is None else maxq
+        grid = lambda meta: (triton.cdiv(numels, meta["X_BLOCK"]),)  # noqa: E731
+
+        dequant_kernel_248[grid](
+            g_idx,
+            scales,
+            qweight,
+            qzeros,
+            out,
+            numels,
+            maxq=maxq,
+            bits=bits,
+            outfeatures=outfeatures,
+            num_groups=num_groups,
+        )
+        return out


 def quant_matmul_248(

auto_round_extension/cuda/triton_utils_zp/dequant.py (+23, -23)

@@ -123,29 +123,29 @@ def dequant248(qweight, scales, qzeros, g_idx, bits, maxq=None, input_dtype=torc
     """
     Launcher for triton dequant kernel. Only valid for bits = 2, 4, 8
     """
-
-    num_groups = scales.shape[0]
-    outfeatures = scales.shape[1]
-    infeatures = g_idx.shape[0]
-
-    out = torch.empty((infeatures, outfeatures), device="cuda", dtype=input_dtype)
-    numels = out.numel()
-    maxq = 2 ** bits - 1 if maxq is None else maxq
-    grid = lambda meta: (triton.cdiv(numels, meta["X_BLOCK"]),)  # noqa: E731
-
-    dequant_kernel_248[grid](
-        g_idx,
-        scales,
-        qweight,
-        qzeros,
-        out,
-        numels,
-        maxq=maxq,
-        bits=bits,
-        outfeatures=outfeatures,
-        num_groups=num_groups,
-    )
-    return out
+    with torch.cuda.device(qweight.device):
+        num_groups = scales.shape[0]
+        outfeatures = scales.shape[1]
+        infeatures = g_idx.shape[0]
+
+        out = torch.empty((infeatures, outfeatures), device=qweight.device, dtype=input_dtype)
+        numels = out.numel()
+        maxq = 2 ** bits - 1 if maxq is None else maxq
+        grid = lambda meta: (triton.cdiv(numels, meta["X_BLOCK"]),)  # noqa: E731
+
+        dequant_kernel_248[grid](
+            g_idx,
+            scales,
+            qweight,
+            qzeros,
+            out,
+            numels,
+            maxq=maxq,
+            bits=bits,
+            outfeatures=outfeatures,
+            num_groups=num_groups,
+        )
+        return out


 def quant_matmul_248(
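Both Triton dequant launchers (the zero-point and non-zero-point variants) receive the same fix: previously the output buffer was always allocated on the hard-coded "cuda" device (cuda:0) and the kernel launched in whatever CUDA device context was current, which can fail or misplace the output when a quantized layer is sharded onto another GPU, e.g. with device_map="auto". Entering torch.cuda.device(qweight.device) and allocating on qweight.device keeps the allocation and the kernel launch on the GPU that holds the weight. A minimal sketch of the pattern, with the Triton launch elided and the helper name invented for illustration:

```python
import torch


def dequant_on_weight_device(qweight: torch.Tensor, out_shape, dtype=torch.float16):
    # Activate the GPU that actually holds the quantized weight so both the
    # allocation below and (in the real launcher) the Triton kernel launch
    # target that device instead of the default cuda:0.
    with torch.cuda.device(qweight.device):
        out = torch.empty(out_shape, device=qweight.device, dtype=dtype)
        # dequant_kernel_248[grid](...) would run here, on qweight.device
        return out


if torch.cuda.device_count() >= 2:
    w = torch.zeros((32, 32), dtype=torch.int32, device="cuda:1")
    y = dequant_on_weight_device(w, (32, 32))
    assert y.device == w.device  # stays on cuda:1 instead of drifting to cuda:0
```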
