@@ -383,11 +383,12 @@ def _import_exllamav2_kernels():
     """Attempts to import ExLlamaV2 kernels for performance optimization."""
     try:
         from exllamav2_kernels import gemm_half_q_half, make_q_matrix  # pylint: disable=E0611, E0401
-    except ImportError:
-        raise ImportError(
+    except:
+        logger.warning_once(
             "AutoGPTQ ExLlamaV2 has not been installed, Please install it using the following command: "
             "`pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@b8b4127`"
         )
+        logger.warning_once("try to fallback to other autogptq backends for now")
 
 
 def _create_quant_layer(layer, layer_backend, config, in_features, out_features):
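For context, a minimal self-contained sketch of the warn-and-fall-back import pattern this hunk switches to. The kernel names and pip command come from the diff; the stdlib logger, the helper name try_import_exllamav2_kernels, and the return-None convention are assumptions added only to make the snippet runnable (the real code uses logger.warning_once).

# Sketch only, assuming a plain stdlib logger and a hypothetical helper that
# returns None instead of raising, so callers can fall back to another
# AutoGPTQ backend when the ExLlamaV2 kernels are missing.
import logging

logger = logging.getLogger(__name__)


def try_import_exllamav2_kernels():
    """Return (gemm_half_q_half, make_q_matrix), or None when the kernels are missing."""
    try:
        from exllamav2_kernels import gemm_half_q_half, make_q_matrix  # pylint: disable=E0611, E0401
        return gemm_half_q_half, make_q_matrix
    except ImportError:
        logger.warning(
            "AutoGPTQ ExLlamaV2 has not been installed, please install it via "
            "`pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@b8b4127`; "
            "falling back to other autogptq backends for now"
        )
        return None


kernels = try_import_exllamav2_kernels()
if kernels is None:
    # Pick a non-ExLlamaV2 backend here instead of aborting the model load.
    pass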
@@ -520,19 +521,19 @@ def convert_hf_model(model: nn.Module, target_device="cpu"):
     else:
         backend = "auto"
 
-
     ##target_backend could be None
     _, backend = parse_target_device_and_backend(backend)
 
-    if hasattr(quantization_config, "packing_format"):  # pragma: no cover
+    if hasattr(quantization_config,
+               "packing_format") and "auto-round" in quantization_config.quant_method:  # pragma: no cover
         packing_format = quantization_config.packing_format
     elif 'gptq' in quantization_config.quant_method:  # pragma: no cover
         packing_format = "auto_gptq"
     elif "awq" in quantization_config.quant_method:
         packing_format = "auto_awq"
     else:  # pragma: no cover
         packing_format = "auto_gptq"
-        logger.warning("Quantization backend must be specified. Set it to 'auto_gptq' by default.")
+        logger.warning("quantization backend must be specified. Set it to 'auto_gptq' by default.")
 
     if packing_format == "auto":
         packing_format = "auto_gptq"
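A minimal sketch of how packing_format gets resolved after this change. The branch conditions mirror the hunk; the wrapper function resolve_packing_format and the _Cfg stand-in config are hypothetical, added only to make the snippet runnable (in the real code these branches read the Hugging Face model's quantization_config).

# Sketch only: mirrors the packing-format resolution order introduced by the hunk.
def resolve_packing_format(quantization_config):
    # Configs that expose packing_format are trusted only for auto-round models.
    if hasattr(quantization_config, "packing_format") and "auto-round" in quantization_config.quant_method:
        packing_format = quantization_config.packing_format
    elif "gptq" in quantization_config.quant_method:
        packing_format = "auto_gptq"
    elif "awq" in quantization_config.quant_method:
        packing_format = "auto_awq"
    else:
        packing_format = "auto_gptq"  # default when no backend is specified
    if packing_format == "auto":
        packing_format = "auto_gptq"
    return packing_format


class _Cfg:
    quant_method = "auto-round"
    packing_format = "auto"


# An auto-round config with packing_format "auto" normalizes to "auto_gptq".
assert resolve_packing_format(_Cfg()) == "auto_gptq"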