Commit 709b6d3

[GGUF support step3]patch for double quant (#473)
1 parent 23447b5 commit 709b6d3

File tree

11 files changed
+258 -125 lines changed

.azure-pipelines/scripts/ut/run_ut.sh

+2

@@ -6,6 +6,8 @@ echo "set up UT env..."
 pip install pytest-cov pytest-html
 pip install -r /auto-round/test/requirements.txt
 pip list
+# install latest gguf for ut test
+git clone https://github.com/ggml-org/llama.cpp.git && cd llama.cpp/gguf-py && pip install .
 
 cd /auto-round/test || exit 1
 find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} +

auto_round/autoround.py

+21

@@ -14,6 +14,7 @@
 
 import os
 import re
+import sys
 
 import torch
 import copy
@@ -350,7 +351,17 @@ def _set_device_for_matching_module(self, name, device):
         else:
             module.tuning_device = device
 
+    def _dq_check(self):
+        """Reset the default value of super_bits and super_group_size"""
+        from auto_round.export.export_to_gguf.config import GGUF_CONFIG
+        if self.data_type.endswith("_dq"):
+            gguf_config = GGUF_CONFIG[f"gguf:q{self.bits}_k_s"]
+            self.super_bits = gguf_config["super_bits"] if self.super_bits is None else self.super_bits
+            self.super_group_size = gguf_config["super_group_size"] \
+                if self.super_group_size is None else self.super_group_size
+
     def check_configs(self):
         """Checks if the configurations are valid.
 
         Raises:
@@ -392,6 +403,7 @@ def check_configs(self):
                 f"reset gradient_accumulate_steps to {self.gradient_accumulate_steps}"
                 f" as nsamples must equal or greater"
                 f" than gradient_accumulate_steps * batch_size")
+        self._dq_check()
 
     # def _check_format_compatibility(self, format): ##TODO
     #     ##check lm_head, mixed_bits, bits, each layer supporting, etc
@@ -491,9 +503,11 @@ def remove_duplicates(lst):
             save_format_ = format.replace(":", "-").replace("_", "-")
             save_folder = os.path.join(output_dir, save_format_) if len(formats) > 1 else output_dir
             self.save_quantized(save_folder, format=format, inplace=inplace, **kwargs)
+
             folders.append(save_folder)
 
         return model, folders
+
     def quantize(self):
         """Quantize the model and return the quantized model along with layer configurations.
         the entry of AutoRound.
@@ -1538,6 +1552,13 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k
             )
             format = "auto_round"
 
+        if re.search(r"q\d_k", format) and not self.data_type.endswith("_dq"):
+            logger.error(
+                f"data type <{self.data_type}> does not support exporting to the {format} format."
+                " Please change the export format or the data_type."
+            )
+            sys.exit(-1)
+
         if self.low_cpu_mem_usage:
             self.model = self.model.to('cpu')

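For reference, a minimal standalone sketch of what the new `_dq_check` does, assuming the `gguf:q4_k_s` entry shown in the config diff below; the `resolve_dq_defaults` helper and the trimmed `GGUF_CONFIG` here are illustrative, not part of this commit:

```python
# Sketch only: mirrors the _dq_check logic with a trimmed, hard-coded GGUF_CONFIG.
GGUF_CONFIG = {"gguf:q4_k_s": {"super_bits": 6, "super_group_size": 8}}

def resolve_dq_defaults(data_type, bits, super_bits=None, super_group_size=None):
    """Fill in double-quant defaults only when the data type ends with `_dq`."""
    if data_type.endswith("_dq"):
        cfg = GGUF_CONFIG[f"gguf:q{bits}_k_s"]
        super_bits = cfg["super_bits"] if super_bits is None else super_bits
        super_group_size = cfg["super_group_size"] if super_group_size is None else super_group_size
    return super_bits, super_group_size

print(resolve_dq_defaults("int_asym_dq", 4))         # (6, 8): defaults taken from the config
print(resolve_dq_defaults("int_asym_dq", 4, 5, 16))  # (5, 16): explicit user values win
print(resolve_dq_defaults("int", 4))                 # (None, None): non-dq data types untouched
```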
auto_round/export/export_to_gguf/config.py

+18 -18

@@ -18,24 +18,24 @@
 GGUF_CONFIG["gguf:q4_1"] = {"bits": 4, "act_bits": 16, "group_size": 32, "asym": True, "data_type": "int"}
 
-# GGUF_CONFIG["gguf:q4_k_s"] = {
-#     "bits": 4,
-#     "act_bits": 16,
-#     "super_group_size": 8,
-#     "super_bits": 6,
-#     "group_size": 32,
-#     "asym": True,
-#     "data_type": "int_asym_dq"
-# }
+GGUF_CONFIG["gguf:q4_k_s"] = {
+    "bits": 4,
+    "act_bits": 16,
+    "super_group_size": 8,
+    "super_bits": 6,
+    "group_size": 32,
+    "asym": True,
+    "data_type": "int_asym_dq"
+}
 
-# GGUF_CONFIG["gguf:q2_k_s"] = {
-#     "bits": 2,
-#     "act_bits": 16,
-#     "super_group_size": 16,
-#     "super_bits": 4,
-#     "group_size": 16,
-#     "asym": True,
-#     "data_type": "int_asym_dq"
-# }
+GGUF_CONFIG["gguf:q2_k_s"] = {
+    "bits": 2,
+    "act_bits": 16,
+    "super_group_size": 16,
+    "super_bits": 4,
+    "group_size": 16,
+    "asym": True,
+    "data_type": "int_asym_dq"
+}
 
 GGUF_CONFIG["gguf:q8_0"] = {"bits": 8, "act_bits": 16, "group_size": 32, "asym": False, "data_type": "int"}

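Each entry above pins every quantization argument a k-quant format needs. A hedged sketch of how such an entry would be applied to a set of CLI arguments (the `apply_gguf_format` helper is hypothetical; the actual resetting is done by `_gguf_args_check` in utils.py further down):

```python
# Illustrative only; config values copied from the gguf:q2_k_s entry above.
GGUF_CONFIG = {
    "gguf:q2_k_s": {"bits": 2, "act_bits": 16, "super_group_size": 16, "super_bits": 4,
                    "group_size": 16, "asym": True, "data_type": "int_asym_dq"},
}

def apply_gguf_format(args: dict, fmt: str) -> dict:
    """Overwrite user-supplied quantization args with the fixed values the format requires."""
    args.update(GGUF_CONFIG[fmt])
    return args

print(apply_gguf_format({"bits": 8, "group_size": 128}, "gguf:q2_k_s"))
# {'bits': 2, 'group_size': 16, 'act_bits': 16, 'super_group_size': 16, ...}
```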
auto_round/export/export_to_gguf/convert.py

+12

@@ -1149,7 +1149,19 @@ def _quant_data(data, data_qtype):
             layer_name = name[:-len(suffix)]
             module = get_module(self.model, layer_name)
             if hasattr(module, "scale"):
+
+                if hasattr(self, "permute"):
+                    bs = module.scale.shape[0]
+                    for attr in ["scale", "zp", "w_d_scale", "w_d_wmin_m", "w_wmin_m"]:
+                        if hasattr(module, attr) and getattr(module, attr) is not None:
+                            attr_tensor = getattr(module, attr)
+                            ori_shape = attr_tensor.shape
+                            attr_tensor = self.modify_tensors(attr_tensor.reshape(bs, -1), name, bid)[0][1]
+                            attr_tensor = attr_tensor.reshape(ori_shape)
+                            setattr(module, attr, attr_tensor)
+
                 scale = module.scale
+
                 if isinstance(scale, torch.Tensor):
                     scale = scale.numpy()
                 zp = module.zp if hasattr(module, "zp") else None

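The convert.py change makes the per-group tensors (scale, zp, mins and their double-quant scales) go through the same `modify_tensors` permutation as the weight rows: flatten to `(out_features, -1)`, permute, then restore the original shape. A small sketch of that reshape-permute-reshape pattern, with a placeholder permutation standing in for the model-specific one:

```python
# Sketch only: permute_rows is a stand-in for the architecture-specific
# Model.modify_tensors permutation (e.g. llama Q/K head reordering).
import torch

def permute_rows(t: torch.Tensor) -> torch.Tensor:
    return t.flip(0)  # placeholder permutation over output rows

def permute_group_tensor(attr_tensor: torch.Tensor, bs: int) -> torch.Tensor:
    """Apply the weight's row permutation to a per-group tensor and keep its shape."""
    ori_shape = attr_tensor.shape
    permuted = permute_rows(attr_tensor.reshape(bs, -1))
    return permuted.reshape(ori_shape)

scale = torch.arange(12.0).reshape(4, 3)  # 4 output rows x 3 quantization groups
print(permute_group_tensor(scale, bs=4))  # rows reordered, shape unchanged
```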
auto_round/export/export_to_gguf/export.py

+2 -2

@@ -31,8 +31,8 @@
     "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
     "q4_0": gguf.LlamaFileType.MOSTLY_Q4_0,
     "q4_1": gguf.LlamaFileType.MOSTLY_Q4_1,
-    # "q4_k_s": gguf.LlamaFileType.MOSTLY_Q4_K_S,
-    # "q2_k_s": gguf.LlamaFileType.MOSTLY_Q2_K_S,
+    "q4_k_s": gguf.LlamaFileType.MOSTLY_Q4_K_S,
+    "q2_k_s": gguf.LlamaFileType.MOSTLY_Q2_K_S,
     "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
     "auto": gguf.LlamaFileType.GUESSED,
 }

auto_round/export/export_to_gguf/quant.py

+6 -1

@@ -39,14 +39,19 @@ def register(cls):
         return register
 
 
-def ggml_quant(data: np.array, ggml_type, scale=None, zp=None, wmin_m=None, d_scale=None, d_wmin_m=None, worker=16):
+def ggml_quant(data: np.array, ggml_type, scale=None, zp=None, wmin_m=None, d_scale=None, d_wmin_m=None):
     block_size, type_size = GGML_QUANT_SIZES[ggml_type]
 
     data = data.astype(np.float32, copy=False)
     shape = data.shape
     n_blocks = data.size // block_size
     blocks = data.reshape((n_blocks, block_size))
 
+    if ggml_type.endswith("_k"):
+        worker = 16
+    else:
+        worker = 0
+
     if worker > 0:
         n_groups = (data.shape[0] // worker) or 1
         blocks = np.array_split(blocks, n_groups, axis=0)

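The quant.py change moves the worker count out of the signature: multiprocessing is only worth it for the heavier k-quant kernels, so it is now derived from the type name. A rough sketch of the block layout and split that `ggml_quant` sets up (the `GGML_QUANT_SIZES` values here are a two-entry excerpt, not the full table):

```python
# Sketch of the block layout before quantization: weights are flattened into
# blocks of `block_size` values; k-quant types get split across worker groups.
import numpy as np

GGML_QUANT_SIZES = {"q4_k": (256, 144), "q4_0": (32, 18)}  # block_size, type_size (bytes)

def split_blocks(data: np.ndarray, ggml_type: str):
    block_size, _ = GGML_QUANT_SIZES[ggml_type]
    n_blocks = data.size // block_size
    blocks = data.astype(np.float32, copy=False).reshape((n_blocks, block_size))
    worker = 16 if ggml_type.endswith("_k") else 0
    if worker > 0:
        n_groups = (data.shape[0] // worker) or 1
        return np.array_split(blocks, n_groups, axis=0)
    return [blocks]

chunks = split_blocks(np.random.rand(64, 256), "q4_k")
print(len(chunks), chunks[0].shape)  # 4 groups of (16, 256) blocks
```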
auto_round/script/llm.py

+9 -52

@@ -311,61 +311,13 @@ def setup_eval_parser():
     return args
 
 
-def _gguf_args_check(args):
-    from auto_round.utils import logger
-    from auto_round.export.export_to_gguf.config import GGUF_CONFIG
-
-    formats = args.format.lower().replace(' ', '').split(",")
-    for format in GGUF_CONFIG:
-        if format in formats:
-            from pathlib import Path
-            from auto_round.export.export_to_gguf.convert import Model
-            hparams = Model.load_hparams(Path(args.model))
-            model_architecture = hparams["architectures"][0]
-            try:
-                model_class = Model.from_model_architecture(model_architecture)
-            except NotImplementedError:
-                logger.error(f"Model {model_architecture} is not supported to export GGUF format")
-                sys.exit(1)
-
-            if format.endswith("_k") and ("hidden_size" in hparams and hparams["hidden_size"] % 256 != 0):
-                model_name = args.model.split('/')
-                model_name = model_name[-1] if model_name[-1] else model_name[-2]
-                hidden_size = hparams["hidden_size"]
-                logger.error(
-                    f"Currently only support pure mode for format: {format}. "
-                    f"{model_name} is not supported, cause hidden_size({hidden_size}) % 256 !=0")
-                sys.exit(-1)
-
-            unsupport_list, reset_list = [], []
-            gguf_config = GGUF_CONFIG[format]
-            for k, v in gguf_config.items():
-                if getattr(args, k) != v:
-                    unsupport_list.append(f"{k}={getattr(args, k)}")
-                    reset_list.append(f"{k}={v}")
-                    setattr(args, k, v)
-            if len(unsupport_list) > 0:
-                if len(formats) > 1:
-                    logger.error(
-                        f"format {format} not support for {', '.join(unsupport_list)},"
-                        f" please reset to {', '.join(reset_list)}, and retry")
-                    exit(-1)
-                else:
-                    logger.error(
-                        f"format {format} not support for {', '.join(unsupport_list)},"
-                        f" reset to {', '.join(reset_list)}.")
-            logger.info(f"export format {format}, sym = {not args.asym}, group_size = {args.group_size}")
-
-    return args
-
-
 def tune(args):
     import transformers
 
     from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel, AutoConfig
 
     from auto_round.utils import detect_device, get_library_version
-    from auto_round.utils import logger
+    from auto_round.utils import logger, _gguf_args_check
 
     tasks = args.tasks
     if args.format is None:
@@ -602,9 +554,12 @@ def tune(args):
 
     if args.act_bits <= 8 or eval_gguf_model:
         if eval_gguf_model:
+            # gguf folder only contains one file
             for file in os.listdir(eval_folder):
                 gguf_file = file
-            user_model = AutoModelForCausalLM.from_pretrained(eval_folder, gguf_file=gguf_file, device_map="auto")
+            model = AutoModelForCausalLM.from_pretrained(
+                eval_folder, gguf_file=gguf_file, device_map="auto" if use_auto_mapping else None)
+            tokenizer = AutoTokenizer.from_pretrained(eval_folder, gguf_file=gguf_file)
         else:
             if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1:
                 from accelerate.big_modeling import dispatch_model
@@ -616,7 +571,8 @@
         user_model = model.to(device_str)
 
     if args.eval_task_by_task:
-        eval_task_by_task(user_model, device=device_str, tasks=args.tasks, batch_size=args.eval_bs)
+        eval_task_by_task(
+            user_model, tokenizer=tokenizer, device=device_str, tasks=args.tasks, batch_size=args.eval_bs)
     else:
         if args.eval_bs is None or args.eval_bs == "auto":
             logger.warning("This API does not support auto currently, reset eval_bs to 16")
@@ -660,7 +616,8 @@ def eval(args):
     print(make_table(res))
 
 
-def eval_task_by_task(model, device, tasks, tokenizer=None, batch_size=None, max_batch_size=64, trust_remote_code=True):
+def eval_task_by_task(
+        model, device=None, tasks=None, tokenizer=None, batch_size=None, max_batch_size=64, trust_remote_code=True):
     set_cuda_visible_devices(device)
     device_str, parallelism = get_device_and_parallelism(device)

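The evaluation path now reloads the exported GGUF checkpoint through transformers, which can read a `.gguf` file directly when gguf-py is installed (hence the new test dependency below). A hedged sketch of that loading step; the folder path is a placeholder:

```python
# Sketch of the GGUF eval reload; eval_folder is a placeholder path and is
# assumed to contain exactly one .gguf file, as in the code above.
import os
from transformers import AutoModelForCausalLM, AutoTokenizer

eval_folder = "./tmp_autoround/gguf-q4-k-s"
gguf_file = next(f for f in os.listdir(eval_folder) if f.endswith(".gguf"))

model = AutoModelForCausalLM.from_pretrained(eval_folder, gguf_file=gguf_file, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(eval_folder, gguf_file=gguf_file)
```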
auto_round/script/mllm.py

+3 -52

@@ -22,7 +22,9 @@
     is_debug_mode,
     get_device_and_parallelism,
     set_cuda_visible_devices,
-    logger)
+    logger,
+    _gguf_args_check
+)
 
 
 os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

@@ -278,57 +280,6 @@ def setup_lmeval_parser():
     args = parser.parse_args()
     return args
 
-def _gguf_args_check(args):
-    from auto_round.utils import logger
-
-    _GGUF_CONFIG = {
-        "gguf:q4_0": {
-            "bits": 4,
-            "act_bits": 16,
-            "group_size": 32,
-            "asym": False,
-        },
-        "gguf:q4_1": {
-            "bits": 4,
-            "act_bits": 16,
-            "group_size": 32,
-            "asym": True,
-        }
-    }
-
-    formats = args.format.lower().replace(' ', '').split(",")
-    for format in _GGUF_CONFIG:
-        if format in formats:
-            from pathlib import Path
-            from auto_round.export.export_to_gguf.convert import Model
-            hparams = Model.load_hparams(Path(args.model))
-            model_architecture = hparams["architectures"][0]
-            try:
-                model_class = Model.from_model_architecture(model_architecture)
-            except NotImplementedError:
-                logger.error(f"Model {model_architecture} is not supported to export GGUF format")
-                sys.exit(1)
-
-            unsupport_list, reset_list = [], []
-            gguf_config = _GGUF_CONFIG[format]
-            for k, v in gguf_config.items():
-                if getattr(args, k) != v:
-                    unsupport_list.append(f"{k}={getattr(args, k)}")
-                    reset_list.append(f"{k}={v}")
-                    setattr(args, k, v)
-            if len(unsupport_list) > 0:
-                if len(formats) > 1:
-                    logger.error(
-                        f"format {format} not support for {', '.join(unsupport_list)},"
-                        f" please reset to {', '.join(reset_list)}, and retry")
-                    exit(-1)
-                else:
-                    logger.error(
-                        f"format {format} not support for {', '.join(unsupport_list)},"
-                        f" reset to {', '.join(reset_list)}.")
-            logger.info(f"export format {format}, sym = {not args.asym}, group_size = {args.group_size}")
-
-    return args
 
 def tune(args):
     import transformers

auto_round/utils.py

+53

@@ -1204,3 +1204,56 @@ def is_debug_mode():
         bool: True if debugging is enabled, False otherwise.
     """
     return sys.gettrace() is not None or sys.flags.debug == 1
+
+
+def _gguf_args_check(args):
+    from auto_round.utils import logger
+    from auto_round.export.export_to_gguf.config import GGUF_CONFIG
+
+    formats = args.format.lower().replace(' ', '').split(",")
+    formats = sorted(formats, key=lambda x: len(x))
+    pattern = re.compile(r"q\d_k")
+    pre_dq_format = ""
+    for format in GGUF_CONFIG:
+        if format in formats:
+            if re.search(pattern, format):
+                if pre_dq_format and re.search(pattern, format).group() not in pre_dq_format:
+                    logger.error(f"Cannot export {pre_dq_format} and {format} at the same time.")
+                    sys.exit(-1)
+                else:
+                    pre_dq_format = format
+
+            if os.path.isdir(args.model):
+                from pathlib import Path
+                from auto_round.export.export_to_gguf.convert import Model
+                hparams = Model.load_hparams(Path(args.model))
+                model_architecture = hparams["architectures"][0]
+                try:
+                    model_class = Model.from_model_architecture(model_architecture)
+                except NotImplementedError:
+                    logger.error(f"Model {model_architecture} is not supported to export GGUF format")
+                    sys.exit(1)
+
+                if re.search(pattern, format) and ("hidden_size" in hparams and hparams["hidden_size"] % 256 != 0):
+                    model_name = args.model.split('/')
+                    model_name = model_name[-1] if model_name[-1] else model_name[-2]
+                    hidden_size = hparams["hidden_size"]
+                    logger.error(
+                        f"Currently only support pure mode for format: {format}. "
+                        f"{model_name} is not supported, because hidden_size({hidden_size}) % 256 != 0")
+                    sys.exit(-1)
+
+            unsupport_list, reset_list = [], []
+            gguf_config = GGUF_CONFIG[format]
+            for k, v in gguf_config.items():
+                if getattr(args, k) != v:
+                    unsupport_list.append(f"{k}={getattr(args, k)}")
+                    reset_list.append(f"{k}={v}")
+                    setattr(args, k, v)
+            if len(unsupport_list) > 0:
+                logger.error(
+                    f"format {format} does not support {', '.join(unsupport_list)},"
+                    f" reset to {', '.join(reset_list)}.")
+            logger.info(f"export format {format}, sym = {not args.asym}, group_size = {args.group_size}")
+
+    return args

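A usage sketch for the relocated `_gguf_args_check`: it normalizes the `--format` list, rejects mixing different k-quant families, and overwrites any argument that conflicts with the chosen GGUF config. The argument values and the model id below are illustrative:

```python
# Sketch only: builds a namespace that mimics the parsed CLI args.
from types import SimpleNamespace
from auto_round.utils import _gguf_args_check

args = SimpleNamespace(
    model="Qwen/Qwen2.5-0.5B-Instruct",  # hypothetical model id (not a local dir)
    format="gguf:q4_k_s",
    bits=8, act_bits=16, group_size=128, asym=False, data_type="int",
    super_bits=None, super_group_size=None,
)
args = _gguf_args_check(args)  # logs a reset notice and overwrites conflicting fields
print(args.bits, args.group_size, args.data_type)  # 4 32 int_asym_dq
```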
test/requirements.txt

+1

@@ -1,2 +1,3 @@
 addict
 modelscope
+gguf
