
Commit ea4d843

refine inference backend/code step 1 (#486)
1 parent 709b6d3 commit ea4d843


63 files changed: +4227 -4961 lines changed

auto_round/__init__.py

+1 -1

@@ -15,5 +15,5 @@
 from .mllm import AutoRoundMLLM
 from auto_round.utils import LazyImport

-from .auto_quantizer import AutoHfQuantizer,AutoRoundConfig
+from auto_round.inference.auto_quantizer import AutoHfQuantizer,AutoRoundConfig
 from .version import __version__
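Note: this is only an internal relocation of the module; the package root still re-exports both symbols, so downstream imports such as the line below keep working (assuming the package is installed):

from auto_round import AutoRoundConfig, AutoHfQuantizer  # still re-exported from auto_round/__init__.py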

auto_round/auto_quantizer.py

-837
This file was deleted.

auto_round/autoround.py

+9 -9

@@ -53,7 +53,8 @@
 compile_func,
 find_matching_blocks, is_debug_mode,
 TORCH_VERSION_AT_LEAST_2_6,
-supported_layer_types
+supported_layer_types,
+get_layer_features,
 )
 from .low_cpu_mem.utils import get_layers_before_block

@@ -448,7 +449,7 @@ def quantize_and_save(self, output_dir: str = "tmp_autoround", format: str = "au
 f"Currently only support to export auto_round format quantized model"
 " with fp8 dtype activation for activation quantization."
 " Change format to fake and save."
-)
+)
 formats = ["fake"]
 else:
 if len(formats) > 1 or "auto_round" not in formats:

@@ -478,11 +479,6 @@ def quantize_and_save(self, output_dir: str = "tmp_autoround", format: str = "au
 format = format.replace('auto_round', 'auto_round:gptq')
 formats[index] = format

-if not any(f in format for f in ["triton", "exllamav2", "awq", "gptq"]):
-logger.info(f"AutoRound format does not support {format}, attempting to use AutoGPTQ")
-format = format.replace("auto_round", "auto_gptq")
-formats[index] = format
-
 # Remove duplicates from formats list
 def remove_duplicates(lst):
 seen = set()

@@ -693,6 +689,10 @@ def set_layerwise_config(self, layer_config):
 if n not in layers_in_blocks and check_to_quantized(layer_config[n]):
 has_qlayer_outside_block = True

+in_features, out_features = get_layer_features(m)
+if in_features <= layer_config[n]["group_size"]:
+layer_config[n]["group_size"] = -1
+
 # Apply the configuration to the corresponding layer in the model
 for key in keys:
 setattr(m, key, layer_config[n][key])

@@ -1478,7 +1478,7 @@ def quant_blocks(
 m.name = n

 for i in range(0, len(block_names), nblocks):
-if i!=0:
+if i != 0:
 pbar.update(1)
 if nblocks == 1:
 n = block_names[i]

@@ -1542,7 +1542,7 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k
 f"Currently only support to export auto_round format quantized model"
 " with fp8 dtype activation for activation quantization."
 " Change format to fake and save."
-)
+)
 format = "fake"
 else:
 if format != "auto_round":
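Reviewer note on the new group_size clamp in set_layerwise_config: when a layer's input dimension is no larger than the configured group size, a single group already spans the whole row, so switching that layer to group_size = -1 (one group over the full input dimension) describes the same grouping and avoids a partial group. A minimal sketch of the idea, with a hypothetical helper name:

def clamp_group_size(in_features: int, group_size: int) -> int:
    # Hypothetical illustration, not the repository code: if one group would
    # already cover the entire input dimension, -1 ("whole row as one group")
    # is an equivalent configuration.
    if group_size != -1 and in_features <= group_size:
        return -1
    return group_size

print(clamp_group_size(in_features=64, group_size=128))   # -1
print(clamp_group_size(in_features=4096, group_size=128)) # 128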

auto_round/eval/evaluation.py

+2 -6

@@ -12,12 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import logging
-import random
-import time
-from typing import TYPE_CHECKING, List, Optional, Union
+from typing import Optional, Union

-import lm_eval
 from lm_eval import simple_evaluate as lm_simple_evaluate
 import os

@@ -52,7 +48,7 @@ def simple_evaluate(
 try:
 from auto_round import AutoRoundConfig
 except:
-from auto_round.auto_quantizer import AutoHfQuantizer
+from auto_round.inference.auto_quantizer import AutoHfQuantizer

 return lm_simple_evaluate(
 model=model,

auto_round/export/export_to_autogptq/export.py

+3 -1

@@ -105,7 +105,7 @@ def pack_layer(name, model, backend):
 ##force to float32 to be compatible with torch 2.0
 if sym and isinstance(new_layer, auto_round.export.export_to_autogptq.qlinear_triton.QuantLinear):
 layer, scale = layer.to("cpu"), scale.to("cpu")
-zero = 2 ** (bits - 1)
+zero = int(zero.flatten()[0])
 else:
 layer, scale, zero = layer.to("cpu"), scale.to("cpu"), zero.to("cpu").to(torch.float32)
 sig = inspect.signature(qlayer.pack)

@@ -126,6 +126,8 @@ def save_quantized_as_autogptq(output_dir, inplace=True, backend="auto_gptq:exll
 quant_block_list = kwargs.get("quant_block_list", get_block_names(model))
 tokenizer = kwargs.get("tokenizer", None)
 processor = kwargs.get("processor", None)
+if os.path.exists(output_dir):
+logger.warning(f"{output_dir} already exists, this may cause model conflict")
 if tokenizer is not None:
 tokenizer.save_pretrained(output_dir)
 if processor is not None:
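Reviewer note (my reading, not part of the diff): under symmetric quantization every group shares one zero point, conventionally 2 ** (bits - 1) (8 for 4-bit), which is why the old code could hard-code it and the updated code can safely collapse the zero tensor to a single scalar. A tiny sketch of that assumption:

import torch

bits = 4
# Symmetric quantization: all groups share the same zero point, typically 2 ** (bits - 1).
zero = torch.full((16, 8), 2 ** (bits - 1))   # toy zero-point tensor, shape (out_features, n_groups)
zero_scalar = int(zero.flatten()[0])          # what the updated pack path reads
assert zero_scalar == 2 ** (bits - 1) == 8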

auto_round/export/export_to_autogptq/qlinear_triton.py

+7 -27

@@ -18,30 +18,6 @@
 import torch
 import torch.nn as nn
 import transformers
-import numba
-
-
-##TODO different bits
-# @numba.jit(nopython=True, parallel=True)
-# def pack_array_with_numba_b4_c32(
-#     raw_array: np.ndarray, packed_array: np.ndarray
-# ) -> np.ndarray:
-#     """Pack the array with numba when bits=4 and compress_bits=32."""
-#     bits = 4
-#     n_pack = 32 // bits
-#
-#     for row in range(packed_array.shape[0]):
-#         packed_array[row] = ((((raw_array[row * n_pack + 7]) << 28)
-#             | ((raw_array[row * n_pack + 6]) << 24)
-#             | ((raw_array[row * n_pack + 5]) << 20)
-#             | ((raw_array[row * n_pack + 4]) << 16)
-#             | ((raw_array[row * n_pack + 3]) << 12)
-#             | (raw_array[row * n_pack + 2]) << 8)
-#             | ((raw_array[row * n_pack + 1]) << 4)
-#             | ((raw_array[row * n_pack]) << 0))
-#
-#     return packed_array
-

 class TritonModuleMixin:
 @classmethod

@@ -89,6 +65,7 @@ def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=Fa
 "g_idx",
 torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32),
 )
+
 if bias:
 self.register_buffer("bias", torch.zeros((outfeatures), dtype=torch.float16))
 else:

@@ -108,6 +85,8 @@ def pack(self, linear, scales, zeros, g_idx=None):
 device = "cpu"
 if torch.cuda.is_available():
 device = "cuda:0"
+elif torch.xpu.is_available():
+device = "xpu:0"

 W = linear.weight.data.to(device).clone()
 if isinstance(linear, nn.Conv2d):

@@ -118,11 +97,12 @@ def pack(self, linear, scales, zeros, g_idx=None):
 repeat_scales = scales.to(device).repeat_interleave(self.group_size, 1)
 if isinstance(zeros, torch.Tensor):
 repeat_zeros = zeros.to(device).repeat_interleave(self.group_size, 1)
+intweight = torch.round(W.to(device) / repeat_scales[:, :W.shape[1]] + repeat_zeros[:, :W.shape[1]]).to(
+torch.int32)
 else:
 repeat_zeros = zeros
-
-intweight = torch.round(W.to(device) / repeat_scales + repeat_zeros).to(
-torch.int32)
+intweight = torch.round(W.to(device) / repeat_scales[:, :W.shape[1]] + repeat_zeros).to(
+torch.int32)

 del repeat_scales
 intweight = intweight.reshape(-1, intweight.shape[1] // 32 * self.bits, 32 // self.bits)
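On the reshuffled intweight computation: pack() quantizes with q = round(W / scale + zero), after expanding the per-group scales and zero points to per-column tensors via repeat_interleave; when in_features is not a multiple of group_size the expanded tensors come out wider than W, which is what the new [:, :W.shape[1]] slice guards against. A standalone toy version of that step (my sketch, not the repository code):

import torch

out_features, in_features, group_size, bits = 4, 70, 32, 4
n_groups = -(-in_features // group_size)              # ceil division -> 3; last group is partial
W = torch.randn(out_features, in_features)
scales = torch.rand(out_features, n_groups) + 0.1     # toy per-group scales, strictly positive
zeros = torch.full((out_features, n_groups), 2 ** (bits - 1))

# repeat_interleave expands each group to group_size columns (3 * 32 = 96 > 70),
# so slicing back to in_features keeps the shapes aligned with W.
repeat_scales = scales.repeat_interleave(group_size, dim=1)[:, :in_features]
repeat_zeros = zeros.repeat_interleave(group_size, dim=1)[:, :in_features]
intweight = torch.round(W / repeat_scales + repeat_zeros).to(torch.int32)
print(intweight.shape)  # torch.Size([4, 70])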

auto_round/export/export_to_autoround/export.py

+15 -10

@@ -22,6 +22,7 @@
 import transformers

 import auto_round.export.export_to_autoround.qlinear_triton_act
+import auto_round_extension.cuda.qlinear_tritonv2
 from auto_round.utils import get_layer_names_in_block, get_module, logger, set_module, supported_layer_types
 import threadpoolctl as tctl
 import inspect

@@ -71,17 +72,18 @@ def dynamic_import_quant_linear_for_packing(backend, bits, group_size, sym, act_
 if "auto_round" in backend and "awq" not in backend and "gptq" not in backend:
 if act_bits <= 8: ##easily have bug for other configuration, need to refine code later
 return auto_round.export.export_to_autoround.qlinear_triton_act.QuantLinear
-##only support triton and exllamav2
-if not ("triton" in backend or "exllamav2" in backend):
-logger.warning_once(f"auto_round format does not support {backend}, try to pack each layer with auto_gptq")
-return get_autogptq_packing_qlinear(backend, bits, group_size, sym)

-from auto_round_extension.cuda.qlinear_triton import QuantLinear
+from auto_round_extension.cuda.qlinear_tritonv2 import QuantLinear
 return QuantLinear
+elif "auto_round" in backend and "gptq" in backend:
+from auto_round.export.export_to_autoround.qlinear_triton import QuantLinear ##no g_idx
+return QuantLinear
 elif "awq" in backend:
 from ..export_to_awq.utils import WQLinear_GEMM
 return WQLinear_GEMM
-elif "gptq" in backend:
+elif "gptqmodel" in backend:
+return auto_round_extension.cuda.qlinear_tritonv2.QuantLinear
+elif "gptq" in backend and not "gptqmodel" in backend: ## have g_idx
 return get_autogptq_packing_qlinear(backend, bits, group_size, sym)
 else:
 assert False, f"only support auto_gptq, auto_awq and auto_round backend"

@@ -188,6 +190,8 @@ def pack_layer(layer_name, model, backend):
 new_layer.device = device
 set_module(model, layer_name, new_layer)
 qlayer = new_layer
+if sym:
+zp = int(zp.flatten()[0])

 qlayer.to("cpu")
 ##force to float32 to be compatible with torch 2.0

@@ -202,6 +206,9 @@ def pack_layer(layer_name, model, backend):
 scale, zp = scale.to(torch.float32), zp.to(torch.float32)
 scale = scale.t().contiguous()
 zp = zp.t().contiguous()
+if sym:
+zp = int(zp.flatten()[0])
+
 if bits != 4:
 logger.error("AutoAWQ format only supports 4-bits quantization.")
 qlayer = QuantLinear.from_linear(

@@ -243,10 +250,6 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
 if (kwargs.get("sym") is None or kwargs.get("sym") == True) and ("gptq" not in backend and "awq" not in backend):
 backend = backend.replace('auto_round', 'auto_round:gptq')

-if not ("triton" in backend or "exllamav2" in backend or "awq" in backend or "gptq" in backend):
-logger.info(f"AutoRound format does not support {backend}, try to pack each layer with AutoGPTQ")
-backend = backend.replace("auto_round", "auto_gptq")
-
 model = kwargs["model"]
 safe_serialization = True if 'safe_serialization' not in kwargs.keys() else kwargs["safe_serialization"]
 if not inplace:

@@ -306,6 +309,8 @@ def wrapper(name):
 if output_dir is None:
 model.tokenizer = tokenizer
 return model
+if os.path.exists(output_dir):
+logger.warning(f"{output_dir} already exists, this may cause model conflict")
 if tokenizer is not None:
 tokenizer.save_pretrained(output_dir)
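For reviewers, a hedged summary of the revised backend routing in dynamic_import_quant_linear_for_packing (a toy mirror only: plain strings stand in for the QuantLinear classes the real function imports and returns):

def pick_packing_linear(backend: str, act_bits: int = 16) -> str:
    # Mirrors the branch order after this commit; not the repository code.
    if "auto_round" in backend and "awq" not in backend and "gptq" not in backend:
        if act_bits <= 8:
            return "qlinear_triton_act.QuantLinear"
        return "auto_round_extension.cuda.qlinear_tritonv2.QuantLinear"
    elif "auto_round" in backend and "gptq" in backend:
        return "export_to_autoround.qlinear_triton.QuantLinear"    # no g_idx
    elif "awq" in backend:
        return "export_to_awq.utils.WQLinear_GEMM"
    elif "gptqmodel" in backend:
        return "auto_round_extension.cuda.qlinear_tritonv2.QuantLinear"
    elif "gptq" in backend:
        return "auto_gptq packing QuantLinear"                      # has g_idx
    raise ValueError("only auto_gptq, auto_awq and auto_round backends are supported")

print(pick_packing_linear("auto_round:tritonv2"))
print(pick_packing_linear("auto_round:gptq"))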

New file (+137 lines)

@@ -0,0 +1,137 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math

import numpy as np
import torch
import torch.nn as nn
import transformers

class TritonModuleMixin:
    @classmethod
    def warmup(cls, model, transpose=False, seqlen=2048):
        pass


class QuantLinear(nn.Module, TritonModuleMixin):
    QUANT_TYPE = "triton"

    def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, **kwargs):
        super().__init__()
        if bits not in [2, 4, 8]:
            raise NotImplementedError("Only 2,4,8 bits are supported.")
        if infeatures % 32 != 0 or outfeatures % 32 != 0:
            raise NotImplementedError("in_feature and out_feature must be divisible by 32.")
        self.infeatures = infeatures
        self.outfeatures = outfeatures
        self.bits = bits
        self.group_size = group_size if group_size != -1 else infeatures
        self.maxq = 2 ** self.bits - 1

        self.register_buffer(
            "qweight",
            torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32),
        )
        self.register_buffer(
            "qzeros",
            torch.zeros(
                (
                    math.ceil(infeatures / self.group_size),
                    outfeatures // 32 * self.bits,
                ),
                dtype=torch.int32,
            ),
        )
        self.register_buffer(
            "scales",
            torch.zeros(
                (math.ceil(infeatures / self.group_size), outfeatures),
                dtype=torch.float16,
            ),
        )

        if bias:
            self.register_buffer("bias", torch.zeros((outfeatures), dtype=torch.float16))
        else:
            self.bias = None

        self.trainable = trainable

    def post_init(self):
        pass

    def pack(self, linear, scales, zeros, g_idx=None):
        scales_t = scales.t().contiguous()
        if linear.bias is not None:
            self.bias = linear.bias.clone().half()
        self.scales = scales_t.clone().half()
        device = "cpu"
        if torch.cuda.is_available():
            device = "cuda:0"
        elif torch.xpu.is_available():
            device = "xpu:0"

        W = linear.weight.data.to(device).clone()
        if isinstance(linear, nn.Conv2d):
            W = W.flatten(1)
        if isinstance(linear, transformers.pytorch_utils.Conv1D):
            W = W.t()

        repeat_scales = scales.to(device).repeat_interleave(self.group_size, 1)
        if isinstance(zeros, torch.Tensor):
            repeat_zeros = zeros.to(device).repeat_interleave(self.group_size, 1)
            intweight = torch.round(W.to(device) / repeat_scales[:, :W.shape[1]] + repeat_zeros[:, :W.shape[1]]).to(
                torch.int32)
        else:
            repeat_zeros = zeros
            intweight = torch.round(W.to(device) / repeat_scales[:, :W.shape[1]] + repeat_zeros).to(
                torch.int32)

        del repeat_scales
        intweight = intweight.reshape(-1, intweight.shape[1] // 32 * self.bits, 32 // self.bits)
        order_map = torch.arange(0, 32 // self.bits, device=device) * self.bits
        intweight = intweight << order_map
        intweight = torch.sum(intweight, dim=-1)

        intweight = intweight.t().contiguous().to(torch.int32)
        self.qweight = intweight.to("cpu")

        if isinstance(zeros, torch.Tensor):
            zeros = zeros.t().contiguous()
            zeros -= 1
            zeros = zeros.numpy().astype(np.uint32)
            qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
            i = 0
            col = 0
            while col < qzeros.shape[1]:
                for j in range(i, i + (32 // self.bits)):
                    qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
                i += 32 // self.bits
                col += 1

            qzeros = qzeros.astype(np.int32)
            self.qzeros = torch.from_numpy(qzeros)
        else:
            zeros -= 1
            shape = scales_t.shape
            value = 0
            for j in range(0, (32 // self.bits)):
                value |= zeros << (self.bits * j)
            qzeros = np.ones((shape[0], shape[1] // 32 * self.bits), dtype=np.uint32) * value
            qzeros = qzeros.astype(np.int32)
            self.qzeros = torch.from_numpy(qzeros)


__all__ = ["QuantLinear"]
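The pack() method above stores 32 // bits quantized values per int32 by shifting each value into its slot and summing. A small self-contained illustration of that shift-and-sum idea for 4-bit values (toy code of my own, using int64 arithmetic so the example does not have to deal with int32 wrap-around):

import torch

bits = 4
vals = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.int32)  # 32 // 4 = 8 values per int32
order_map = torch.arange(0, 32 // bits) * bits                     # shift amounts 0, 4, ..., 28
packed = int((vals << order_map).sum())                            # value i lands in bit positions [4*i, 4*i + 4)
print(hex(packed))                                                 # 0x87654321
unpacked = [(packed >> s) & 0xF for s in range(0, 32, bits)]       # round-trip check
print(unpacked)                                                    # [1, 2, 3, 4, 5, 6, 7, 8]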
