from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

+
class BaseQuantizeConfig:
    """Configuration for model quantization.

@@ -24,6 +25,7 @@ class BaseQuantizeConfig:
        By default, "lm_head" is included to ignore the embedding
        Linear layer usually at the end of decoder LLMs
    """
+
    def __init__(
        self,
        quant_method: str = "fp8",
@@ -36,108 +38,41 @@ def __init__(


class AutoFP8ForCausalLM:
-    def __init__(self, model: SparseAutoModelForCausalLM, quantize_config: BaseQuantizeConfig):
+    def __init__(
+        self, model: SparseAutoModelForCausalLM, quantize_config: BaseQuantizeConfig
+    ):
        self.model = model
        self.model_type = self.model.config.model_type
        self.config = self.model.config
-<<<<<<< HEAD
-
-        # Gather the Linear module names that we want to ignore
-        quantize_config.ignored_layers = get_layers_to_ignore(
-            self.model, quantize_config.ignore_patterns
-        )
-
-        if quantize_config.kv_cache_quant_targets:
-<<<<<<< HEAD
-<<<<<<< HEAD
-            kv_cache_quant_layers = get_kv_cache_quant_layers(
-=======
-            kv_cache_quant_layers = get_kv_cache_quant_layer(
->>>>>>> 3ee9283 (Support calibrating kv cache scales)
-=======
-            kv_cache_quant_layers = get_kv_cache_quant_layers(
->>>>>>> c3acdee (Switch from output_scale to kv_scale)
-                self.model, quantize_config.kv_cache_quant_targets
-            )
-            if len(kv_cache_quant_layers) == 0:
-                raise ValueError(
-                    f"Could not find any kv cache layers using kv_cache_quant_targets={quantize_config.kv_cache_quant_targets}, please fix your argument."
-                )
-            quantize_config.kv_cache_quant_layers = kv_cache_quant_layers
-
-=======
->>>>>>> ba7d420 (Switch backend to use llm-compressor)
        self.quantize_config = quantize_config

    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: str, quantize_config: BaseQuantizeConfig, **kwargs):
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: str,
+        quantize_config: BaseQuantizeConfig,
+        **kwargs,
+    ):
        config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
        model = SparseAutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path,
            config=config,
            device_map="auto",
            torch_dtype="auto",
-            **kwargs
+            **kwargs,
        )
        return cls(model, quantize_config)

-<<<<<<< HEAD
-    def quantize(self, calibration_tokens: Optional[torch.Tensor] = None):
-<<<<<<< HEAD
-<<<<<<< HEAD
-=======
-        def _prepare_calibration_data(calibration_tokens):
-            if hasattr(calibration_tokens, "input_ids"):
-                return calibration_tokens.input_ids
-            return calibration_tokens
->>>>>>> 3ee9283 (Support calibrating kv cache scales)
-=======
->>>>>>> 2739d61 (Add Qwen test)
-
-        # Always quantize the weights as they do not require calibration data
-        quantize_weights(self.model, self.quantize_config)
-
-        if self.quantize_config.activation_scheme == "static":
-            assert (
-                calibration_tokens is not None
-            ), "Calibration tokens required for activation quantization"
-<<<<<<< HEAD
-<<<<<<< HEAD
-=======
->>>>>>> 2739d61 (Add Qwen test)
-
-
-        def _prepare_calibration_data(calibration_tokens):
-            if hasattr(calibration_tokens, "input_ids"):
-                return calibration_tokens.input_ids
-            return calibration_tokens
-
-<<<<<<< HEAD
-=======
->>>>>>> 3ee9283 (Support calibrating kv cache scales)
-=======
->>>>>>> 2739d61 (Add Qwen test)
-        quantize_activations(
-            self.model,
-            self.quantize_config,
-            _prepare_calibration_data(calibration_tokens),
-        )
-
-    def save_quantized(self, save_dir):
-        save_quantized_model(
-            self.model,
-            quant_config=self.quantize_config,
-            save_dir=save_dir,
-=======
    def quantize(self, dataset: Optional[Dataset] = None):
-        assert self.quantize_config.activation_scheme == "static"
-        assert dataset is not None, "Calibration tokens required for static activation quantization"
+        assert (
+            self.quantize_config.activation_scheme == "static"
+        ), "Dynamic isn't supported yet"
+        assert (
+            dataset is not None
+        ), "Calibration tokens required for static activation quantization"

        recipe = QuantizationModifier(
-            targets="Linear",
-            scheme="FP8",
-            ignore=self.quantize_config.ignore_patterns
->>>>>>> ba7d420 (Switch backend to use llm-compressor)
+            targets="Linear", scheme="FP8", ignore=self.quantize_config.ignore_patterns
        )

        oneshot(
@@ -149,64 +84,8 @@ def quantize(self, dataset: Optional[Dataset] = None):
    def save_quantized(self, save_directory: str):
        self.save_pretrained(save_directory, save_compressed=True)

-<<<<<<< HEAD
-    for name, linear in model.named_modules():
-        if not isinstance(linear, torch.nn.Linear):
-            continue
-
-        for ignore_pattern in ignore_patterns:
-            regex_prefix = "re:"
-            if ignore_pattern.startswith(regex_prefix):
-                # check if name matches regex and add to set if true
-                regex_pattern = ignore_pattern[len(regex_prefix):]
-                if re.search(regex_pattern, name):
-                    ignored_layers.add(name)
-            else:
-                # else, exact match
-                if ignore_pattern == name:
-                    ignored_layers.add(name)
-
-    return list(ignored_layers)
-
-
-<<<<<<< HEAD
-<<<<<<< HEAD
-def get_kv_cache_quant_layers(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
-    kv_cache_quant_layers = []
-=======
-def get_kv_cache_quant_layer(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
-    kv_cache_quant_layers = set()
->>>>>>> 3ee9283 (Support calibrating kv cache scales)
-=======
-def get_kv_cache_quant_layers(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
-    kv_cache_quant_layers = []
->>>>>>> c3acdee (Switch from output_scale to kv_scale)
-
-    for name, linear in model.named_modules():
-        if not isinstance(linear, torch.nn.Linear):
-            continue
-
-        for output_quant_target in kv_cache_quant_targets:
-            if name.endswith(output_quant_target):
-<<<<<<< HEAD
-<<<<<<< HEAD
-                kv_cache_quant_layers.append(name)
-
-    return kv_cache_quant_layers
-=======
-                kv_cache_quant_layers.add(name)
-
-    return list(kv_cache_quant_layers)
->>>>>>> 3ee9283 (Support calibrating kv cache scales)
-=======
-                kv_cache_quant_layers.append(name)
-
-    return kv_cache_quant_layers
->>>>>>> c3acdee (Switch from output_scale to kv_scale)
-=======
-    def save_pretrained(self, save_directory: str, save_compressed: bool = True):
+    def save_pretrained(self, save_directory: str, save_compressed: bool = True):
        self.model.save_pretrained(save_directory, save_compressed=save_compressed)
        tokenizer = AutoTokenizer.from_pretrained(self.model.config._name_or_path)
        tokenizer.save_pretrained(save_directory)
-        print(f"Saved final checkpoint to {os.path.abspath(save_directory)}")
->>>>>>> ba7d420 (Switch backend to use llm-compressor)
+        print(f"Saved final checkpoint to {os.path.abspath(save_directory)}")
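For reference, a minimal usage sketch of the API this diff settles on. It is not part of the change: the auto_fp8 import path, the placeholder model name, the calibration preprocessing (a small tokenized Hugging Face Dataset), and activation_scheme being a BaseQuantizeConfig constructor argument are assumptions for illustration only.

# Minimal sketch, not part of the diff: model choice, calibration texts, and
# the auto_fp8 import path are assumptions for illustration.
from datasets import Dataset
from transformers import AutoTokenizer

from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig

model_id = "facebook/opt-125m"  # placeholder model
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Tiny tokenized calibration set; a real run would use a few hundred
# representative samples.
texts = ["The quick brown fox jumps over the lazy dog."] * 8
calibration = Dataset.from_dict(dict(tokenizer(texts)))

quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static")
model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config)
model.quantize(calibration)           # applies the FP8 QuantizationModifier via oneshot()
model.save_quantized("opt-125m-fp8")  # writes compressed weights and the tokenizer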