
Commit b428604 ("Format")

1 parent 7546f76, commit b428604

File tree: 1 file changed (+21, -142 lines)


auto_fp8/modeling.py (+21, -142)
@@ -7,6 +7,7 @@
 from llmcompressor.transformers import oneshot
 from llmcompressor.modifiers.quantization import QuantizationModifier
 
+
 class BaseQuantizeConfig:
     """Configuration for model quantization.
@@ -24,6 +25,7 @@ class BaseQuantizeConfig:
         By default, "lm_head" is included to ignore the embedding
         Linear layer usually at the end of decoder LLMs
     """
+
     def __init__(
         self,
         quant_method: str = "fp8",
@@ -36,108 +38,41 @@ def __init__(
 
 
 class AutoFP8ForCausalLM:
-    def __init__(self, model: SparseAutoModelForCausalLM, quantize_config: BaseQuantizeConfig):
+    def __init__(
+        self, model: SparseAutoModelForCausalLM, quantize_config: BaseQuantizeConfig
+    ):
         self.model = model
         self.model_type = self.model.config.model_type
         self.config = self.model.config
-<<<<<<< HEAD
-
-        # Gather the Linear module names that we want to ignore
-        quantize_config.ignored_layers = get_layers_to_ignore(
-            self.model, quantize_config.ignore_patterns
-        )
-
-        if quantize_config.kv_cache_quant_targets:
-<<<<<<< HEAD
-<<<<<<< HEAD
-            kv_cache_quant_layers = get_kv_cache_quant_layers(
-=======
-            kv_cache_quant_layers = get_kv_cache_quant_layer(
->>>>>>> 3ee9283 (Support calibrating kv cache scales)
-=======
-            kv_cache_quant_layers = get_kv_cache_quant_layers(
->>>>>>> c3acdee (Switch from output_scale to kv_scale)
-                self.model, quantize_config.kv_cache_quant_targets
-            )
-            if len(kv_cache_quant_layers) == 0:
-                raise ValueError(
-                    f"Could not find any kv cache layers using kv_cache_quant_targets={quantize_config.kv_cache_quant_targets}, please fix your argument."
-                )
-            quantize_config.kv_cache_quant_layers = kv_cache_quant_layers
-
-=======
->>>>>>> ba7d420 (Switch backend to use llm-compressor)
         self.quantize_config = quantize_config
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: str, quantize_config: BaseQuantizeConfig, **kwargs):
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: str,
+        quantize_config: BaseQuantizeConfig,
+        **kwargs,
+    ):
         config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
         model = SparseAutoModelForCausalLM.from_pretrained(
             pretrained_model_name_or_path,
             config=config,
             device_map="auto",
             torch_dtype="auto",
-            **kwargs
+            **kwargs,
         )
         return cls(model, quantize_config)
 
-<<<<<<< HEAD
-    def quantize(self, calibration_tokens: Optional[torch.Tensor] = None):
-<<<<<<< HEAD
-<<<<<<< HEAD
-=======
-        def _prepare_calibration_data(calibration_tokens):
-            if hasattr(calibration_tokens, "input_ids"):
-                return calibration_tokens.input_ids
-            return calibration_tokens
->>>>>>> 3ee9283 (Support calibrating kv cache scales)
-=======
->>>>>>> 2739d61 (Add Qwen test)
-
-        # Always quantize the weights as they do not require calibration data
-        quantize_weights(self.model, self.quantize_config)
-
-        if self.quantize_config.activation_scheme == "static":
-            assert (
-                calibration_tokens is not None
-            ), "Calibration tokens required for activation quantization"
-<<<<<<< HEAD
-<<<<<<< HEAD
-=======
->>>>>>> 2739d61 (Add Qwen test)
-
-
-        def _prepare_calibration_data(calibration_tokens):
-            if hasattr(calibration_tokens, "input_ids"):
-                return calibration_tokens.input_ids
-            return calibration_tokens
-
-<<<<<<< HEAD
-=======
->>>>>>> 3ee9283 (Support calibrating kv cache scales)
-=======
->>>>>>> 2739d61 (Add Qwen test)
-        quantize_activations(
-            self.model,
-            self.quantize_config,
-            _prepare_calibration_data(calibration_tokens),
-        )
-
-    def save_quantized(self, save_dir):
-        save_quantized_model(
-            self.model,
-            quant_config=self.quantize_config,
-            save_dir=save_dir,
-=======
     def quantize(self, dataset: Optional[Dataset] = None):
-        assert self.quantize_config.activation_scheme == "static"
-        assert dataset is not None, "Calibration tokens required for static activation quantization"
+        assert (
+            self.quantize_config.activation_scheme == "static"
+        ), "Dynamic isn't supported yet"
+        assert (
+            dataset is not None
+        ), "Calibration tokens required for static activation quantization"
 
         recipe = QuantizationModifier(
-            targets="Linear",
-            scheme="FP8",
-            ignore=self.quantize_config.ignore_patterns
->>>>>>> ba7d420 (Switch backend to use llm-compressor)
+            targets="Linear", scheme="FP8", ignore=self.quantize_config.ignore_patterns
         )
 
         oneshot(
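
The hunk above ends at the `oneshot(` call, so its arguments are not visible here. Below is a minimal sketch of how the new recipe-plus-oneshot path fits together; the standalone-function framing and the `oneshot` keyword arguments are assumptions, while the `QuantizationModifier(targets="Linear", scheme="FP8", ignore=...)` line is taken verbatim from the added code.

    from datasets import Dataset
    from llmcompressor.modifiers.quantization import QuantizationModifier
    from llmcompressor.transformers import oneshot


    def run_fp8_oneshot(model, dataset: Dataset, ignore_patterns) -> None:
        # Build the FP8 recipe exactly as the new quantize() does.
        recipe = QuantizationModifier(
            targets="Linear", scheme="FP8", ignore=ignore_patterns
        )
        # Run one-shot calibration on the dataset and attach FP8 scales in place.
        # These keyword arguments are assumed; the real call is cut off by the hunk boundary.
        oneshot(model=model, dataset=dataset, recipe=recipe)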
@@ -149,64 +84,8 @@ def quantize(self, dataset: Optional[Dataset] = None):
     def save_quantized(self, save_directory: str):
         self.save_pretrained(save_directory, save_compressed=True)
 
-<<<<<<< HEAD
-    for name, linear in model.named_modules():
-        if not isinstance(linear, torch.nn.Linear):
-            continue
-
-        for ignore_pattern in ignore_patterns:
-            regex_prefix = "re:"
-            if ignore_pattern.startswith(regex_prefix):
-                # check if name matches regex and add to set if true
-                regex_pattern = ignore_pattern[len(regex_prefix) :]
-                if re.search(regex_pattern, name):
-                    ignored_layers.add(name)
-            else:
-                # else, exact match
-                if ignore_pattern == name:
-                    ignored_layers.add(name)
-
-    return list(ignored_layers)
-
-
-<<<<<<< HEAD
-<<<<<<< HEAD
-def get_kv_cache_quant_layers(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
-    kv_cache_quant_layers = []
-=======
-def get_kv_cache_quant_layer(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
-    kv_cache_quant_layers = set()
->>>>>>> 3ee9283 (Support calibrating kv cache scales)
-=======
-def get_kv_cache_quant_layers(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
-    kv_cache_quant_layers = []
->>>>>>> c3acdee (Switch from output_scale to kv_scale)
-
-    for name, linear in model.named_modules():
-        if not isinstance(linear, torch.nn.Linear):
-            continue
-
-        for output_quant_target in kv_cache_quant_targets:
-            if name.endswith(output_quant_target):
-<<<<<<< HEAD
-<<<<<<< HEAD
-                kv_cache_quant_layers.append(name)
-
-    return kv_cache_quant_layers
-=======
-                kv_cache_quant_layers.add(name)
-
-    return list(kv_cache_quant_layers)
->>>>>>> 3ee9283 (Support calibrating kv cache scales)
-=======
-                kv_cache_quant_layers.append(name)
-
-    return kv_cache_quant_layers
->>>>>>> c3acdee (Switch from output_scale to kv_scale)
-=======
-    def save_pretrained(self, save_directory: str, save_compressed: bool = True):
+    def save_pretrained(self, save_directory: str, save_compressed: bool = True):
         self.model.save_pretrained(save_directory, save_compressed=save_compressed)
         tokenizer = AutoTokenizer.from_pretrained(self.model.config._name_or_path)
         tokenizer.save_pretrained(save_directory)
-        print(f"Saved final checkpoint to {os.path.abspath(save_directory)}")
->>>>>>> ba7d420 (Switch backend to use llm-compressor)
+        print(f"Saved final checkpoint to {os.path.abspath(save_directory)}")

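Put together, the post-commit flow reads roughly as the sketch below. `BaseQuantizeConfig`, `from_pretrained`, `quantize(dataset)`, and `save_quantized` come from the diff; the import path, the model id, the config keyword arguments, and the calibration dataset are placeholder assumptions, and the exact dataset preprocessing `oneshot` expects is outside this change.

    from datasets import load_dataset

    from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig  # import path assumed

    # Placeholder model id and output directory for illustration only.
    pretrained_id = "facebook/opt-125m"
    output_dir = "opt-125m-fp8"

    quantize_config = BaseQuantizeConfig(
        quant_method="fp8",
        activation_scheme="static",   # quantize() asserts the static scheme
        ignore_patterns=["lm_head"],  # documented default; forwarded to the recipe's ignore=
    )

    model = AutoFP8ForCausalLM.from_pretrained(pretrained_id, quantize_config)

    # Placeholder calibration data; any small text dataset works for a smoke test.
    calib = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft[:64]")

    model.quantize(calib)
    model.save_quantized(output_dir)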