From 9f0e0ac8f80fe65cf3d4abe63ceede537bf1b291 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Thu, 14 Aug 2025 22:13:15 +0000 Subject: [PATCH 01/27] match_named_modules, add observer on_start instead of on_initialize Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/awq/mappings.py | 1 + .../modifiers/quantization/quantization/base.py | 12 ++++++++---- .../modifiers/quantization/quantization/mixin.py | 14 +++++++------- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/llmcompressor/modifiers/awq/mappings.py b/src/llmcompressor/modifiers/awq/mappings.py index d446dd324..ba9cd122e 100644 --- a/src/llmcompressor/modifiers/awq/mappings.py +++ b/src/llmcompressor/modifiers/awq/mappings.py @@ -157,6 +157,7 @@ class AWQMapping: "Phi3ForCausalLM": _phi_mappings, "Phi3VForCausalLM": _phi_mappings, "Qwen2ForCausalLM": _default_mappings, + "Qwen2_5OmniThinkerForConditionalGeneration": _default_mappings, "Qwen2MoeForCausalLM": _moe_default_mappings, "Qwen3ForCausalLM": _default_mappings, "Qwen3MoeForCausalLM": _moe_default_mappings, diff --git a/src/llmcompressor/modifiers/quantization/quantization/base.py b/src/llmcompressor/modifiers/quantization/quantization/base.py index 07332f214..4baf28e2c 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/base.py +++ b/src/llmcompressor/modifiers/quantization/quantization/base.py @@ -1,5 +1,7 @@ import tqdm - +from compressed_tensors.utils import ( + match_named_modules, +) from llmcompressor.core import Event, EventType, State from llmcompressor.modifiers import Modifier from llmcompressor.modifiers.quantization.calibration import ( @@ -69,14 +71,16 @@ def on_start(self, state: State, event: Event, **kwargs): self.started_ = True QuantizationMixin.start_calibration(self, state.model) - modules = list(state.model.modules()) + named_modules = list( + match_named_modules(state.model, self.targets, self.ignore) + ) # TODO: this step can be combined with update_weight_zp_scale # once update_fused_layer_weight_global_scales is removed # and not required by vLLM - for module in tqdm.tqdm(modules): + for name, module in tqdm.tqdm(named_modules): update_weight_global_scale(module) - for module in tqdm.tqdm(modules, desc="Calibrating weights"): + for name, module in tqdm.tqdm(named_modules, desc="Calibrating weights"): update_fused_layer_weight_global_scales(module) update_weight_zp_scale(module) diff --git a/src/llmcompressor/modifiers/quantization/quantization/mixin.py b/src/llmcompressor/modifiers/quantization/quantization/mixin.py index d193d85a1..5961a0212 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/mixin.py +++ b/src/llmcompressor/modifiers/quantization/quantization/mixin.py @@ -116,7 +116,7 @@ def validate_scheme( def initialize_quantization(self, model: torch.nn.Module): """ - Attach quantization schemes and observers to modules in the model according to + Attach quantization schemes to modules in the model according to the quantization config specified on this modifier :param model: model to attach schemes and observers to @@ -127,25 +127,25 @@ def initialize_quantization(self, model: torch.nn.Module): config = self.resolve_quantization_config() apply_quantization_config(model, config) - # apply observers, disable quantization until calibration - model.apply(self._initialize_observers) + # disable quantization until calibration model.apply(disable_quantization) def start_calibration(self, model: torch.nn.Module): """ - Register activation calibration hooks (including kv_cache 
quantization) and - enable quantization as we calibrate + Attach observers, register activation calibration hooks (including + kv_cache quantization) and enable quantization as we calibrate :param model: model to prepare for calibration """ self._calibration_hooks = self._initialize_hooks(model) + model.apply(self._initialize_observers) model.apply(apply_calibration_status) model.apply(enable_quantization) # quantize at the same time as calibrate def end_calibration(self, model: torch.nn.Module): """ - Remove calibration hooks and set the model status to frozen. Keep quantization - enabled for future operations + Remove calibration hooks and observers, and set the model status to frozen. + Keep quantization enabled for future operations :param model: model to end calibration for """ From 14486af105afbf5596e6bd8040ab06ff885f5c30 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Wed, 20 Aug 2025 21:59:33 +0000 Subject: [PATCH 02/27] scoped quant status/config Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/awq/base.py | 6 ++++-- .../quantization/quantization/base.py | 9 ++++---- .../quantization/quantization/mixin.py | 21 +++++++++++++------ 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py index 5c187c17e..9be4146be 100644 --- a/src/llmcompressor/modifiers/awq/base.py +++ b/src/llmcompressor/modifiers/awq/base.py @@ -265,8 +265,10 @@ def on_end(self, state: State, event: Event, **kwargs): self.ended_ = True - modules = list(state.model.modules()) - for module in tqdm(modules, desc="Calibrating weights"): + for _, module in tqdm( + match_named_modules(state.model, self.targets, self.ignore), + desc="Calibrating weights", + ): update_weight_zp_scale(module) QuantizationMixin.end_calibration(self, state.model) diff --git a/src/llmcompressor/modifiers/quantization/quantization/base.py b/src/llmcompressor/modifiers/quantization/quantization/base.py index 4baf28e2c..aa6208da4 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/base.py +++ b/src/llmcompressor/modifiers/quantization/quantization/base.py @@ -1,7 +1,6 @@ import tqdm -from compressed_tensors.utils import ( - match_named_modules, -) +from compressed_tensors.utils import match_named_modules + from llmcompressor.core import Event, EventType, State from llmcompressor.modifiers import Modifier from llmcompressor.modifiers.quantization.calibration import ( @@ -77,10 +76,10 @@ def on_start(self, state: State, event: Event, **kwargs): # TODO: this step can be combined with update_weight_zp_scale # once update_fused_layer_weight_global_scales is removed # and not required by vLLM - for name, module in tqdm.tqdm(named_modules): + for _, module in tqdm.tqdm(named_modules): update_weight_global_scale(module) - for name, module in tqdm.tqdm(named_modules, desc="Calibrating weights"): + for _, module in tqdm.tqdm(named_modules, desc="Calibrating weights"): update_fused_layer_weight_global_scales(module) update_weight_zp_scale(module) diff --git a/src/llmcompressor/modifiers/quantization/quantization/mixin.py b/src/llmcompressor/modifiers/quantization/quantization/mixin.py index 5961a0212..e8d5cd931 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/mixin.py +++ b/src/llmcompressor/modifiers/quantization/quantization/mixin.py @@ -14,6 +14,7 @@ is_preset_scheme, preset_name_to_scheme, ) +from compressed_tensors.utils import match_named_modules from pydantic import Field, PrivateAttr, field_validator from 
torch.utils.hooks import RemovableHandle @@ -121,12 +122,15 @@ def initialize_quantization(self, model: torch.nn.Module): :param model: model to attach schemes and observers to """ - reset_quantization_status(model) # reset any previously applied qconfigs - # apply scheme and status to model config = self.resolve_quantization_config() + + for _, module in match_named_modules(model, self.targets, self.ignore): + reset_quantization_status(module) # reset any previously applied qconfigs + apply_quantization_config(model, config) + # TODO should we disable for entire model or just matching modules? # disable quantization until calibration model.apply(disable_quantization) @@ -138,8 +142,11 @@ def start_calibration(self, model: torch.nn.Module): :param model: model to prepare for calibration """ self._calibration_hooks = self._initialize_hooks(model) - model.apply(self._initialize_observers) - model.apply(apply_calibration_status) + for _, module in match_named_modules(model, self.targets, self.ignore): + self._initialize_observers(module) + apply_calibration_status(module) + + # TODO should we disable for entire model or just matching modules? model.apply(enable_quantization) # quantize at the same time as calibrate def end_calibration(self, model: torch.nn.Module): @@ -150,7 +157,9 @@ def end_calibration(self, model: torch.nn.Module): :param model: model to end calibration for """ self.remove_hooks(self._calibration_hooks) - model.apply(freeze_module_quantization) # remove observers + for _, module in match_named_modules(model, self.targets, self.ignore): + freeze_module_quantization(module) # remove observers + model.apply(enable_quantization) # keep quantization enabled def has_config(self) -> bool: @@ -240,7 +249,7 @@ def _initialize_observers(self, module: torch.nn.Module): def _initialize_hooks(self, model: torch.nn.Module) -> Set[RemovableHandle]: hooks = set() - for module in model.modules(): + for _, module in match_named_modules(model, self.targets, self.ignore): if not hasattr(module, "quantization_scheme"): continue From ff5067a919114f69377f3dd7266a1cc1f5ce779d Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Thu, 21 Aug 2025 20:04:07 +0000 Subject: [PATCH 03/27] scoped GPTQModifier Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/quantization/gptq/base.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py index 13b5a5411..98961048b 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/base.py +++ b/src/llmcompressor/modifiers/quantization/gptq/base.py @@ -10,6 +10,7 @@ get_execution_device, getattr_chain, update_offload_parameter, + match_named_modules, ) from loguru import logger from pydantic import PrivateAttr, field_validator @@ -165,7 +166,10 @@ def on_initialize(self, state: State, **kwargs) -> bool: QuantizationMixin.initialize_quantization(self, state.model) # prepare module names - self._module_names = {m: name for name, m in state.model.named_modules()} + self._module_names = { + m: name + for name, m in match_named_modules(state.model, self.targets, self.ignore) + } return True @@ -178,7 +182,7 @@ def on_start(self, state: State, event: Event, **kwargs): # register gptq hooks added_hook = False - for module in state.model.modules(): + for _, module in match_named_modules(state.model, self.targets, self.ignore): if getattr_chain(module, "quantization_scheme.weights", None) is not None: # HACK: previously, embeddings 
were not quantized because they were not # accessible by the layer compressor. For now, we manually ignore it, From f99db2f804c2d27254dc02506a72842e40aaf83f Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Thu, 21 Aug 2025 20:24:37 +0000 Subject: [PATCH 04/27] style fixes Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/quantization/gptq/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py index 98961048b..c108d987c 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/base.py +++ b/src/llmcompressor/modifiers/quantization/gptq/base.py @@ -9,8 +9,8 @@ align_module_device, get_execution_device, getattr_chain, - update_offload_parameter, match_named_modules, + update_offload_parameter, ) from loguru import logger from pydantic import PrivateAttr, field_validator From 5da7b6d1c7315af45fe9743445bad387142e416d Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Mon, 15 Sep 2025 20:30:51 +0000 Subject: [PATCH 05/27] multi-modifier example Signed-off-by: Brian Dellabetta --- examples/multi_modifier/README.md | 8 ++ examples/multi_modifier/llama3_example.py | 101 ++++++++++++++++++ .../modifiers/quantization/gptq/base.py | 19 ++-- 3 files changed, 120 insertions(+), 8 deletions(-) create mode 100644 examples/multi_modifier/README.md create mode 100644 examples/multi_modifier/llama3_example.py diff --git a/examples/multi_modifier/README.md b/examples/multi_modifier/README.md new file mode 100644 index 000000000..7c1a48c1a --- /dev/null +++ b/examples/multi_modifier/README.md @@ -0,0 +1,8 @@ +# Quantizing Models with Multiple Quantization Modifiers # + +This section outlines how multiple quantization modifiers can be applied to the same model, for example applying AWQ W4A16 to a model's `self_attn` layers and GPTQ W8A8 to its `mlp` layers. The heterogeneous application of multiple modifiers comes in 2 flavors: + +1. Run every modifier in a single, sequential pipeline, performing a single calibrated run. See `./llama3_example.py` for an example. +2. Run each modifier in its own, independent pipeline, performing a calibrated run for each modifier. To run each modifier independently, run `./llama3_example.py` with `oneshot(..., pipeline="independent")` instead of `pipeline="sequential"`. + +This is an advanced usage of `llm-compressor` and an active area of research. Best practices will be provided in a future release, after further research and sensitivity analysis. \ No newline at end of file diff --git a/examples/multi_modifier/llama3_example.py b/examples/multi_modifier/llama3_example.py new file mode 100644 index 000000000..9c278cd3b --- /dev/null +++ b/examples/multi_modifier/llama3_example.py @@ -0,0 +1,101 @@ +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.modifiers.awq import AWQModifier, AWQMapping +from llmcompressor.utils import dispatch_for_generation + +# Select model and load it. +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" + +# Select number of samples. 512 samples is a good place to start. 
+# Increasing the number of samples can improve accuracy. +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# Configure the quantization algorithm to run. +# * quantize self_attn layers to W8A8 with GPTQ +# * quantize mlp layers to W4A16 with AWQ +# only include mappings pertaining to target layers +recipe = [ + GPTQModifier(targets=r"re:.*self_attn\.(k|q|o|v)_proj$", scheme="W8A8"), + AWQModifier( + targets=r"re:.*mlp\.(down|gate|up)_proj$", + mappings=[ + AWQMapping( + "re:.*post_attention_layernorm$", + ["re:.*gate_proj$", "re:.*up_proj$"], + ), + AWQMapping( + "re:.*up_proj$", + ["re:.*down_proj$"], + ), + ], + scheme="W4A16", + ), +] + +# Apply algorithms. +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + # Option 1) run both modifiers in a single calibrated run + pipeline="sequential", + # Option 2) run each modifier in its own separate pipeline + # pipeline="independent", +) + +# Confirm generations of the quantized model look sane. +print("\n\n") +print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) +sample = tokenizer("Hello my name is", return_tensors="pt") +sample = {key: value.to(model.device) for key, value in sample.items()} +output = model.generate(**sample, max_new_tokens=100) +print(tokenizer.decode(output[0])) +print("==========================================\n\n") + +# Save to disk compressed. +SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py index c108d987c..fb64d749c 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/base.py +++ b/src/llmcompressor/modifiers/quantization/gptq/base.py @@ -182,14 +182,17 @@ def on_start(self, state: State, event: Event, **kwargs): # register gptq hooks added_hook = False - for _, module in match_named_modules(state.model, self.targets, self.ignore): - if getattr_chain(module, "quantization_scheme.weights", None) is not None: - # HACK: previously, embeddings were not quantized because they were not - # accessible by the layer compressor. For now, we manually ignore it, - # but in the FUTURE this should be ignored by the user - if not isinstance(module, torch.nn.Embedding): - self.register_hook(module, self.calibrate_module, "forward") - added_hook = True + for name, module in match_named_modules(state.model, self.targets, self.ignore): + assert ( + getattr_chain(module, "quantization_scheme.weights", None) is not None + ), "" + + # HACK: previously, embeddings were not quantized because they were not + # accessible by the layer compressor. 
For now, we manually ignore it, + # but in the FUTURE this should be ignored by the user + if not isinstance(module, torch.nn.Embedding): + self.register_hook(module, self.calibrate_module, "forward") + added_hook = True if not added_hook: raise ValueError( From 32ad8dccf52b9aaf47f86c18e628e76cbc829653 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Mon, 15 Sep 2025 20:37:08 +0000 Subject: [PATCH 06/27] revert assert check in GPTQ Signed-off-by: Brian Dellabetta --- .../modifiers/quantization/gptq/base.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py index fb64d749c..c108d987c 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/base.py +++ b/src/llmcompressor/modifiers/quantization/gptq/base.py @@ -182,17 +182,14 @@ def on_start(self, state: State, event: Event, **kwargs): # register gptq hooks added_hook = False - for name, module in match_named_modules(state.model, self.targets, self.ignore): - assert ( - getattr_chain(module, "quantization_scheme.weights", None) is not None - ), "" - - # HACK: previously, embeddings were not quantized because they were not - # accessible by the layer compressor. For now, we manually ignore it, - # but in the FUTURE this should be ignored by the user - if not isinstance(module, torch.nn.Embedding): - self.register_hook(module, self.calibrate_module, "forward") - added_hook = True + for _, module in match_named_modules(state.model, self.targets, self.ignore): + if getattr_chain(module, "quantization_scheme.weights", None) is not None: + # HACK: previously, embeddings were not quantized because they were not + # accessible by the layer compressor. For now, we manually ignore it, + # but in the FUTURE this should be ignored by the user + if not isinstance(module, torch.nn.Embedding): + self.register_hook(module, self.calibrate_module, "forward") + added_hook = True if not added_hook: raise ValueError( From 4db397bbfb5d90af295b9ced8d1d739c6e7ea218 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Mon, 15 Sep 2025 20:39:33 +0000 Subject: [PATCH 07/27] stylefix examples Signed-off-by: Brian Dellabetta --- examples/multi_modifier/llama3_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multi_modifier/llama3_example.py b/examples/multi_modifier/llama3_example.py index 9c278cd3b..2b1be3d7a 100644 --- a/examples/multi_modifier/llama3_example.py +++ b/examples/multi_modifier/llama3_example.py @@ -2,8 +2,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot +from llmcompressor.modifiers.awq import AWQMapping, AWQModifier from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.modifiers.awq import AWQModifier, AWQMapping from llmcompressor.utils import dispatch_for_generation # Select model and load it. 
From 64f8f39b285798bb6bde4de7a32c9ac91bf5700b Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Wed, 17 Sep 2025 20:42:25 +0000 Subject: [PATCH 08/27] KVCacheScaleType import update Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/quantization/cache.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/cache.py b/src/llmcompressor/modifiers/quantization/cache.py index dd3640dda..06a602a1e 100644 --- a/src/llmcompressor/modifiers/quantization/cache.py +++ b/src/llmcompressor/modifiers/quantization/cache.py @@ -1,7 +1,6 @@ from typing import Any, Dict, List, Optional, Tuple -from compressed_tensors.quantization.lifecycle import KVCacheScaleType -from compressed_tensors.quantization.quant_args import QuantizationArgs +from compressed_tensors.quantization import KVCacheScaleType, QuantizationArgs from torch import Tensor from transformers import DynamicCache From 1d3eceb641fb30018d068cc054e7b35b7c1d3b5f Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Thu, 18 Sep 2025 16:56:03 +0000 Subject: [PATCH 09/27] codereview multi_modifier -> mixed_precision Signed-off-by: Brian Dellabetta --- examples/{multi_modifier => mixed_precision}/README.md | 4 ++-- .../{multi_modifier => mixed_precision}/llama3_example.py | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename examples/{multi_modifier => mixed_precision}/README.md (67%) rename examples/{multi_modifier => mixed_precision}/llama3_example.py (100%) diff --git a/examples/multi_modifier/README.md b/examples/mixed_precision/README.md similarity index 67% rename from examples/multi_modifier/README.md rename to examples/mixed_precision/README.md index 7c1a48c1a..3cf9cd308 100644 --- a/examples/multi_modifier/README.md +++ b/examples/mixed_precision/README.md @@ -1,6 +1,6 @@ -# Quantizing Models with Multiple Quantization Modifiers # +# Quantizing Mixed-Precision Models with Multiple Quantization Modifiers # -This section outlines how multiple quantization modifiers can be applied to the same model, for example applying AWQ W4A16 to a model's `self_attn` layers and GPTQ W8A8 to its `mlp` layers. The heterogeneous application of multiple modifiers comes in 2 flavors: +This section outlines how multiple quantization modifiers can be applied to the same model for mixed-precision quantization, for example applying AWQ W4A16 to a model's `self_attn` layers and GPTQ W8A8 to its `mlp` layers. This heterogeneous application of multiple modifiers comes in 2 flavors: 1. Run every modifier in a single, sequential pipeline, performing a single calibrated run. See `./llama3_example.py` for an example. 2. Run each modifier in its own, independent pipeline, performing a calibrated run for each modifier. To run each modifier independently, run `./llama3_example.py` with `oneshot(..., pipeline="independent")` instead of `pipeline="sequential"`. 
diff --git a/examples/multi_modifier/llama3_example.py b/examples/mixed_precision/llama3_example.py similarity index 100% rename from examples/multi_modifier/llama3_example.py rename to examples/mixed_precision/llama3_example.py From 75c7ca6ef185b6c00880f7b9f8e5d7832a2960e3 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Thu, 18 Sep 2025 16:57:23 +0000 Subject: [PATCH 10/27] saved model name Signed-off-by: Brian Dellabetta --- examples/mixed_precision/llama3_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/mixed_precision/llama3_example.py b/examples/mixed_precision/llama3_example.py index 2b1be3d7a..d4b8d9175 100644 --- a/examples/mixed_precision/llama3_example.py +++ b/examples/mixed_precision/llama3_example.py @@ -96,6 +96,6 @@ def tokenize(sample): print("==========================================\n\n") # Save to disk compressed. -SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128" +SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-gptq-w8a8-self_attn-awq-w4a16-mlp" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) From 81cf4a1e5dbd83444606d4cc6f3231bfff2e2496 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Thu, 18 Sep 2025 17:06:35 +0000 Subject: [PATCH 11/27] GPTQ validation layer Signed-off-by: Brian Dellabetta --- .../modifiers/quantization/gptq/base.py | 35 ++++++++++++++----- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py index 584f17643..61c777473 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/base.py +++ b/src/llmcompressor/modifiers/quantization/gptq/base.py @@ -17,7 +17,7 @@ update_offload_parameter, ) from loguru import logger -from pydantic import PrivateAttr, field_validator +from pydantic import PrivateAttr, field_validator, model_validator from llmcompressor.core import Event, EventType, State from llmcompressor.modifiers import Modifier @@ -121,6 +121,20 @@ class GPTQModifier(Modifier, QuantizationMixin): _hessians: Dict[torch.nn.Module, torch.Tensor] = PrivateAttr(default_factory=dict) _num_samples: Dict[torch.nn.Module, int] = PrivateAttr(default_factory=dict) + @model_validator(mode="after") + def validate_model_after(model: "GPTQModifier") -> "GPTQModifier": + """ + Confirm weight quantization exists in each scheme, otherwise + assert will fail when registering GPTQ hooks + """ + config = model.resolve_quantization_config() + for config_group in config.config_groups.values(): + assert ( + config_group.weights is not None + ), "In GPTQ, all config groups must include weight quantization" + + return model + def resolve_quantization_config(self) -> QuantizationConfig: config = super().resolve_quantization_config() @@ -178,14 +192,17 @@ def on_start(self, state: State, event: Event, **kwargs): # register gptq hooks added_hook = False - for _, module in match_named_modules(state.model, self.targets, self.ignore): - if getattr_chain(module, "quantization_scheme.weights", None) is not None: - # HACK: previously, embeddings were not quantized because they were not - # accessible by the layer compressor. 
For now, we manually ignore it, - # but in the FUTURE this should be ignored by the user - if not isinstance(module, torch.nn.Embedding): - self.register_hook(module, self.calibrate_module, "forward") - added_hook = True + for name, module in match_named_modules(state.model, self.targets, self.ignore): + assert ( + getattr_chain(module, "quantization_scheme.weights", None) is not None + ), f"Matched target {name} does not have associated weight quantization" + + # HACK: previously, embeddings were not quantized because they were not + # accessible by the layer compressor. For now, we manually ignore it, + # but in the FUTURE this should be ignored by the user + if not isinstance(module, torch.nn.Embedding): + self.register_hook(module, self.calibrate_module, "forward") + added_hook = True if not added_hook: raise ValueError( From af6a34bfdddb1e36114844704dce25c2a071664c Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Thu, 18 Sep 2025 17:30:12 +0000 Subject: [PATCH 12/27] test fixes Signed-off-by: Brian Dellabetta --- .../modifiers/quantization/test_base.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/llmcompressor/modifiers/quantization/test_base.py b/tests/llmcompressor/modifiers/quantization/test_base.py index 931f7deb6..ce62115fd 100644 --- a/tests/llmcompressor/modifiers/quantization/test_base.py +++ b/tests/llmcompressor/modifiers/quantization/test_base.py @@ -95,12 +95,11 @@ def test_block_strategy_parsing(block_q_config_kwargs): def test_actorder_resolution( has_actorder, actorder, q_config_kwargs, expected_0, expected_1 ): - if has_actorder: - modifier = GPTQModifier(**q_config_kwargs, actorder=actorder) - else: - modifier = GPTQModifier(**q_config_kwargs) - with pytest.raises(ValueError) if expected_0 == "error" else nullcontext(): + if has_actorder: + modifier = GPTQModifier(**q_config_kwargs, actorder=actorder) + else: + modifier = GPTQModifier(**q_config_kwargs) resolved = modifier.resolve_quantization_config() if expected_0 != "error": @@ -155,8 +154,8 @@ def test_config_resolution(strategies, actorder): ) def test_serialize_actorder(has_actorder, actorder, exp_actorder): if has_actorder: - modifier = GPTQModifier(targets=["Linear"], actorder=actorder) + modifier = GPTQModifier(targets=["Linear"], scheme="W8A8", actorder=actorder) else: - modifier = GPTQModifier(targets=["Linear"]) + modifier = GPTQModifier(targets=["Linear"], scheme="W8A8") assert modifier.model_dump()["actorder"] == exp_actorder From 2cc681f833870d510684f2aac9e3851e0f51a78c Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Thu, 18 Sep 2025 17:33:48 +0000 Subject: [PATCH 13/27] remove TODOs Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/quantization/quantization/mixin.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/quantization/mixin.py b/src/llmcompressor/modifiers/quantization/quantization/mixin.py index e8d5cd931..bfac2665d 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/mixin.py +++ b/src/llmcompressor/modifiers/quantization/quantization/mixin.py @@ -130,7 +130,6 @@ def initialize_quantization(self, model: torch.nn.Module): apply_quantization_config(model, config) - # TODO should we disable for entire model or just matching modules? 
# disable quantization until calibration model.apply(disable_quantization) @@ -146,7 +145,6 @@ def start_calibration(self, model: torch.nn.Module): self._initialize_observers(module) apply_calibration_status(module) - # TODO should we disable for entire model or just matching modules? model.apply(enable_quantization) # quantize at the same time as calibrate def end_calibration(self, model: torch.nn.Module): From 855606ec9281266c3d3685a661125b7efe0fe466 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Thu, 18 Sep 2025 20:23:42 +0000 Subject: [PATCH 14/27] revert GPTQ validation changes, fix failing transformers tests Signed-off-by: Brian Dellabetta --- .../modifiers/quantization/gptq/base.py | 35 +++++-------------- .../modifiers/quantization/test_base.py | 9 ++--- .../compression/test_compress_tensor_utils.py | 1 - 3 files changed, 14 insertions(+), 31 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py index 61c777473..584f17643 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/base.py +++ b/src/llmcompressor/modifiers/quantization/gptq/base.py @@ -17,7 +17,7 @@ update_offload_parameter, ) from loguru import logger -from pydantic import PrivateAttr, field_validator, model_validator +from pydantic import PrivateAttr, field_validator from llmcompressor.core import Event, EventType, State from llmcompressor.modifiers import Modifier @@ -121,20 +121,6 @@ class GPTQModifier(Modifier, QuantizationMixin): _hessians: Dict[torch.nn.Module, torch.Tensor] = PrivateAttr(default_factory=dict) _num_samples: Dict[torch.nn.Module, int] = PrivateAttr(default_factory=dict) - @model_validator(mode="after") - def validate_model_after(model: "GPTQModifier") -> "GPTQModifier": - """ - Confirm weight quantization exists in each scheme, otherwise - assert will fail when registering GPTQ hooks - """ - config = model.resolve_quantization_config() - for config_group in config.config_groups.values(): - assert ( - config_group.weights is not None - ), "In GPTQ, all config groups must include weight quantization" - - return model - def resolve_quantization_config(self) -> QuantizationConfig: config = super().resolve_quantization_config() @@ -192,17 +178,14 @@ def on_start(self, state: State, event: Event, **kwargs): # register gptq hooks added_hook = False - for name, module in match_named_modules(state.model, self.targets, self.ignore): - assert ( - getattr_chain(module, "quantization_scheme.weights", None) is not None - ), f"Matched target {name} does not have associated weight quantization" - - # HACK: previously, embeddings were not quantized because they were not - # accessible by the layer compressor. For now, we manually ignore it, - # but in the FUTURE this should be ignored by the user - if not isinstance(module, torch.nn.Embedding): - self.register_hook(module, self.calibrate_module, "forward") - added_hook = True + for _, module in match_named_modules(state.model, self.targets, self.ignore): + if getattr_chain(module, "quantization_scheme.weights", None) is not None: + # HACK: previously, embeddings were not quantized because they were not + # accessible by the layer compressor. 
For now, we manually ignore it, + # but in the FUTURE this should be ignored by the user + if not isinstance(module, torch.nn.Embedding): + self.register_hook(module, self.calibrate_module, "forward") + added_hook = True if not added_hook: raise ValueError( diff --git a/tests/llmcompressor/modifiers/quantization/test_base.py b/tests/llmcompressor/modifiers/quantization/test_base.py index ce62115fd..a9d8ea9b7 100644 --- a/tests/llmcompressor/modifiers/quantization/test_base.py +++ b/tests/llmcompressor/modifiers/quantization/test_base.py @@ -95,11 +95,12 @@ def test_block_strategy_parsing(block_q_config_kwargs): def test_actorder_resolution( has_actorder, actorder, q_config_kwargs, expected_0, expected_1 ): + if has_actorder: + modifier = GPTQModifier(**q_config_kwargs, actorder=actorder) + else: + modifier = GPTQModifier(**q_config_kwargs) + with pytest.raises(ValueError) if expected_0 == "error" else nullcontext(): - if has_actorder: - modifier = GPTQModifier(**q_config_kwargs, actorder=actorder) - else: - modifier = GPTQModifier(**q_config_kwargs) resolved = modifier.resolve_quantization_config() if expected_0 != "error": diff --git a/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py b/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py index a4841a0b4..dd1abdee5 100644 --- a/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py +++ b/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py @@ -187,7 +187,6 @@ def test_quant_model_reload(format, dtype, tmp_path): for _, module in model.named_modules(): if hasattr(module, "quantization_scheme"): assert module.weight.dtype == dtype - assert module.quantization_status == QuantizationStatus.FROZEN # Save to disk model.save_pretrained( From b25d23b99de5ab12adc7b3bfa85fbd894c8d284d Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Thu, 18 Sep 2025 21:16:26 +0000 Subject: [PATCH 15/27] compresstion test fixes Signed-off-by: Brian Dellabetta --- .../transformers/compression/recipes/new_quant_weight.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml b/tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml index 67aa5df3f..127a830c3 100644 --- a/tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml +++ b/tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml @@ -14,4 +14,3 @@ test_stage: targets: ["Linear", "Embedding"] GPTQModifier: block_size: 128 - targets: ["re:model.layers.\\d+$"] \ No newline at end of file From 50fbf158bcfc56320e06c377b367799b6655f229 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Fri, 19 Sep 2025 14:10:19 +0000 Subject: [PATCH 16/27] move exampe to quantization_non_uniform Signed-off-by: Brian Dellabetta --- examples/mixed_precision/README.md | 8 -------- examples/quantization_non_uniform/README.md | 9 +++++++++ .../quantization_multiple_modifiers.py} | 0 3 files changed, 9 insertions(+), 8 deletions(-) delete mode 100644 examples/mixed_precision/README.md rename examples/{mixed_precision/llama3_example.py => quantization_non_uniform/quantization_multiple_modifiers.py} (100%) diff --git a/examples/mixed_precision/README.md b/examples/mixed_precision/README.md deleted file mode 100644 index 3cf9cd308..000000000 --- a/examples/mixed_precision/README.md +++ /dev/null @@ -1,8 +0,0 @@ -# Quantizing Mixed-Precision Models with Multiple Quantization Modifiers # - -This section outlines how 
multiple quantization modifiers can be applied to the same model for mixed-precision quantization, for example applying AWQ W4A16 to a model's `self_attn` layers and GPTQ W8A8 to its `mlp` layers. This heterogeneous application of multiple modifiers comes in 2 flavors: - -1. Run every modifier in a single, sequential pipeline, performing a single calibrated run. See `./llama3_example.py` for an example. -2. Run each modifier in its own, independent pipeline, performing a calibrated run for each modifier. To run each modifier independently, run `./llama3_example.py` with `oneshot(..., pipeline="independent")` instead of `pipeline="sequential"`. - -This is an advanced usage of `llm-compressor` and an active area of research. Best practices will be provided in a future release, after further research and sensitivity analysis. \ No newline at end of file diff --git a/examples/quantization_non_uniform/README.md b/examples/quantization_non_uniform/README.md index c2e50bdbf..f9dd5b3eb 100644 --- a/examples/quantization_non_uniform/README.md +++ b/examples/quantization_non_uniform/README.md @@ -9,3 +9,12 @@ We demonstrate mixed precision by quantizing models to both int8 and int4, and i ## Multiple Strategies It may also be interesting to quantize a model with two different [quantization strategies](https://github.com/neuralmagic/compressed-tensors/blob/a2bfc03e9d52824ba5d6d2a50c8741dd9bccd5d3/src/compressed_tensors/quantization/quant_args.py#L93) such as group, channel, or per-tensor. [Here](https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_non_uniform/quantization_fp8_multiple_strategies.py) we apply fp8 quantization where all the attention weights are quantized using the per-channel strategy, and all the mlp weights are quantized using per-tensor. This is accomplished through defining multiple config groups in the recipe. The produced model is compressed using the `float-quantized` compressor and can be directly run in vllm. + +## Quantization with Multiple Quantization Modifiers + +This section outlines how multiple quantization modifiers can be applied to the same model for mixed-precision quantization, for example applying AWQ W4A16 to a model's `self_attn` layers and GPTQ W8A8 to its `mlp` layers. This heterogeneous application of multiple modifiers comes in 2 flavors: + +1. Run every modifier in a single, sequential pipeline, performing a single calibrated run. See `./quantization_multiple_modifiers.py` for an example. +2. Run each modifier in its own, independent pipeline, performing a calibrated run for each modifier. To run each modifier independently, run `./quantization_multiple_modifiers.py` with `oneshot(..., pipeline="independent")` instead of `pipeline="sequential"`. + +This is an advanced usage of `llm-compressor` and an active area of research. Best practices will be provided in a future release, after further research and sensitivity analysis. 
\ No newline at end of file diff --git a/examples/mixed_precision/llama3_example.py b/examples/quantization_non_uniform/quantization_multiple_modifiers.py similarity index 100% rename from examples/mixed_precision/llama3_example.py rename to examples/quantization_non_uniform/quantization_multiple_modifiers.py From a0568f71854c41d60048c8e37f71bdd22a851e8c Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Fri, 19 Sep 2025 18:56:31 +0000 Subject: [PATCH 17/27] QuantizationMixin targets resolution Signed-off-by: Brian Dellabetta --- .../quantization/quantization/mixin.py | 36 ++++++++++++++++--- .../modifiers/calibration/test_frozen.py | 4 +-- .../compression/test_compress_tensor_utils.py | 27 ++++++++------ 3 files changed, 49 insertions(+), 18 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/quantization/mixin.py b/src/llmcompressor/modifiers/quantization/quantization/mixin.py index bfac2665d..91f660333 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/mixin.py +++ b/src/llmcompressor/modifiers/quantization/quantization/mixin.py @@ -15,7 +15,7 @@ preset_name_to_scheme, ) from compressed_tensors.utils import match_named_modules -from pydantic import Field, PrivateAttr, field_validator +from pydantic import Field, PrivateAttr, field_validator, model_validator from torch.utils.hooks import RemovableHandle from llmcompressor.modifiers.quantization.calibration import ( @@ -59,8 +59,9 @@ class QuantizationMixin(HooksMixin): :param config_groups: dictionary specifying quantization schemes to apply to target modules. Modules not matching a scheme target will NOT be quantized. - :param targets: list of layer names to quantize if a scheme is provided. Defaults - to Linear layers + :param targets: list of layer names to quantize if a scheme is provided. If unset, + will contain all targets listed in config_groups. If config_groups is also + unset, will default to ["Linear"] (i.e. all Linear layers will be targeted). :param ignore: optional list of module class names or submodule names to not quantize even if they match a target in config_groups. Defaults to empty list. :param scheme: a single quantization scheme to apply to the model. This is a @@ -82,7 +83,7 @@ class QuantizationMixin(HooksMixin): """ config_groups: Optional[Dict[str, QuantizationScheme]] = None - targets: Union[str, List[str]] = Field(default_factory=lambda: ["Linear"]) + targets: Optional[Union[str, List[str]]] = None ignore: List[str] = Field(default_factory=list) scheme: Optional[Union[str, Dict[str, Any]]] = None kv_cache_scheme: Optional[QuantizationArgs] = None @@ -90,7 +91,9 @@ class QuantizationMixin(HooksMixin): _calibration_hooks: Set[RemovableHandle] = PrivateAttr(default_factory=set) @field_validator("targets", mode="before") - def validate_targets(cls, value: Union[str, List[str]]) -> List[str]: + def validate_targets( + cls, value: Optional[Union[str, List[str]]] + ) -> Optional[List[str]]: if isinstance(value, str): return [value] @@ -115,6 +118,29 @@ def validate_scheme( return value + @model_validator(mode="after") + def validate_model_after(model: "QuantizationMixin") -> "QuantizationMixin": + """ + If targets has not been set but config_groups has, aggregate targets from + each config_group into a single unique list for self.targets. 
If config_groups + has not been set, default to targets=["Linear"] + """ + config = model.resolve_quantization_config() + + if model.targets is None: + if config.config_groups is not None: + targets = [] + for config_group in config.config_groups.values(): + for target in config_group.targets: + if target not in targets: + targets.append(target) + + model.targets = list(targets) + else: + model.targets = ["Linear"] + + return model + def initialize_quantization(self, model: torch.nn.Module): """ Attach quantization schemes to modules in the model according to diff --git a/tests/llmcompressor/modifiers/calibration/test_frozen.py b/tests/llmcompressor/modifiers/calibration/test_frozen.py index 4b89a0084..9a9ea6a18 100644 --- a/tests/llmcompressor/modifiers/calibration/test_frozen.py +++ b/tests/llmcompressor/modifiers/calibration/test_frozen.py @@ -37,7 +37,7 @@ def test_set_module_for_calibration(): layer = Linear(4, 4) initialize_module_for_quantization(layer, quantization_scheme) - layer.quantization_status = QuantizationStatus("calibration") + layer.quantization_status = QuantizationStatus.CALIBRATION initialize_observer(layer, "weight") # should have both input and weight observer after initalizing @@ -48,4 +48,4 @@ def test_set_module_for_calibration(): assert not hasattr(layer, "input_observer") assert not hasattr(layer, "weight_observer") - assert layer.quantization_status == QuantizationStatus("frozen") + assert layer.quantization_status == QuantizationStatus.FROZEN diff --git a/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py b/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py index dd1abdee5..dfbd5f0eb 100644 --- a/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py +++ b/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py @@ -184,9 +184,14 @@ def test_quant_model_reload(format, dtype, tmp_path): og_state_dict = model.state_dict() save_path_compressed = tmp_path / "compressed" - for _, module in model.named_modules(): + for name, module in model.named_modules(): if hasattr(module, "quantization_scheme"): - assert module.weight.dtype == dtype + assert module.weight.dtype == dtype, ( + f"Module {name} has incorrect weight dtype" + ) + assert module.quantization_status == QuantizationStatus.FROZEN, ( + f"Module {name} has incorrect quantization status" + ) # Save to disk model.save_pretrained( @@ -368,14 +373,14 @@ def test_compressor_stacking(model_stub, recipe, sparse_format, quant_format, tm model, sparsity_config=sparse_format, quantization_format=quant_format ) - assert ( - compressor.sparsity_compressor is not None - ), "Sparse compressor not initialized" + assert compressor.sparsity_compressor is not None, ( + "Sparse compressor not initialized" + ) assert compressor.sparsity_config.format == sparse_format - assert ( - compressor.quantization_compressor is not None - ), "Quantization compressor not initialized" + assert compressor.quantization_compressor is not None, ( + "Quantization compressor not initialized" + ) compressor.compress_model(model) compressor.decompress_model(model) @@ -454,9 +459,9 @@ def test_sparse_24_compressor_is_lossless(model_stub, recipe, sparse_format, tmp # the model instead compressor = ModelCompressor.from_compression_config(compression_config) - assert ( - compressor.sparsity_compressor is not None - ), "Sparse compressor not initialized" + assert compressor.sparsity_compressor is not None, ( + "Sparse compressor not initialized" + ) assert 
compressor.sparsity_config.format == sparse_format compressor.decompress(model_path=path, model=empty_model) From 5e5e0fe2f0c5f5a93dd6043ef4c072e6c42a6b35 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Fri, 19 Sep 2025 19:04:18 +0000 Subject: [PATCH 18/27] style fixes Signed-off-by: Brian Dellabetta --- .../compression/test_compress_tensor_utils.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py b/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py index dfbd5f0eb..72366fddc 100644 --- a/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py +++ b/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py @@ -186,12 +186,12 @@ def test_quant_model_reload(format, dtype, tmp_path): for name, module in model.named_modules(): if hasattr(module, "quantization_scheme"): - assert module.weight.dtype == dtype, ( - f"Module {name} has incorrect weight dtype" - ) - assert module.quantization_status == QuantizationStatus.FROZEN, ( - f"Module {name} has incorrect quantization status" - ) + assert ( + module.weight.dtype == dtype + ), f"Module {name} has incorrect weight dtype" + assert ( + module.quantization_status == QuantizationStatus.FROZEN + ), f"Module {name} has incorrect quantization status" # Save to disk model.save_pretrained( @@ -373,14 +373,14 @@ def test_compressor_stacking(model_stub, recipe, sparse_format, quant_format, tm model, sparsity_config=sparse_format, quantization_format=quant_format ) - assert compressor.sparsity_compressor is not None, ( - "Sparse compressor not initialized" - ) + assert ( + compressor.sparsity_compressor is not None + ), "Sparse compressor not initialized" assert compressor.sparsity_config.format == sparse_format - assert compressor.quantization_compressor is not None, ( - "Quantization compressor not initialized" - ) + assert ( + compressor.quantization_compressor is not None + ), "Quantization compressor not initialized" compressor.compress_model(model) compressor.decompress_model(model) @@ -459,9 +459,9 @@ def test_sparse_24_compressor_is_lossless(model_stub, recipe, sparse_format, tmp # the model instead compressor = ModelCompressor.from_compression_config(compression_config) - assert compressor.sparsity_compressor is not None, ( - "Sparse compressor not initialized" - ) + assert ( + compressor.sparsity_compressor is not None + ), "Sparse compressor not initialized" assert compressor.sparsity_config.format == sparse_format compressor.decompress(model_path=path, model=empty_model) From 6cd03504d6cd525e89ca48d5573e0a62be55ed5c Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Mon, 22 Sep 2025 16:42:18 +0000 Subject: [PATCH 19/27] quant mixin updates Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/awq/base.py | 12 ++++++++++++ .../modifiers/quantization/test_base.py | 9 ++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py index 49698fe09..cc982e23e 100644 --- a/src/llmcompressor/modifiers/awq/base.py +++ b/src/llmcompressor/modifiers/awq/base.py @@ -148,6 +148,18 @@ def validate_model_after(model: "AWQModifier") -> "AWQModifier": """ config = model.resolve_quantization_config() + if model.targets is None: + if config.config_groups is not None: + targets = [] + for config_group in config.config_groups.values(): + for target in config_group.targets: + if target 
not in targets: + targets.append(target) + + model.targets = list(targets) + else: + model.targets = ["Linear"] + num_bits_set = set( group.weights.num_bits for group in config.config_groups.values() diff --git a/tests/llmcompressor/modifiers/quantization/test_base.py b/tests/llmcompressor/modifiers/quantization/test_base.py index a9d8ea9b7..ce62115fd 100644 --- a/tests/llmcompressor/modifiers/quantization/test_base.py +++ b/tests/llmcompressor/modifiers/quantization/test_base.py @@ -95,12 +95,11 @@ def test_block_strategy_parsing(block_q_config_kwargs): def test_actorder_resolution( has_actorder, actorder, q_config_kwargs, expected_0, expected_1 ): - if has_actorder: - modifier = GPTQModifier(**q_config_kwargs, actorder=actorder) - else: - modifier = GPTQModifier(**q_config_kwargs) - with pytest.raises(ValueError) if expected_0 == "error" else nullcontext(): + if has_actorder: + modifier = GPTQModifier(**q_config_kwargs, actorder=actorder) + else: + modifier = GPTQModifier(**q_config_kwargs) resolved = modifier.resolve_quantization_config() if expected_0 != "error": From 16193372f2d3af14fabc504b942f55873699081e Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Mon, 22 Sep 2025 17:43:30 +0000 Subject: [PATCH 20/27] Quant mixin targets validation Signed-off-by: Brian Dellabetta --- .../quantization/quantization/mixin.py | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/quantization/mixin.py b/src/llmcompressor/modifiers/quantization/quantization/mixin.py index 91f660333..c0f40c759 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/mixin.py +++ b/src/llmcompressor/modifiers/quantization/quantization/mixin.py @@ -83,7 +83,7 @@ class QuantizationMixin(HooksMixin): """ config_groups: Optional[Dict[str, QuantizationScheme]] = None - targets: Optional[Union[str, List[str]]] = None + targets: Union[str, List[str]] = Field(default_factory=list) ignore: List[str] = Field(default_factory=list) scheme: Optional[Union[str, Dict[str, Any]]] = None kv_cache_scheme: Optional[QuantizationArgs] = None @@ -121,23 +121,23 @@ def validate_scheme( @model_validator(mode="after") def validate_model_after(model: "QuantizationMixin") -> "QuantizationMixin": """ - If targets has not been set but config_groups has, aggregate targets from - each config_group into a single unique list for self.targets. If config_groups - has not been set, default to targets=["Linear"] + - If targets have not been set, aggregate targets from + each config_group into a single unique list for self.targets. 
+ - If targets have still not been found, default to targets=["Linear"] """ config = model.resolve_quantization_config() - if model.targets is None: - if config.config_groups is not None: - targets = [] - for config_group in config.config_groups.values(): - for target in config_group.targets: - if target not in targets: - targets.append(target) + if len(model.targets) == 0: + targets = [] + for config_group in config.config_groups.values(): + for target in config_group.targets: + if target not in targets: + targets.append(target) - model.targets = list(targets) - else: - model.targets = ["Linear"] + if len(targets) == 0: + targets.append("Linear") + + model.targets = targets return model @@ -211,6 +211,9 @@ def resolve_quantization_config(self) -> QuantizationConfig: if scheme is not None and config_groups is not None: raise ValueError("Please specify either `scheme` or `config_groups`") + if len(targets) > 0 and config_groups is not None: + raise ValueError("Please specify either `targets` or `config_groups`") + if scheme is not None: # takes precedence over config_groups From e21a933a4d4fbb2aa78778185e4d8dff440b32a4 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Mon, 22 Sep 2025 17:46:24 +0000 Subject: [PATCH 21/27] remove extraneous awq changes Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/awq/base.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py index cc982e23e..49698fe09 100644 --- a/src/llmcompressor/modifiers/awq/base.py +++ b/src/llmcompressor/modifiers/awq/base.py @@ -148,18 +148,6 @@ def validate_model_after(model: "AWQModifier") -> "AWQModifier": """ config = model.resolve_quantization_config() - if model.targets is None: - if config.config_groups is not None: - targets = [] - for config_group in config.config_groups.values(): - for target in config_group.targets: - if target not in targets: - targets.append(target) - - model.targets = list(targets) - else: - model.targets = ["Linear"] - num_bits_set = set( group.weights.num_bits for group in config.config_groups.values() From c437c6f33a5e109480477c6aab69d9cc72baa21a Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Mon, 22 Sep 2025 17:50:08 +0000 Subject: [PATCH 22/27] move validation out of resolve quantization config Signed-off-by: Brian Dellabetta --- .../modifiers/quantization/quantization/mixin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/quantization/mixin.py b/src/llmcompressor/modifiers/quantization/quantization/mixin.py index c0f40c759..65fa8608b 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/mixin.py +++ b/src/llmcompressor/modifiers/quantization/quantization/mixin.py @@ -127,6 +127,9 @@ def validate_model_after(model: "QuantizationMixin") -> "QuantizationMixin": """ config = model.resolve_quantization_config() + if len(model.targets) > 0 and config.config_groups is not None: + raise ValueError("Please specify either `targets` or `config_groups`") + if len(model.targets) == 0: targets = [] for config_group in config.config_groups.values(): @@ -211,9 +214,6 @@ def resolve_quantization_config(self) -> QuantizationConfig: if scheme is not None and config_groups is not None: raise ValueError("Please specify either `scheme` or `config_groups`") - if len(targets) > 0 and config_groups is not None: - raise ValueError("Please specify either `targets` or `config_groups`") - if scheme is not None: # takes 
precedence over config_groups From 4326ee37932eded60b7a85c02c21cc13e69dcabf Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Mon, 22 Sep 2025 18:10:02 +0000 Subject: [PATCH 23/27] remove validation error Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/quantization/quantization/mixin.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/quantization/mixin.py b/src/llmcompressor/modifiers/quantization/quantization/mixin.py index 65fa8608b..639bfd217 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/mixin.py +++ b/src/llmcompressor/modifiers/quantization/quantization/mixin.py @@ -127,9 +127,6 @@ def validate_model_after(model: "QuantizationMixin") -> "QuantizationMixin": """ config = model.resolve_quantization_config() - if len(model.targets) > 0 and config.config_groups is not None: - raise ValueError("Please specify either `targets` or `config_groups`") - if len(model.targets) == 0: targets = [] for config_group in config.config_groups.values(): From 2611ac60832403ae7bc820bb75d7c023b8d362bb Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Mon, 22 Sep 2025 18:11:43 +0000 Subject: [PATCH 24/27] moved quant config call Signed-off-by: Brian Dellabetta --- .../modifiers/quantization/quantization/mixin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/quantization/mixin.py b/src/llmcompressor/modifiers/quantization/quantization/mixin.py index 639bfd217..7db57619f 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/mixin.py +++ b/src/llmcompressor/modifiers/quantization/quantization/mixin.py @@ -121,13 +121,13 @@ def validate_scheme( @model_validator(mode="after") def validate_model_after(model: "QuantizationMixin") -> "QuantizationMixin": """ - - If targets have not been set, aggregate targets from - each config_group into a single unique list for self.targets. + - If targets have not been set, aggregate targets from quantization + config into a single unique list for self.targets. 
- If targets have still not been found, default to targets=["Linear"] """ - config = model.resolve_quantization_config() if len(model.targets) == 0: + config = model.resolve_quantization_config() targets = [] for config_group in config.config_groups.values(): for target in config_group.targets: From 2ea5698c2d4a07e59fe05155c10f39c8d159f20b Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Mon, 22 Sep 2025 18:14:14 +0000 Subject: [PATCH 25/27] retain validation error Signed-off-by: Brian Dellabetta --- .../modifiers/quantization/quantization/mixin.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/quantization/mixin.py b/src/llmcompressor/modifiers/quantization/quantization/mixin.py index 7db57619f..7e257e9f8 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/mixin.py +++ b/src/llmcompressor/modifiers/quantization/quantization/mixin.py @@ -91,9 +91,7 @@ class QuantizationMixin(HooksMixin): _calibration_hooks: Set[RemovableHandle] = PrivateAttr(default_factory=set) @field_validator("targets", mode="before") - def validate_targets( - cls, value: Optional[Union[str, List[str]]] - ) -> Optional[List[str]]: + def validate_targets(cls, value: Union[str, List[str]]) -> List[str]: if isinstance(value, str): return [value] @@ -126,6 +124,9 @@ def validate_model_after(model: "QuantizationMixin") -> "QuantizationMixin": - If targets have still not been found, default to targets=["Linear"] """ + if len(model.targets) > 0 and model.config_groups is not None: + raise ValueError("Please specify either `targets` or `config_groups`") + if len(model.targets) == 0: config = model.resolve_quantization_config() targets = [] From 33695d59506a0bc4b351504de866ec94643773d7 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Mon, 22 Sep 2025 18:24:35 +0000 Subject: [PATCH 26/27] don't call resolve config in validation layer Signed-off-by: Brian Dellabetta --- .../modifiers/quantization/quantization/mixin.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/quantization/mixin.py b/src/llmcompressor/modifiers/quantization/quantization/mixin.py index 7e257e9f8..0ef507dbc 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/mixin.py +++ b/src/llmcompressor/modifiers/quantization/quantization/mixin.py @@ -119,8 +119,8 @@ def validate_scheme( @model_validator(mode="after") def validate_model_after(model: "QuantizationMixin") -> "QuantizationMixin": """ - - If targets have not been set, aggregate targets from quantization - config into a single unique list for self.targets. 
+ - If targets have not been set, aggregate targets from config_groups + into a single unique list - If targets have still not been found, default to targets=["Linear"] """ @@ -128,9 +128,8 @@ def validate_model_after(model: "QuantizationMixin") -> "QuantizationMixin": raise ValueError("Please specify either `targets` or `config_groups`") if len(model.targets) == 0: - config = model.resolve_quantization_config() targets = [] - for config_group in config.config_groups.values(): + for config_group in model.config_groups.values(): for target in config_group.targets: if target not in targets: targets.append(target) From 170f04b75bd6887b2cbf59a2bafb0a6511d0e6ae Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Mon, 22 Sep 2025 18:44:27 +0000 Subject: [PATCH 27/27] minor refactor for when model.config_groups is None Signed-off-by: Brian Dellabetta --- .../modifiers/quantization/quantization/mixin.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/quantization/mixin.py b/src/llmcompressor/modifiers/quantization/quantization/mixin.py index 0ef507dbc..f37efb56a 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/mixin.py +++ b/src/llmcompressor/modifiers/quantization/quantization/mixin.py @@ -127,17 +127,14 @@ def validate_model_after(model: "QuantizationMixin") -> "QuantizationMixin": if len(model.targets) > 0 and model.config_groups is not None: raise ValueError("Please specify either `targets` or `config_groups`") - if len(model.targets) == 0: - targets = [] + if len(model.targets) == 0 and model.config_groups is not None: for config_group in model.config_groups.values(): for target in config_group.targets: - if target not in targets: - targets.append(target) - - if len(targets) == 0: - targets.append("Linear") + if target not in model.targets: + model.targets.append(target) - model.targets = targets + if len(model.targets) == 0: + model.targets.append("Linear") return model