From 8c3f4b5b20b83179ed8ab16a88dfd23ce88d0b88 Mon Sep 17 00:00:00 2001
From: JartX
Date: Sat, 18 Oct 2025 13:31:52 +0200
Subject: [PATCH 1/8] add qwen3-vl-30b-a3b-Instruct-example

Signed-off-by: JartX
---
 .../awq/qwen3-vl-30b-a3b-Instruct-example.py | 147 ++++++++++++++++++
 1 file changed, 147 insertions(+)
 create mode 100644 examples/awq/qwen3-vl-30b-a3b-Instruct-example.py

diff --git a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
new file mode 100644
index 000000000..0dab1106d
--- /dev/null
+++ b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
@@ -0,0 +1,147 @@
+import torch
+from datasets import load_dataset
+from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration
+
+from llmcompressor import oneshot
+from llmcompressor.modeling import replace_modules_for_calibration
+from llmcompressor.modifiers.awq import AWQModifier
+from llmcompressor.utils import dispatch_for_generation
+
+# NOTE: Requires a minimum of transformers 4.57.0
+
+MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"
+
+# Load model.
+model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.bfloat16,
+    device_map=None,
+    trust_remote_code=True
+)
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+model = replace_modules_for_calibration(model)
+
+DATASET_ID = "neuralmagic/calibration"
+NUM_CALIBRATION_SAMPLES = 256
+MAX_SEQUENCE_LENGTH = 8192
+
+ds = load_dataset(DATASET_ID, name="LLM",
+                  split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
+ds = ds.shuffle(seed=42)
+
+
+def preprocess_function(example):
+    messages = []
+    for message in example["messages"]:
+        messages.append(
+            {
+                "role": message["role"],
+                "content": [{"type": "text", "text": message["content"]}],
+            }
+        )
+
+    return processor.apply_chat_template(
+        messages,
+        return_tensors="pt",
+        padding=False,
+        truncation=True,
+        max_length=MAX_SEQUENCE_LENGTH,
+        tokenize=True,
+        add_special_tokens=False,
+        return_dict=True,
+        add_generation_prompt=False,
+    )
+
+
+ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)
+
+
+def data_collator(batch):
+    assert len(batch) == 1
+    return {
+        key: (
+            torch.tensor(value)
+            if key != "pixel_values"
+            else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
+        )
+        for key, value in batch[0].items()
+    }
+
+
+# Configure AWQ quantization with smoothing and balancing
+recipe = AWQModifier(
+    ignore=[
+        're:.*embed_tokens',
+        're:.*input_layernorm$',
+        're:.*mlp[.]gate$',
+        're:.*post_attention_layernorm$',
+        're:.*norm$',
+        're:model[.]visual.*',
+        're:visual.*',
+        'lm_head'
+    ],
+    mappings=[
+        {
+            "smooth_layer": "re:.*input_layernorm$",
+            "balance_layers": ['re:.*q_proj$', 're:.*k_proj$', 're:.*v_proj$']
+        },
+        {
+            "smooth_layer": "re:.*v_proj$",
+            "balance_layers": ['re:.*o_proj$']
+        },
+        {
+            "smooth_layer": "re:.*post_attention_layernorm$",
+            "balance_layers": ['re:.*gate_proj$', 're:.*up_proj$']
+        },
+        {
+            "smooth_layer": "re:.*up_proj$",
+            "balance_layers": ['re:.*down_proj$']
+        }
+    ],
+    duo_scaling=True,
+    config_groups={
+        "group_0": {
+            "targets": ["Linear"],
+            "weights": {
+                "num_bits": 8,
+                "type": "int",
+                "symmetric": True,
+                "group_size": 32,
+                "strategy": "group",
+                "block_structure": None,
+                "dynamic": False,
+                "actorder": None,
+                "observer": "mse",
+                "observer_kwargs": {}
+            },
+            "input_activations": None,
+            "output_activations": None,
+            "format": None
+        }
+    }
+)
+
+# Apply AWQ quantization.
+oneshot(
+    model=model,
+    processor=processor,
+    recipe=recipe,
+    dataset=ds,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    data_collator=data_collator,
+
+)
+
+print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
+input_ids = processor(text="Hello my name is",
+                      return_tensors="pt").input_ids.to("cuda")
+output = model.generate(input_ids, max_new_tokens=20)
+print(processor.decode(output[0]))
+print("==========================================")
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-AWQ-W8A16-mse-seq"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

From a11434de003c5e0fa22c6bcefcebda9eeeda2292 Mon Sep 17 00:00:00 2001
From: JartX
Date: Sat, 18 Oct 2025 14:10:16 +0200
Subject: [PATCH 2/8] format

Signed-off-by: JartX
---
 .../awq/qwen3-vl-30b-a3b-Instruct-example.py | 43 ++++++++-----------
 1 file changed, 17 insertions(+), 26 deletions(-)

diff --git a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
index 0dab1106d..03b97fe13 100644
--- a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
+++ b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
@@ -13,10 +13,7 @@
 
 # Load model.
 model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.bfloat16,
-    device_map=None,
-    trust_remote_code=True
+    MODEL_ID, torch_dtype=torch.bfloat16, device_map=None, trust_remote_code=True
 )
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model = replace_modules_for_calibration(model)
@@ -71,32 +68,27 @@ def data_collator(batch):
 # Configure AWQ quantization with smoothing and balancing
 recipe = AWQModifier(
     ignore=[
-        're:.*embed_tokens',
-        're:.*input_layernorm$',
-        're:.*mlp[.]gate$',
-        're:.*post_attention_layernorm$',
-        're:.*norm$',
-        're:model[.]visual.*',
-        're:visual.*',
-        'lm_head'
+        "re:.*embed_tokens",
+        "re:.*input_layernorm$",
+        "re:.*mlp[.]gate$",
+        "re:.*post_attention_layernorm$",
+        "re:.*norm$",
+        "re:model[.]visual.*",
+        "re:visual.*",
+        "lm_head",
     ],
     mappings=[
         {
             "smooth_layer": "re:.*input_layernorm$",
-            "balance_layers": ['re:.*q_proj$', 're:.*k_proj$', 're:.*v_proj$']
-        },
-        {
-            "smooth_layer": "re:.*v_proj$",
-            "balance_layers": ['re:.*o_proj$']
+            "balance_layers": ["re:.*q_proj$", "re:.*k_proj$", "re:.*v_proj$"],
         },
+        {"smooth_layer": "re:.*v_proj$", "balance_layers": ["re:.*o_proj$"]},
         {
             "smooth_layer": "re:.*post_attention_layernorm$",
-            "balance_layers": ['re:.*gate_proj$', 're:.*up_proj$']
+            "balance_layers": ["re:.*gate_proj$", "re:.*up_proj$"],
         },
-        {
-            "smooth_layer": "re:.*up_proj$",
-            "balance_layers": ['re:.*down_proj$']
-        }
+        {"smooth_layer": "re:.*up_proj$",
+         "balance_layers": ["re:.*down_proj$"]},
     ],
     duo_scaling=True,
     config_groups={
@@ -112,13 +104,13 @@ def data_collator(batch):
                 "dynamic": False,
                 "actorder": None,
                 "observer": "mse",
-                "observer_kwargs": {}
+                "observer_kwargs": {},
             },
             "input_activations": None,
-            "output_activations": None,
-            "format": None
+            "output_activations": None,
+            "format": None,
         }
-    }
+    },
 )
 
 # Apply AWQ quantization.
@@ -130,7 +122,6 @@ def data_collator(batch):
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     data_collator=data_collator,
-
 )
 
 print("========== SAMPLE GENERATION ==============")

From f68090a0d22400883b6f9009e22a0a57549bac5d Mon Sep 17 00:00:00 2001
From: JartX
Date: Sat, 18 Oct 2025 14:27:51 +0200
Subject: [PATCH 3/8] applied ruff format

Signed-off-by: JartX
---
 examples/awq/qwen3-vl-30b-a3b-Instruct-example.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
index 03b97fe13..ee21d4944 100644
--- a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
+++ b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
@@ -22,8 +22,7 @@
 NUM_CALIBRATION_SAMPLES = 256
 MAX_SEQUENCE_LENGTH = 8192
 
-ds = load_dataset(DATASET_ID, name="LLM",
-                  split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
+ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
 ds = ds.shuffle(seed=42)
 
 
@@ -87,8 +86,7 @@ def data_collator(batch):
             "smooth_layer": "re:.*post_attention_layernorm$",
             "balance_layers": ["re:.*gate_proj$", "re:.*up_proj$"],
         },
-        {"smooth_layer": "re:.*up_proj$",
-         "balance_layers": ["re:.*down_proj$"]},
+        {"smooth_layer": "re:.*up_proj$", "balance_layers": ["re:.*down_proj$"]},
     ],
     duo_scaling=True,
     config_groups={
@@ -126,8 +124,7 @@ def data_collator(batch):
 
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = processor(text="Hello my name is",
-                      return_tensors="pt").input_ids.to("cuda")
+input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=20)
 print(processor.decode(output[0]))
 print("==========================================")

From 53874492d23dea3f8e3653d11c493904c897aaff Mon Sep 17 00:00:00 2001
From: JartX
Date: Mon, 20 Oct 2025 23:54:45 +0200
Subject: [PATCH 4/8] Remove layer balancing mappings from config

Removed the explicit smoothing/balancing mappings from the recipe;
they duplicate the default mappings that AWQModifier resolves on its
own.
---
 examples/awq/qwen3-vl-30b-a3b-Instruct-example.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
index ee21d4944..11e96622e 100644
--- a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
+++ b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
@@ -76,18 +76,6 @@ def data_collator(batch):
         "re:visual.*",
         "lm_head",
     ],
-    mappings=[
-        {
-            "smooth_layer": "re:.*input_layernorm$",
-            "balance_layers": ["re:.*q_proj$", "re:.*k_proj$", "re:.*v_proj$"],
-        },
-        {"smooth_layer": "re:.*v_proj$", "balance_layers": ["re:.*o_proj$"]},
-        {
-            "smooth_layer": "re:.*post_attention_layernorm$",
-            "balance_layers": ["re:.*gate_proj$", "re:.*up_proj$"],
-        },
-        {"smooth_layer": "re:.*up_proj$", "balance_layers": ["re:.*down_proj$"]},
-    ],
     duo_scaling=True,
     config_groups={
         "group_0": {

From 79d63ea7b7c92caaa0aa62ee0e7de13f6157a1e3 Mon Sep 17 00:00:00 2001
From: JartX
Date: Tue, 21 Oct 2025 01:00:33 +0200
Subject: [PATCH 5/8] Switch AWQ quantization from W8A16 to W4A16

Changed num_bits from 8 to 4 and updated the SAVE_DIR suffix from
W8A16 to W4A16 so the output directory name matches the new scheme.
---
 examples/awq/qwen3-vl-30b-a3b-Instruct-example.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
index 11e96622e..1895ba906 100644
--- a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
+++ b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
@@ -65,6 +65,8 @@ def data_collator(batch):
 
 
 # Configure AWQ quantization with smoothing and balancing
+# NOTE: Using W4A16 quantization with group_size=32
+# (default W4A16 preset uses 128)
 recipe = AWQModifier(
     ignore=[
         "re:.*embed_tokens",
@@ -81,7 +83,7 @@ def data_collator(batch):
         "group_0": {
             "targets": ["Linear"],
             "weights": {
-                "num_bits": 8,
+                "num_bits": 4,
                 "type": "int",
                 "symmetric": True,
                 "group_size": 32,
@@ -118,6 +120,6 @@ def data_collator(batch):
 print("==========================================")
 
 # Save to disk in compressed-tensors format.
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-AWQ-W8A16-mse-seq"
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-AWQ-W4A16-mse-seq"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 processor.save_pretrained(SAVE_DIR)

From 59ec83cf3f7bff87342abb82815399ccdc20c7fa Mon Sep 17 00:00:00 2001
From: JartX
Date: Tue, 21 Oct 2025 23:37:27 +0200
Subject: [PATCH 6/8] Apply suggestion from @brian-dellabetta

Co-authored-by: Brian Dellabetta
---
 examples/awq/qwen3-vl-30b-a3b-Instruct-example.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
index 1895ba906..8479e2330 100644
--- a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
+++ b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
@@ -88,15 +88,10 @@ def data_collator(batch):
                 "symmetric": True,
                 "group_size": 32,
                 "strategy": "group",
-                "block_structure": None,
                 "dynamic": False,
                 "actorder": None,
                 "observer": "mse",
-                "observer_kwargs": {},
-            },
-            "input_activations": None,
-            "output_activations": None,
-            "format": None,
+            }
         }
     },
 )

From e3b373e8645885dd76fd93d51a5d0b175b148ffc Mon Sep 17 00:00:00 2001
From: JartX
Date: Tue, 21 Oct 2025 23:37:41 +0200
Subject: [PATCH 7/8] Apply suggestion from @brian-dellabetta

Co-authored-by: Brian Dellabetta
---
 examples/awq/qwen3-vl-30b-a3b-Instruct-example.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
index 8479e2330..d715c9b9c 100644
--- a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
+++ b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
@@ -65,8 +65,8 @@ def data_collator(batch):
 
 
 # Configure AWQ quantization with smoothing and balancing
-# NOTE: Using W4A16 quantization with group_size=32
-# (default W4A16 preset uses 128)
+# NOTE: This recipe uses W4A16 quantization with group_size=32
+# rather than the default preset with group_size=128
 recipe = AWQModifier(
     ignore=[
         "re:.*embed_tokens",

From 9fb1145dab776c8936e3130859ee2d679a42d024 Mon Sep 17 00:00:00 2001
From: JartX
Date: Tue, 21 Oct 2025 23:55:25 +0200
Subject: [PATCH 8/8] format

Signed-off-by: JartX
---
 examples/awq/qwen3-vl-30b-a3b-Instruct-example.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
index d715c9b9c..dceaad8ee 100644
--- a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
+++ b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
@@ -91,7 +91,7 @@ def data_collator(batch):
                 "dynamic": False,
                 "actorder": None,
                 "observer": "mse",
-            }
+            },
         }
     },
 )
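
A quick way to sanity-check the checkpoint produced by this series is to load it in vLLM. The snippet below is a minimal sketch, not part of the patches themselves: it assumes a vLLM build recent enough to serve Qwen3-VL, and that the model was saved under the example's default SAVE_DIR.

    from vllm import LLM, SamplingParams

    # Assumed path: the SAVE_DIR written by the example above.
    llm = LLM(model="Qwen3-VL-30B-A3B-Instruct-AWQ-W4A16-mse-seq")

    # Greedy decoding of a short continuation, mirroring the example's
    # "Hello my name is" sample-generation check.
    params = SamplingParams(temperature=0.0, max_tokens=20)
    outputs = llm.generate(["Hello my name is"], params)
    print(outputs[0].outputs[0].text)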