From 8c3f4b5b20b83179ed8ab16a88dfd23ce88d0b88 Mon Sep 17 00:00:00 2001
From: JartX
Date: Sat, 18 Oct 2025 13:31:52 +0200
Subject: [PATCH 1/8] add qwen3-vl-30b-a3b-Instruct-example

Signed-off-by: JartX
---
 .../awq/qwen3-vl-30b-a3b-Instruct-example.py | 147 ++++++++++++++++++
 1 file changed, 147 insertions(+)
 create mode 100644 examples/awq/qwen3-vl-30b-a3b-Instruct-example.py

diff --git a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
new file mode 100644
index 000000000..0dab1106d
--- /dev/null
+++ b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
@@ -0,0 +1,147 @@
+import torch
+from datasets import load_dataset
+from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration
+
+from llmcompressor import oneshot
+from llmcompressor.modeling import replace_modules_for_calibration
+from llmcompressor.modifiers.awq import AWQModifier
+from llmcompressor.utils import dispatch_for_generation
+
+# NOTE: Requires a minimum of transformers 4.57.0
+
+MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"
+
+# Load model.
+model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.bfloat16,
+    device_map=None,
+    trust_remote_code=True
+)
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+model = replace_modules_for_calibration(model)
+
+DATASET_ID = "neuralmagic/calibration"
+NUM_CALIBRATION_SAMPLES = 256
+MAX_SEQUENCE_LENGTH = 8192
+
+ds = load_dataset(DATASET_ID, name="LLM",
+                  split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
+ds = ds.shuffle(seed=42)
+
+
+def preprocess_function(example):
+    messages = []
+    for message in example["messages"]:
+        messages.append(
+            {
+                "role": message["role"],
+                "content": [{"type": "text", "text": message["content"]}],
+            }
+        )
+
+    return processor.apply_chat_template(
+        messages,
+        return_tensors="pt",
+        padding=False,
+        truncation=True,
+        max_length=MAX_SEQUENCE_LENGTH,
+        tokenize=True,
+        add_special_tokens=False,
+        return_dict=True,
+        add_generation_prompt=False,
+    )
+
+
+ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)
+
+
+def data_collator(batch):
+    assert len(batch) == 1
+    return {
+        key: (
+            torch.tensor(value)
+            if key != "pixel_values"
+            else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
+        )
+        for key, value in batch[0].items()
+    }
+
+
+# Configure AWQ quantization with smoothing and balancing
+recipe = AWQModifier(
+    ignore=[
+        're:.*embed_tokens',
+        're:.*input_layernorm$',
+        're:.*mlp[.]gate$',
+        're:.*post_attention_layernorm$',
+        're:.*norm$',
+        're:model[.]visual.*',
+        're:visual.*',
+        'lm_head'
+    ],
+    mappings=[
+        {
+            "smooth_layer": "re:.*input_layernorm$",
+            "balance_layers": ['re:.*q_proj$', 're:.*k_proj$', 're:.*v_proj$']
+        },
+        {
+            "smooth_layer": "re:.*v_proj$",
+            "balance_layers": ['re:.*o_proj$']
+        },
+        {
+            "smooth_layer": "re:.*post_attention_layernorm$",
+            "balance_layers": ['re:.*gate_proj$', 're:.*up_proj$']
+        },
+        {
+            "smooth_layer": "re:.*up_proj$",
+            "balance_layers": ['re:.*down_proj$']
+        }
+    ],
+    duo_scaling=True,
+    config_groups={
+        "group_0": {
+            "targets": ["Linear"],
+            "weights": {
+                "num_bits": 8,
+                "type": "int",
+                "symmetric": True,
+                "group_size": 32,
+                "strategy": "group",
+                "block_structure": None,
+                "dynamic": False,
+                "actorder": None,
+                "observer": "mse",
+                "observer_kwargs": {}
+            },
+            "input_activations": None,
+            "output_activations": None,
+            "format": None
+        }
+    }
+)
+
+# Apply AWQ quantization.
+oneshot(
+    model=model,
+    processor=processor,
+    recipe=recipe,
+    dataset=ds,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    data_collator=data_collator,
+
+)
+
+print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
+input_ids = processor(text="Hello my name is",
+                      return_tensors="pt").input_ids.to("cuda")
+output = model.generate(input_ids, max_new_tokens=20)
+print(processor.decode(output[0]))
+print("==========================================")
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-AWQ-W8A16-mse-seq"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

From a11434de003c5e0fa22c6bcefcebda9eeeda2292 Mon Sep 17 00:00:00 2001
From: JartX
Date: Sat, 18 Oct 2025 14:10:16 +0200
Subject: [PATCH 2/8] format

Signed-off-by: JartX
---
 .../awq/qwen3-vl-30b-a3b-Instruct-example.py | 43 ++++++++-----------
 1 file changed, 17 insertions(+), 26 deletions(-)

diff --git a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
index 0dab1106d..03b97fe13 100644
--- a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
+++ b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
@@ -13,10 +13,7 @@
 
 # Load model.
 model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.bfloat16,
-    device_map=None,
-    trust_remote_code=True
+    MODEL_ID, torch_dtype=torch.bfloat16, device_map=None, trust_remote_code=True
 )
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model = replace_modules_for_calibration(model)
@@ -71,32 +68,27 @@ def data_collator(batch):
 # Configure AWQ quantization with smoothing and balancing
 recipe = AWQModifier(
     ignore=[
-        're:.*embed_tokens',
-        're:.*input_layernorm$',
-        're:.*mlp[.]gate$',
-        're:.*post_attention_layernorm$',
-        're:.*norm$',
-        're:model[.]visual.*',
-        're:visual.*',
-        'lm_head'
+        "re:.*embed_tokens",
+        "re:.*input_layernorm$",
+        "re:.*mlp[.]gate$",
+        "re:.*post_attention_layernorm$",
+        "re:.*norm$",
+        "re:model[.]visual.*",
+        "re:visual.*",
+        "lm_head",
     ],
     mappings=[
         {
             "smooth_layer": "re:.*input_layernorm$",
-            "balance_layers": ['re:.*q_proj$', 're:.*k_proj$', 're:.*v_proj$']
-        },
-        {
-            "smooth_layer": "re:.*v_proj$",
-            "balance_layers": ['re:.*o_proj$']
+            "balance_layers": ["re:.*q_proj$", "re:.*k_proj$", "re:.*v_proj$"],
         },
+        {"smooth_layer": "re:.*v_proj$", "balance_layers": ["re:.*o_proj$"]},
         {
             "smooth_layer": "re:.*post_attention_layernorm$",
-            "balance_layers": ['re:.*gate_proj$', 're:.*up_proj$']
+            "balance_layers": ["re:.*gate_proj$", "re:.*up_proj$"],
         },
-        {
-            "smooth_layer": "re:.*up_proj$",
-            "balance_layers": ['re:.*down_proj$']
-        }
+        {"smooth_layer": "re:.*up_proj$",
+         "balance_layers": ["re:.*down_proj$"]},
     ],
     duo_scaling=True,
     config_groups={
@@ -112,13 +104,13 @@ def data_collator(batch):
                 "dynamic": False,
                 "actorder": None,
                 "observer": "mse",
-                "observer_kwargs": {}
+                "observer_kwargs": {},
             },
             "input_activations": None,
-            "output_activations": None,
-            "format": None
+            "output_activations": None,
+            "format": None,
         }
-    }
+    },
 )
 
 # Apply AWQ quantization.
@@ -130,7 +122,6 @@ def data_collator(batch):
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     data_collator=data_collator,
-
 )
 
 print("========== SAMPLE GENERATION ==============")

From f68090a0d22400883b6f9009e22a0a57549bac5d Mon Sep 17 00:00:00 2001
From: JartX
Date: Sat, 18 Oct 2025 14:27:51 +0200
Subject: [PATCH 3/8] applied ruff format

Signed-off-by: JartX
---
 examples/awq/qwen3-vl-30b-a3b-Instruct-example.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
index 03b97fe13..ee21d4944 100644
--- a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
+++ b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
@@ -22,8 +22,7 @@
 NUM_CALIBRATION_SAMPLES = 256
 MAX_SEQUENCE_LENGTH = 8192
 
-ds = load_dataset(DATASET_ID, name="LLM",
-                  split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
+ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
 ds = ds.shuffle(seed=42)
 
 
@@ -87,8 +86,7 @@ def data_collator(batch):
             "smooth_layer": "re:.*post_attention_layernorm$",
             "balance_layers": ["re:.*gate_proj$", "re:.*up_proj$"],
         },
-        {"smooth_layer": "re:.*up_proj$",
-         "balance_layers": ["re:.*down_proj$"]},
+        {"smooth_layer": "re:.*up_proj$", "balance_layers": ["re:.*down_proj$"]},
     ],
     duo_scaling=True,
     config_groups={
@@ -126,8 +124,7 @@ def data_collator(batch):
 
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = processor(text="Hello my name is",
-                      return_tensors="pt").input_ids.to("cuda")
+input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=20)
 print(processor.decode(output[0]))
 print("==========================================")

From 53874492d23dea3f8e3653d11c493904c897aaff Mon Sep 17 00:00:00 2001
From: JartX
Date: Mon, 20 Oct 2025 23:54:45 +0200
Subject: [PATCH 4/8] Remove layer balancing mappings from config

Removed the explicit smoothing/balancing mappings from the recipe;
they duplicate the default mappings that AWQModifier resolves on its
own.
---
 examples/awq/qwen3-vl-30b-a3b-Instruct-example.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
index ee21d4944..11e96622e 100644
--- a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
+++ b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
@@ -76,18 +76,6 @@ def data_collator(batch):
         "re:visual.*",
         "lm_head",
     ],
-    mappings=[
-        {
-            "smooth_layer": "re:.*input_layernorm$",
-            "balance_layers": ["re:.*q_proj$", "re:.*k_proj$", "re:.*v_proj$"],
-        },
-        {"smooth_layer": "re:.*v_proj$", "balance_layers": ["re:.*o_proj$"]},
-        {
-            "smooth_layer": "re:.*post_attention_layernorm$",
-            "balance_layers": ["re:.*gate_proj$", "re:.*up_proj$"],
-        },
-        {"smooth_layer": "re:.*up_proj$", "balance_layers": ["re:.*down_proj$"]},
-    ],
     duo_scaling=True,
     config_groups={
         "group_0": {

From 79d63ea7b7c92caaa0aa62ee0e7de13f6157a1e3 Mon Sep 17 00:00:00 2001
From: JartX
Date: Tue, 21 Oct 2025 01:00:33 +0200
Subject: [PATCH 5/8] Switch AWQ quantization from W8A16 to W4A16

Changed num_bits from 8 to 4 and updated the SAVE_DIR suffix from
W8A16 to W4A16 so the output directory name matches the new scheme.
---
 examples/awq/qwen3-vl-30b-a3b-Instruct-example.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
index 11e96622e..1895ba906 100644
--- a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
+++ b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
@@ -65,6 +65,8 @@ def data_collator(batch):
 
 
 # Configure AWQ quantization with smoothing and balancing
+# NOTE: Using W4A16 quantization with group_size=32
+# (default W4A16 preset uses 128)
 recipe = AWQModifier(
     ignore=[
         "re:.*embed_tokens",
@@ -81,7 +83,7 @@ def data_collator(batch):
         "group_0": {
             "targets": ["Linear"],
             "weights": {
-                "num_bits": 8,
+                "num_bits": 4,
                 "type": "int",
                 "symmetric": True,
                 "group_size": 32,
@@ -118,6 +120,6 @@ def data_collator(batch):
 print("==========================================")
 
 # Save to disk in compressed-tensors format.
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-AWQ-W8A16-mse-seq"
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-AWQ-W4A16-mse-seq"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 processor.save_pretrained(SAVE_DIR)

From 59ec83cf3f7bff87342abb82815399ccdc20c7fa Mon Sep 17 00:00:00 2001
From: JartX
Date: Tue, 21 Oct 2025 23:37:27 +0200
Subject: [PATCH 6/8] Apply suggestion from @brian-dellabetta

Co-authored-by: Brian Dellabetta
---
 examples/awq/qwen3-vl-30b-a3b-Instruct-example.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
index 1895ba906..8479e2330 100644
--- a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
+++ b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
@@ -88,15 +88,10 @@ def data_collator(batch):
                 "symmetric": True,
                 "group_size": 32,
                 "strategy": "group",
-                "block_structure": None,
                 "dynamic": False,
                 "actorder": None,
                 "observer": "mse",
-                "observer_kwargs": {},
-            },
-            "input_activations": None,
-            "output_activations": None,
-            "format": None,
+            }
         }
     },
 )

From e3b373e8645885dd76fd93d51a5d0b175b148ffc Mon Sep 17 00:00:00 2001
From: JartX
Date: Tue, 21 Oct 2025 23:37:41 +0200
Subject: [PATCH 7/8] Apply suggestion from @brian-dellabetta

Co-authored-by: Brian Dellabetta
---
 examples/awq/qwen3-vl-30b-a3b-Instruct-example.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
index 8479e2330..d715c9b9c 100644
--- a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
+++ b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
@@ -65,8 +65,8 @@ def data_collator(batch):
 
 
 # Configure AWQ quantization with smoothing and balancing
-# NOTE: Using W4A16 quantization with group_size=32
-# (default W4A16 preset uses 128)
+# NOTE: This recipe uses W4A16 quantization with group_size=32
+# rather than the default preset with group_size=128
 recipe = AWQModifier(
     ignore=[
         "re:.*embed_tokens",

From 9fb1145dab776c8936e3130859ee2d679a42d024 Mon Sep 17 00:00:00 2001
From: JartX
Date: Tue, 21 Oct 2025 23:55:25 +0200
Subject: [PATCH 8/8] format

Signed-off-by: JartX
---
 examples/awq/qwen3-vl-30b-a3b-Instruct-example.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
index d715c9b9c..dceaad8ee 100644
--- a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
+++ b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
@@ -91,7 +91,7 @@ def data_collator(batch):
                 "dynamic": False,
                 "actorder": None,
                 "observer": "mse",
-            }
+            },
         }
     },
 )
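
A quick way to sanity-check the checkpoint produced by this series is to load it in vLLM. The snippet below is a minimal sketch, not part of the patches themselves: it assumes a vLLM build recent enough to serve Qwen3-VL, and that the model was saved under the example's default SAVE_DIR.

    from vllm import LLM, SamplingParams

    # Assumed path: the SAVE_DIR written by the example above.
    llm = LLM(model="Qwen3-VL-30B-A3B-Instruct-AWQ-W4A16-mse-seq")

    # Greedy decoding of a short continuation, mirroring the example's
    # "Hello my name is" sample-generation check.
    params = SamplingParams(temperature=0.0, max_tokens=20)
    outputs = llm.generate(["Hello my name is"], params)
    print(outputs[0].outputs[0].text)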