import torch
from datasets import load_dataset
from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modeling import replace_modules_for_calibration
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.utils import dispatch_for_generation

# NOTE: Requires a minimum of transformers 4.57.0

MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"

# Load model.
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map=None,
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
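# Rewrite the sparse MoE blocks into a calibration-friendly form so that each
# expert's linear layers receive calibration activations and can be quantized.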
model = replace_modules_for_calibration(model)

DATASET_ID = "neuralmagic/calibration"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 8192

ds = load_dataset(DATASET_ID, name="LLM",
                  split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)

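# Wrap each chat turn in the content format expected by Qwen's chat template,
# then tokenize with the template (no generation prompt), truncating to
# MAX_SEQUENCE_LENGTH.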
def preprocess_function(example):
    messages = []
    for message in example["messages"]:
        messages.append(
            {
                "role": message["role"],
                "content": [{"type": "text", "text": message["content"]}],
            }
        )

    return processor.apply_chat_template(
        messages,
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        tokenize=True,
        add_special_tokens=False,
        return_dict=True,
        add_generation_prompt=False,
    )


ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)

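# Collate single calibration samples into tensors. pixel_values (only present
# for image inputs) is cast to bfloat16 to match the model dtype; this
# text-only calibration set never produces that key.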
def data_collator(batch):
    assert len(batch) == 1
    return {
        key: (
            torch.tensor(value)
            if key != "pixel_values"
            else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
        )
        for key, value in batch[0].items()
    }


# Configure AWQ quantization with smoothing and balancing.
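# - ignore: keep the vision tower, embeddings, layer norms, MoE router gates,
#   and lm_head in full precision.
# - mappings: pair each smoothing layer with the linear layers it feeds so AWQ
#   can shift quantization difficulty from activations into weights.
# - config_groups: quantize the remaining Linear weights to symmetric int8 with
#   group size 32 (W8A16) using an MSE observer; activations stay unquantized.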
recipe = AWQModifier(
    ignore=[
        "re:.*embed_tokens",
        "re:.*input_layernorm$",
        "re:.*mlp[.]gate$",
        "re:.*post_attention_layernorm$",
        "re:.*norm$",
        "re:model[.]visual.*",
        "re:visual.*",
        "lm_head",
    ],
    mappings=[
        {
            "smooth_layer": "re:.*input_layernorm$",
            "balance_layers": ["re:.*q_proj$", "re:.*k_proj$", "re:.*v_proj$"],
        },
        {
            "smooth_layer": "re:.*v_proj$",
            "balance_layers": ["re:.*o_proj$"],
        },
        {
            "smooth_layer": "re:.*post_attention_layernorm$",
            "balance_layers": ["re:.*gate_proj$", "re:.*up_proj$"],
        },
        {
            "smooth_layer": "re:.*up_proj$",
            "balance_layers": ["re:.*down_proj$"],
        },
    ],
    duo_scaling=True,
    config_groups={
        "group_0": {
            "targets": ["Linear"],
            "weights": {
                "num_bits": 8,
                "type": "int",
                "symmetric": True,
                "group_size": 32,
                "strategy": "group",
                "block_structure": None,
                "dynamic": False,
                "actorder": None,
                "observer": "mse",
                "observer_kwargs": {},
            },
            "input_activations": None,
            "output_activations": None,
            "format": None,
        }
    },
)

# Apply AWQ quantization.
oneshot(
    model=model,
    processor=processor,
    recipe=recipe,
    dataset=ds,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    data_collator=data_collator,
)

print("========== SAMPLE GENERATION ==============")
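# Place the quantized model onto the available GPU(s) for a quick smoke test.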
dispatch_for_generation(model)
input_ids = processor(text="Hello my name is",
                      return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(processor.decode(output[0]))
print("==========================================")

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-AWQ-W8A16-mse-seq"
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)
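
# (Optional) A minimal sketch of serving the saved checkpoint, assuming a vLLM
# build that supports this model and the compressed-tensors format; the prompt
# and sampling parameters below are illustrative only:
#
#   from vllm import LLM, SamplingParams
#
#   llm = LLM(model=SAVE_DIR)
#   outputs = llm.generate("Hello my name is", SamplingParams(max_tokens=20))
#   print(outputs[0].outputs[0].text)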