Merged
Changes from all commits (30 commits)
476b59f
Add option to benchmark pipeline in diffusion_trt.py (#457)
ajrasane Oct 22, 2025
b583d98
default attn_implementation to eager to avoid issues
Edwardf0t1 Sep 17, 2025
8b76102
add proper detection and handling for nemotron VL model in ptq examples
Edwardf0t1 Sep 19, 2025
89d207c
create fake vl inputs in export for nemotron VL model
Edwardf0t1 Sep 19, 2025
6991fdf
update fake inputs generation, initialize distributed for Nemotron mo…
Edwardf0t1 Sep 19, 2025
80fecf0
remove distributed processing setup and vision input generation since …
Edwardf0t1 Sep 20, 2025
15f0d61
special handling for nemotron VL preview generation in hf_ptq
Edwardf0t1 Sep 21, 2025
ae42b9b
fix mypy error
Edwardf0t1 Sep 21, 2025
587d427
add support for v2 model inference (.generate) with image inputs
Edwardf0t1 Oct 15, 2025
208cb9e
debug loading v2 converted nvfp4 weights from mcore
Edwardf0t1 Oct 17, 2025
f94558f
load scalers only for v2 fp4
Edwardf0t1 Oct 19, 2025
31c4f75
re-use existing vlm detection util function
Edwardf0t1 Oct 23, 2025
ec4a0ef
refactor and create a utils script for vlm
Edwardf0t1 Oct 23, 2025
5f0ea72
remove duplicated is_nemotron_vl usage
Edwardf0t1 Oct 23, 2025
60a698a
update
Edwardf0t1 Oct 23, 2025
446e135
add a util function to extract language model from VLM, update changelog
Edwardf0t1 Oct 23, 2025
f849c17
fix format
Edwardf0t1 Oct 23, 2025
96e1613
update
Edwardf0t1 Oct 23, 2025
c572513
update
Edwardf0t1 Oct 23, 2025
8e6dea3
update
Edwardf0t1 Oct 23, 2025
16bea91
WIP: local changes before pulling remote updates
Edwardf0t1 Oct 23, 2025
8e1d6cb
Increase gpu_tests timeout from 90 to 120 mins
kevalmorabia97 Oct 23, 2025
4561de9
revert torch_onnx.py
Edwardf0t1 Oct 24, 2025
57d388e
revert diffusion_trt.py
Edwardf0t1 Oct 24, 2025
f9b88fd
minor
Edwardf0t1 Oct 24, 2025
0e00954
update
Edwardf0t1 Oct 24, 2025
1a3bac1
update
Edwardf0t1 Oct 24, 2025
a4fa12d
update
Edwardf0t1 Oct 24, 2025
6216038
update
Edwardf0t1 Oct 24, 2025
4352ab6
update
Edwardf0t1 Oct 24, 2025
4 changes: 2 additions & 2 deletions .github/workflows/gpu_tests.yml
@@ -61,7 +61,7 @@ jobs:
if: needs.check-file-changes.outputs.any_changed == 'true'
# Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
runs-on: linux-amd64-gpu-l4-latest-1
timeout-minutes: 90
timeout-minutes: 120
container: &gpu_container
image: nvcr.io/nvidia/pytorch:25.06-py3
env:
@@ -80,7 +80,7 @@ jobs:
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
# Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
runs-on: linux-amd64-gpu-h100-latest-1
timeout-minutes: 90
timeout-minutes: 120
container: *gpu_container
steps: *gpu_steps
gpu-pr-required-check:
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -13,6 +13,7 @@ Model Optimizer Changelog (Linux)
- Allow specifying ``calib_seq`` in ``examples/llm_ptq`` to set the maximum sequence length for calibration.
- Add support for MCore MoE PTQ/QAT/QAD.
- Add support for multi-node PTQ and export with FSDP2 in ``examples/llm_ptq/multinode_ptq.py``. See `examples/llm_ptq/README.md <https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/llm_ptq#multi-node-post-training-quantization-with-fsdp2>`_ for more details.
- Add support for Nemotron Nano VL v1 & v2 models in FP8/NVFP4 PTQ workflow.
Review comment (Contributor):
⚠️ Potential issue | 🟡 Minor

Forward-dated release entry

0.39 (2025-11-07) is in the future (today is 2025-10-23). Please mark this as Unreleased/TBD to avoid confusion until the release is cut.

-0.39 (2025-11-07)
+0.39 (Unreleased)

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In CHANGELOG.rst around line 16, the release entry "0.39 (2025-11-07)" is
forward-dated; change the header to indicate it is not yet released (e.g., "0.39
(Unreleased)" or "0.39 (TBD)") and leave the content line "Add support for
Nemotron Nano VL v1 & v2 models in FP8/NVFP4 PTQ workflow." under that
Unreleased/TBD heading so the changelog does not show a future date.


**Documentation**

110 changes: 105 additions & 5 deletions examples/llm_ptq/example_utils.py
@@ -39,6 +39,91 @@
SPECULATIVE_MODEL_LIST = ["Eagle", "Medusa"]


def run_nemotron_vl_preview(
full_model, tokenizer, input_ids, pyt_ckpt_path, stage_name, allow_fallback=False
):
"""Run text-only and VL preview generation for Nemotron VL models.

Args:
full_model: The full VL model
tokenizer: The tokenizer
input_ids: Input tensor for generation
pyt_ckpt_path: Path to the model checkpoint
stage_name: Description of the stage (e.g., "before quantization", "after quantization")
allow_fallback: Whether to allow fallback to standard generate on failure

Returns:
Generated text response or None if generation failed
"""
from vlm_utils import run_text_only_generation, run_vl_preview_generation

print(f"Running text-only preview generation for Nemotron VL model ({stage_name})...")
question = tokenizer.decode(input_ids[0], skip_special_tokens=True)
generation_config = {
"max_new_tokens": 100,
"do_sample": False,
"eos_token_id": tokenizer.eos_token_id,
}

# Try text-only generation
text_response = run_text_only_generation(
full_model, tokenizer, question, generation_config, pyt_ckpt_path
)

if text_response is not None:
print(f"✅ Text-only generation successful: {text_response[:100]}...")
generated_ids = text_response
elif allow_fallback:
print("Text-only generation failed, falling back to standard generate...")
generated_ids = full_model.generate(input_ids, max_new_tokens=100)
else:
generated_ids = None

# Run additional VL test with images
print(f"Running additional VL test with images ({stage_name})...")
run_vl_preview_generation(full_model, tokenizer, pyt_ckpt_path, stage_name)

return generated_ids


def _is_multimodal_config(config):
"""Check if a config indicates a multimodal model (config-only version of is_multimodal_model)."""
return (
hasattr(config, "vision_config") # Standard vision config (e.g., Qwen2.5-VL)
or getattr(config, "model_type", "") == "phi4mm" # Phi-4 multimodal
or hasattr(config, "vision_lora") # Vision LoRA configurations
or hasattr(config, "audio_processor") # Audio processing capabilities
or (
hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer")
) # Image embedding layers
)


def is_nemotron_vl(model_or_config):
"""Check if model or config indicates a Nemotron VL model.

Args:
model_or_config: Either a model instance or a config object.

Returns:
bool: True if it's a Nemotron VL model, False otherwise.
"""
# Try to get config from model, or use directly if it's a config
if hasattr(model_or_config, "config"):
config = model_or_config.config
from modelopt.torch.export.model_utils import is_multimodal_model

if not is_multimodal_model(model_or_config):
return False
else:
config = model_or_config
if not _is_multimodal_config(config):
return False

architectures = getattr(config, "architectures", [])
return any("nemotron" in arch.lower() for arch in architectures)


def build_quant_cfg(
qformat,
kv_cache_qformat,
@@ -185,7 +270,21 @@ def get_model(
if device == "cpu":
device_map = "cpu"

# Prepare config kwargs for loading
config_kwargs = {"trust_remote_code": trust_remote_code} if trust_remote_code else {}

# Load config once and handle VL model detection
try:
hf_config = AutoConfig.from_pretrained(ckpt_path, **config_kwargs)
if is_nemotron_vl(hf_config):
print(
"Detected Nemotron VL model from config. "
"Disabling automatic device mapping for compatibility."
)
device_map = None
except Exception as e:
print(f"Error: Could not load config from {ckpt_path}: {e}")
raise RuntimeError(f"Failed to load model configuration from {ckpt_path}") from e
if attn_implementation is not None:
config_kwargs["attn_implementation"] = attn_implementation

@@ -207,11 +306,6 @@
)
model = hf_vila.llm
else:
hf_config = AutoConfig.from_pretrained(
ckpt_path,
**config_kwargs,
)

if use_seq_device_map:
device_map = "sequential"
# If we use sequential, set max_memory limit to ensure that the model does not occupy the full GPU
@@ -282,6 +376,12 @@ def get_model(
**model_kwargs,
)
model.eval()

# If device_map was disabled (None), manually move model to target device
if device_map is None and device != "cpu":
Collaborator: what if device == "cpu"?

Contributor Author: That was handled by HF's device_map="cpu" in L210.

print(f"Moving model to {device} device...")
model = model.to(device)

if device == "cuda" and not is_model_on_gpu(model):
print("Warning: Some parameters are not on a GPU. Calibration can be slow or hit OOM")

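To make the device-handling exchange above concrete, here is a minimal sketch of the new logic in get_model() as it reads from this diff. The load_model wrapper, its defaults, and the AutoModelForCausalLM call are illustrative stand-ins rather than the script's actual loader; is_nemotron_vl is the helper added in this file.

# Minimal sketch (assumed simplification) of the device-placement logic added to get_model().
from transformers import AutoConfig, AutoModelForCausalLM

from example_utils import is_nemotron_vl  # helper introduced in this PR


def load_model(ckpt_path: str, device: str = "cuda", trust_remote_code: bool = False):
    device_map = "auto"
    if device == "cpu":
        device_map = "cpu"  # HF places the whole model on CPU, so no manual move is needed later

    config = AutoConfig.from_pretrained(ckpt_path, trust_remote_code=trust_remote_code)
    if is_nemotron_vl(config):
        device_map = None  # automatic device mapping is disabled for Nemotron VL compatibility

    model = AutoModelForCausalLM.from_pretrained(
        ckpt_path, device_map=device_map, trust_remote_code=trust_remote_code
    )
    # Manual move only when auto mapping was disabled and a non-CPU target was requested.
    if device_map is None and device != "cpu":
        model = model.to(device)
    return model.eval()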
108 changes: 75 additions & 33 deletions examples/llm_ptq/hf_ptq.py
@@ -30,6 +30,8 @@
get_processor,
get_tokenizer,
is_enc_dec,
is_nemotron_vl,
run_nemotron_vl_preview,
)
from transformers import (
AutoConfig,
@@ -48,7 +50,7 @@
export_tensorrt_llm_checkpoint,
get_model_type,
)
from modelopt.torch.export.model_utils import is_multimodal_model
from modelopt.torch.export.model_utils import get_language_model_from_vl, is_multimodal_model
from modelopt.torch.quantization.config import need_calibration
from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
from modelopt.torch.quantization.utils import is_quantized
@@ -283,6 +285,9 @@ def main(args):

full_model = model

# Detect if this is a Nemotron VL model using architecture-based detection
is_nemotron_vl_model = is_nemotron_vl(full_model)

if model_type == "mllama":
processor = get_processor(
args.pyt_ckpt_path,
@@ -312,15 +317,8 @@
tokenizer.padding_side = "left"

# We only quantize the language model for VLMs other than the type supported above.
if hasattr(model, "language_model"):
parent_model = model # llama4 case
if isinstance(type(model).__dict__.get("language_model"), property):
assert hasattr(model, "model") and hasattr(model.model, "language_model"), (
"Expected language_model in model.model, but attribute not found. "
"This may indicate an unsupported model structure."
)
parent_model = model.model # gemma3, qwen2.5 VL case

language_model, parent_model = get_language_model_from_vl(model)
if language_model is not None:
disabled_quant_cfg = {
"quant_cfg": {"default": {"enable": False}},
"algorithm": "max",
Expand All @@ -331,7 +329,7 @@ def main(args):
if name != "language_model":
mtq.quantize(child, disabled_quant_cfg, forward_loop=None)

model = model.language_model
model = language_model
model_type = get_model_type(model)

if model_type == "phi4mm":
@@ -458,34 +456,65 @@
KV_QUANT_CFG_CHOICES,
)

# For Nemotron VL models, disable quantization of vision components
if is_nemotron_vl_model:
print("Disabling quantization for vision components in Nemotron VL model")
quant_cfg["quant_cfg"]["*vision*"] = {"enable": False}
quant_cfg["quant_cfg"]["*image*"] = {"enable": False}
# Also disable radio model components specifically
quant_cfg["quant_cfg"]["*radio*"] = {"enable": False}
quant_cfg["quant_cfg"]["*visual*"] = {"enable": False}

if not model_is_already_quantized or calibration_only:
# Only run single sample for preview
input_ids = next(iter(calib_dataloader))[
"input_features" if model_type == "whisper" else "input_ids"
][0:1]
try:
generated_ids_before_ptq = full_model.generate(input_ids, max_new_tokens=100)
except Exception as e:
print(
"Error during model generation. Please check if your transformers version is "
"compatible with the model."

# Generate preview before quantization
if is_nemotron_vl_model and tokenizer is not None:
generated_ids_before_ptq = run_nemotron_vl_preview(
full_model,
tokenizer,
input_ids,
args.pyt_ckpt_path,
"before quantization",
allow_fallback=True,
)
print(f"Error details: {e}")
raise
else:
# Standard generation for non-Nemotron VL models
generated_ids_before_ptq = full_model.generate(input_ids, max_new_tokens=100)
if model_type == "gptoss" and args.qformat == "nvfp4_mlp_only":
print("Applying nvfp4 quantization (MoE only) for gpt-oss")

# quantize the model
model = quantize_model(model, quant_cfg, args, calib_dataloader, calibration_only)

# For VL models, update full_model to use the quantized language model
if is_nemotron_vl_model:
_, parent_model = get_language_model_from_vl(full_model)
if parent_model is not None:
print("Updating full_model with quantized language_model...")
parent_model.language_model = model

if args.verbose:
mtq.print_quant_summary(model)

# Run some samples
torch.cuda.empty_cache()
generated_ids_after_ptq = None
if model_type != "llama4":
if model_type != "llama4" and not is_nemotron_vl_model:
# Our fake quantizer may not be fully compatible with torch.compile.
generated_ids_after_ptq = full_model.generate(input_ids, max_new_tokens=100)
elif is_nemotron_vl_model and tokenizer is not None:
generated_ids_after_ptq = run_nemotron_vl_preview(
full_model,
tokenizer,
input_ids,
args.pyt_ckpt_path,
"after quantization",
allow_fallback=False,
)
else:
warnings.warn(
"Llama4 Maverick generation after quantization has a bug. Skipping generation sample."
Expand Down Expand Up @@ -518,15 +547,25 @@ def output_decode(generated_ids, input_shape):

if generated_ids_after_ptq is not None:
print("--------")
print(f"example test input: {input_decode(input_ids)}")
print("--------")
print(
f"example outputs before ptq: {output_decode(generated_ids_before_ptq, input_ids.shape[1])}"
)
print("--------")
print(
f"example outputs after ptq: {output_decode(generated_ids_after_ptq, input_ids.shape[1])}"
)
if is_nemotron_vl_model:
# For Nemotron VL models, generated_ids are text strings from model.chat()
print("Nemotron VL model text-only generation results:")
print(f"Text response before quantization: {generated_ids_before_ptq}")
print("--------")
print(f"Text response after quantization: {generated_ids_after_ptq}")
print("--------")
print("Note: Additional VL tests with images were run separately above")
else:
# For regular LLMs, generated_ids are token tensors that need decoding
print(f"example test input: {input_decode(input_ids)}")
print("--------")
print(
f"example outputs before ptq: {output_decode(generated_ids_before_ptq, input_ids.shape[1])}"
)
print("--------")
print(
f"example outputs after ptq: {output_decode(generated_ids_after_ptq, input_ids.shape[1])}"
)
else:
warnings.warn("Skipping quantization: model is already quantized.")

@@ -548,9 +587,12 @@ def output_decode(generated_ids, input_shape):
# Save original model config and the processor config to the export path for VLMs.
print(f"Saving original model config to {export_path}")

AutoConfig.from_pretrained(
args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
).save_pretrained(export_path)
config_kwargs = {"trust_remote_code": args.trust_remote_code}
if args.attn_implementation is not None:
config_kwargs["attn_implementation"] = args.attn_implementation
AutoConfig.from_pretrained(args.pyt_ckpt_path, **config_kwargs).save_pretrained(
export_path
)

# Try to save processor config if available
try:
Expand Down Expand Up @@ -748,7 +790,7 @@ def output_decode(generated_ids, input_shape):
parser.add_argument(
"--attn_implementation",
help=(
"Specify the attention implementation to use."
"Specify the attention implementation to use. "
"This arg will be passed to the HF model loading if specified."
),
default=None,
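Taken together, the hf_ptq.py changes follow one pattern for Nemotron VL checkpoints: exclude the vision towers from quantization, quantize only the language model, re-attach it to the full model, and preview through run_nemotron_vl_preview. The sketch below condenses that flow; the quantize_nemotron_vl wrapper and the quantize_fn callback are hypothetical stand-ins for the script's quantize_model path, while the imported helpers are the ones this PR adds or uses.

# Condensed sketch of the Nemotron VL PTQ flow in this PR (wrapper and callback are hypothetical).
from example_utils import is_nemotron_vl, run_nemotron_vl_preview

from modelopt.torch.export.model_utils import get_language_model_from_vl


def quantize_nemotron_vl(full_model, tokenizer, input_ids, quant_cfg, quantize_fn, ckpt_path):
    if not is_nemotron_vl(full_model):
        return quantize_fn(full_model, quant_cfg)  # regular LLM path, unchanged by this PR

    # Vision-side modules are excluded from quantization via wildcard patterns.
    for pattern in ("*vision*", "*image*", "*radio*", "*visual*"):
        quant_cfg["quant_cfg"][pattern] = {"enable": False}

    # Only the language model is calibrated and quantized.
    language_model, parent_model = get_language_model_from_vl(full_model)
    quantized_lm = quantize_fn(language_model, quant_cfg)

    # Re-attach so the full VL model (vision encoder + LM) is used for the preview run.
    if parent_model is not None:
        parent_model.language_model = quantized_lm

    # Text-only preview plus an image-based VL test, as added in example_utils.py.
    run_nemotron_vl_preview(
        full_model, tokenizer, input_ids, ckpt_path, "after quantization", allow_fallback=False
    )
    return full_model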