Merged
Changes from all commits (30 commits)
476b59f
Add option to benchmark pipeline in diffusion_trt.py (#457)
ajrasane Oct 22, 2025
b583d98
default attn_implementation to eager to avoid issues
Edwardf0t1 Sep 17, 2025
8b76102
add proper detection and handling for nemotron VL model in ptq examples
Edwardf0t1 Sep 19, 2025
89d207c
create fake vl inputs in export for nemotron VL model
Edwardf0t1 Sep 19, 2025
6991fdf
update fake inputs generation, initialize distributed for Nemotron mo…
Edwardf0t1 Sep 19, 2025
80fecf0
remove distributed processing setup and vision input generation since …
Edwardf0t1 Sep 20, 2025
15f0d61
special handling for nemotron VL preview generation in hf_ptq
Edwardf0t1 Sep 21, 2025
ae42b9b
fix mypy error
Edwardf0t1 Sep 21, 2025
587d427
add support for v2 model inference (.generate) with image inputs
Edwardf0t1 Oct 15, 2025
208cb9e
debug loading v2 converted nvfp4 weights from mcore
Edwardf0t1 Oct 17, 2025
f94558f
load scalers only for v2 fp4
Edwardf0t1 Oct 19, 2025
31c4f75
re-use existing vlm detection util function
Edwardf0t1 Oct 23, 2025
ec4a0ef
refactor and create a utils script for vlm
Edwardf0t1 Oct 23, 2025
5f0ea72
remove duplicated is_nemotron_vl usage
Edwardf0t1 Oct 23, 2025
60a698a
update
Edwardf0t1 Oct 23, 2025
446e135
add a util function to extract language model from VLM, update changelog
Edwardf0t1 Oct 23, 2025
f849c17
fix format
Edwardf0t1 Oct 23, 2025
96e1613
update
Edwardf0t1 Oct 23, 2025
c572513
update
Edwardf0t1 Oct 23, 2025
8e6dea3
update
Edwardf0t1 Oct 23, 2025
16bea91
WIP: local changes before pulling remote updates
Edwardf0t1 Oct 23, 2025
8e1d6cb
Increase gpu_tests timeout from 90 to 120 mins
kevalmorabia97 Oct 23, 2025
4561de9
revert torch_onnx.py
Edwardf0t1 Oct 24, 2025
57d388e
revert diffusion_trt.py
Edwardf0t1 Oct 24, 2025
f9b88fd
minor
Edwardf0t1 Oct 24, 2025
0e00954
update
Edwardf0t1 Oct 24, 2025
1a3bac1
update
Edwardf0t1 Oct 24, 2025
a4fa12d
update
Edwardf0t1 Oct 24, 2025
6216038
update
Edwardf0t1 Oct 24, 2025
4352ab6
update
Edwardf0t1 Oct 24, 2025
4 changes: 2 additions & 2 deletions .github/workflows/gpu_tests.yml
@@ -61,7 +61,7 @@ jobs:
if: needs.check-file-changes.outputs.any_changed == 'true'
# Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
runs-on: linux-amd64-gpu-l4-latest-1
timeout-minutes: 90
timeout-minutes: 120
container: &gpu_container
image: nvcr.io/nvidia/pytorch:25.06-py3
env:
@@ -80,7 +80,7 @@ jobs:
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
# Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
runs-on: linux-amd64-gpu-h100-latest-1
timeout-minutes: 90
timeout-minutes: 120
container: *gpu_container
steps: *gpu_steps
gpu-pr-required-check:
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -13,6 +13,7 @@ Model Optimizer Changelog (Linux)
- Allow specifying ``calib_seq`` in ``examples/llm_ptq`` to set the maximum sequence length for calibration.
- Add support for MCore MoE PTQ/QAT/QAD.
- Add support for multi-node PTQ and export with FSDP2 in ``examples/llm_ptq/multinode_ptq.py``. See `examples/llm_ptq/README.md <https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/llm_ptq#multi-node-post-training-quantization-with-fsdp2>`_ for more details.
- Add support for Nemotron Nano VL v1 & v2 models in FP8/NVFP4 PTQ workflow.
Review comment (Contributor):
⚠️ Potential issue | 🟡 Minor

Forward-dated release entry

0.39 (2025-11-07) is in the future (today is 2025-10-23). Please mark this as Unreleased/TBD to avoid confusion until the release is cut.

-0.39 (2025-11-07)
+0.39 (Unreleased)

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In CHANGELOG.rst around line 16, the release entry "0.39 (2025-11-07)" is
forward-dated; change the header to indicate it is not yet released (e.g., "0.39
(Unreleased)" or "0.39 (TBD)") and leave the content line "Add support for
Nemotron Nano VL v1 & v2 models in FP8/NVFP4 PTQ workflow." under that
Unreleased/TBD heading so the changelog does not show a future date.


**Documentation**

110 changes: 105 additions & 5 deletions examples/llm_ptq/example_utils.py
@@ -39,6 +39,91 @@
SPECULATIVE_MODEL_LIST = ["Eagle", "Medusa"]


def run_nemotron_vl_preview(
full_model, tokenizer, input_ids, pyt_ckpt_path, stage_name, allow_fallback=False
):
"""Run text-only and VL preview generation for Nemotron VL models.

Args:
full_model: The full VL model
tokenizer: The tokenizer
input_ids: Input tensor for generation
pyt_ckpt_path: Path to the model checkpoint
stage_name: Description of the stage (e.g., "before quantization", "after quantization")
allow_fallback: Whether to allow fallback to standard generate on failure

Returns:
Generated text response or None if generation failed
"""
from vlm_utils import run_text_only_generation, run_vl_preview_generation

print(f"Running text-only preview generation for Nemotron VL model ({stage_name})...")
question = tokenizer.decode(input_ids[0], skip_special_tokens=True)
generation_config = {
"max_new_tokens": 100,
"do_sample": False,
"eos_token_id": tokenizer.eos_token_id,
}

# Try text-only generation
text_response = run_text_only_generation(
full_model, tokenizer, question, generation_config, pyt_ckpt_path
)

if text_response is not None:
print(f"✅ Text-only generation successful: {text_response[:100]}...")
generated_ids = text_response
elif allow_fallback:
print("Text-only generation failed, falling back to standard generate...")
generated_ids = full_model.generate(input_ids, max_new_tokens=100)
else:
generated_ids = None

# Run additional VL test with images
print(f"Running additional VL test with images ({stage_name})...")
run_vl_preview_generation(full_model, tokenizer, pyt_ckpt_path, stage_name)

return generated_ids


def _is_multimodal_config(config):
"""Check if a config indicates a multimodal model (config-only version of is_multimodal_model)."""
return (
hasattr(config, "vision_config") # Standard vision config (e.g., Qwen2.5-VL)
or getattr(config, "model_type", "") == "phi4mm" # Phi-4 multimodal
or hasattr(config, "vision_lora") # Vision LoRA configurations
or hasattr(config, "audio_processor") # Audio processing capabilities
or (
hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer")
) # Image embedding layers
)


def is_nemotron_vl(model_or_config):
"""Check if model or config indicates a Nemotron VL model.

Args:
model_or_config: Either a model instance or a config object.

Returns:
bool: True if it's a Nemotron VL model, False otherwise.
"""
# Try to get config from model, or use directly if it's a config
if hasattr(model_or_config, "config"):
config = model_or_config.config
from modelopt.torch.export.model_utils import is_multimodal_model

if not is_multimodal_model(model_or_config):
return False
else:
config = model_or_config
if not _is_multimodal_config(config):
return False

architectures = getattr(config, "architectures", [])
return any("nemotron" in arch.lower() for arch in architectures)


def build_quant_cfg(
qformat,
kv_cache_qformat,
@@ -185,7 +270,21 @@ def get_model(
if device == "cpu":
device_map = "cpu"

# Prepare config kwargs for loading
config_kwargs = {"trust_remote_code": trust_remote_code} if trust_remote_code else {}

# Load config once and handle VL model detection
try:
hf_config = AutoConfig.from_pretrained(ckpt_path, **config_kwargs)
if is_nemotron_vl(hf_config):
print(
"Detected Nemotron VL model from config. "
"Disabling automatic device mapping for compatibility."
)
device_map = None
except Exception as e:
print(f"Error: Could not load config from {ckpt_path}: {e}")
raise RuntimeError(f"Failed to load model configuration from {ckpt_path}") from e
if attn_implementation is not None:
config_kwargs["attn_implementation"] = attn_implementation

@@ -207,11 +306,6 @@
)
model = hf_vila.llm
else:
hf_config = AutoConfig.from_pretrained(
ckpt_path,
**config_kwargs,
)

if use_seq_device_map:
device_map = "sequential"
# If we use sequential, set max_memory limit to ensure that the model does not occupy the full GPU
@@ -282,6 +376,12 @@ def get_model(
**model_kwargs,
)
model.eval()

# If device_map was disabled (None), manually move model to target device
if device_map is None and device != "cpu":
Collaborator: what if device == "cpu"?

Contributor Author: That was handled by HF's device_map="cpu" in L210.

print(f"Moving model to {device} device...")
model = model.to(device)

if device == "cuda" and not is_model_on_gpu(model):
print("Warning: Some parameters are not on a GPU. Calibration can be slow or hit OOM")

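To make the device-handling exchange above concrete, here is a minimal sketch of the new logic in get_model() as it reads from this diff. The load_model wrapper, its defaults, and the AutoModelForCausalLM call are illustrative stand-ins rather than the script's actual loader; is_nemotron_vl is the helper added in this file.

# Minimal sketch (assumed simplification) of the device-placement logic added to get_model().
from transformers import AutoConfig, AutoModelForCausalLM

from example_utils import is_nemotron_vl  # helper introduced in this PR


def load_model(ckpt_path: str, device: str = "cuda", trust_remote_code: bool = False):
    device_map = "auto"
    if device == "cpu":
        device_map = "cpu"  # HF places the whole model on CPU, so no manual move is needed later

    config = AutoConfig.from_pretrained(ckpt_path, trust_remote_code=trust_remote_code)
    if is_nemotron_vl(config):
        device_map = None  # automatic device mapping is disabled for Nemotron VL compatibility

    model = AutoModelForCausalLM.from_pretrained(
        ckpt_path, device_map=device_map, trust_remote_code=trust_remote_code
    )
    # Manual move only when auto mapping was disabled and a non-CPU target was requested.
    if device_map is None and device != "cpu":
        model = model.to(device)
    return model.eval()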
108 changes: 75 additions & 33 deletions examples/llm_ptq/hf_ptq.py
@@ -30,6 +30,8 @@
get_processor,
get_tokenizer,
is_enc_dec,
is_nemotron_vl,
run_nemotron_vl_preview,
)
from transformers import (
AutoConfig,
@@ -48,7 +50,7 @@
export_tensorrt_llm_checkpoint,
get_model_type,
)
from modelopt.torch.export.model_utils import is_multimodal_model
from modelopt.torch.export.model_utils import get_language_model_from_vl, is_multimodal_model
from modelopt.torch.quantization.config import need_calibration
from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
from modelopt.torch.quantization.utils import is_quantized
@@ -283,6 +285,9 @@ def main(args):

full_model = model

# Detect if this is a Nemotron VL model using architecture-based detection
is_nemotron_vl_model = is_nemotron_vl(full_model)

if model_type == "mllama":
processor = get_processor(
args.pyt_ckpt_path,
@@ -312,15 +317,8 @@
tokenizer.padding_side = "left"

# We only quantize the language model for VLMs other than the type supported above.
if hasattr(model, "language_model"):
parent_model = model # llama4 case
if isinstance(type(model).__dict__.get("language_model"), property):
assert hasattr(model, "model") and hasattr(model.model, "language_model"), (
"Expected language_model in model.model, but attribute not found. "
"This may indicate an unsupported model structure."
)
parent_model = model.model # gemma3, qwen2.5 VL case

language_model, parent_model = get_language_model_from_vl(model)
if language_model is not None:
disabled_quant_cfg = {
"quant_cfg": {"default": {"enable": False}},
"algorithm": "max",
Expand All @@ -331,7 +329,7 @@ def main(args):
if name != "language_model":
mtq.quantize(child, disabled_quant_cfg, forward_loop=None)

model = model.language_model
model = language_model
model_type = get_model_type(model)

if model_type == "phi4mm":
@@ -458,34 +456,65 @@
KV_QUANT_CFG_CHOICES,
)

# For Nemotron VL models, disable quantization of vision components
if is_nemotron_vl_model:
print("Disabling quantization for vision components in Nemotron VL model")
quant_cfg["quant_cfg"]["*vision*"] = {"enable": False}
quant_cfg["quant_cfg"]["*image*"] = {"enable": False}
# Also disable radio model components specifically
quant_cfg["quant_cfg"]["*radio*"] = {"enable": False}
quant_cfg["quant_cfg"]["*visual*"] = {"enable": False}

if not model_is_already_quantized or calibration_only:
# Only run single sample for preview
input_ids = next(iter(calib_dataloader))[
"input_features" if model_type == "whisper" else "input_ids"
][0:1]
try:
generated_ids_before_ptq = full_model.generate(input_ids, max_new_tokens=100)
except Exception as e:
print(
"Error during model generation. Please check if your transformers version is "
"compatible with the model."

# Generate preview before quantization
if is_nemotron_vl_model and tokenizer is not None:
generated_ids_before_ptq = run_nemotron_vl_preview(
full_model,
tokenizer,
input_ids,
args.pyt_ckpt_path,
"before quantization",
allow_fallback=True,
)
print(f"Error details: {e}")
raise
else:
# Standard generation for non-Nemotron VL models
generated_ids_before_ptq = full_model.generate(input_ids, max_new_tokens=100)
if model_type == "gptoss" and args.qformat == "nvfp4_mlp_only":
print("Applying nvfp4 quantization (MoE only) for gpt-oss")

# quantize the model
model = quantize_model(model, quant_cfg, args, calib_dataloader, calibration_only)

# For VL models, update full_model to use the quantized language model
if is_nemotron_vl_model:
_, parent_model = get_language_model_from_vl(full_model)
if parent_model is not None:
print("Updating full_model with quantized language_model...")
parent_model.language_model = model

if args.verbose:
mtq.print_quant_summary(model)

# Run some samples
torch.cuda.empty_cache()
generated_ids_after_ptq = None
if model_type != "llama4":
if model_type != "llama4" and not is_nemotron_vl_model:
# Our fake quantizer may not be fully compatible with torch.compile.
generated_ids_after_ptq = full_model.generate(input_ids, max_new_tokens=100)
elif is_nemotron_vl_model and tokenizer is not None:
generated_ids_after_ptq = run_nemotron_vl_preview(
full_model,
tokenizer,
input_ids,
args.pyt_ckpt_path,
"after quantization",
allow_fallback=False,
)
else:
warnings.warn(
"Llama4 Maverick generation after quantization has a bug. Skipping generation sample."
Expand Down Expand Up @@ -518,15 +547,25 @@ def output_decode(generated_ids, input_shape):

if generated_ids_after_ptq is not None:
print("--------")
print(f"example test input: {input_decode(input_ids)}")
print("--------")
print(
f"example outputs before ptq: {output_decode(generated_ids_before_ptq, input_ids.shape[1])}"
)
print("--------")
print(
f"example outputs after ptq: {output_decode(generated_ids_after_ptq, input_ids.shape[1])}"
)
if is_nemotron_vl_model:
# For Nemotron VL models, generated_ids are text strings from model.chat()
print("Nemotron VL model text-only generation results:")
print(f"Text response before quantization: {generated_ids_before_ptq}")
print("--------")
print(f"Text response after quantization: {generated_ids_after_ptq}")
print("--------")
print("Note: Additional VL tests with images were run separately above")
else:
# For regular LLMs, generated_ids are token tensors that need decoding
print(f"example test input: {input_decode(input_ids)}")
print("--------")
print(
f"example outputs before ptq: {output_decode(generated_ids_before_ptq, input_ids.shape[1])}"
)
print("--------")
print(
f"example outputs after ptq: {output_decode(generated_ids_after_ptq, input_ids.shape[1])}"
)
else:
warnings.warn("Skipping quantization: model is already quantized.")

@@ -548,9 +587,12 @@ def output_decode(generated_ids, input_shape):
# Save original model config and the processor config to the export path for VLMs.
print(f"Saving original model config to {export_path}")

AutoConfig.from_pretrained(
args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
).save_pretrained(export_path)
config_kwargs = {"trust_remote_code": args.trust_remote_code}
if args.attn_implementation is not None:
config_kwargs["attn_implementation"] = args.attn_implementation
AutoConfig.from_pretrained(args.pyt_ckpt_path, **config_kwargs).save_pretrained(
export_path
)

# Try to save processor config if available
try:
Expand Down Expand Up @@ -748,7 +790,7 @@ def output_decode(generated_ids, input_shape):
parser.add_argument(
"--attn_implementation",
help=(
"Specify the attention implementation to use."
"Specify the attention implementation to use. "
"This arg will be passed to the HF model loading if specified."
),
default=None,
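Taken together, the hf_ptq.py changes follow one pattern for Nemotron VL checkpoints: exclude the vision towers from quantization, quantize only the language model, re-attach it to the full model, and preview through run_nemotron_vl_preview. The sketch below condenses that flow; the quantize_nemotron_vl wrapper and the quantize_fn callback are hypothetical stand-ins for the script's quantize_model path, while the imported helpers are the ones this PR adds or uses.

# Condensed sketch of the Nemotron VL PTQ flow in this PR (wrapper and callback are hypothetical).
from example_utils import is_nemotron_vl, run_nemotron_vl_preview

from modelopt.torch.export.model_utils import get_language_model_from_vl


def quantize_nemotron_vl(full_model, tokenizer, input_ids, quant_cfg, quantize_fn, ckpt_path):
    if not is_nemotron_vl(full_model):
        return quantize_fn(full_model, quant_cfg)  # regular LLM path, unchanged by this PR

    # Vision-side modules are excluded from quantization via wildcard patterns.
    for pattern in ("*vision*", "*image*", "*radio*", "*visual*"):
        quant_cfg["quant_cfg"][pattern] = {"enable": False}

    # Only the language model is calibrated and quantized.
    language_model, parent_model = get_language_model_from_vl(full_model)
    quantized_lm = quantize_fn(language_model, quant_cfg)

    # Re-attach so the full VL model (vision encoder + LM) is used for the preview run.
    if parent_model is not None:
        parent_model.language_model = quantized_lm

    # Text-only preview plus an image-based VL test, as added in example_utils.py.
    run_nemotron_vl_preview(
        full_model, tokenizer, input_ids, ckpt_path, "after quantization", allow_fallback=False
    )
    return full_model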