vllm-project · mmangkad · Oct 26, 2025
diff --git a/vllm/envs.py b/vllm/envs.py
@@ -139,6 +139,7 @@
     VLLM_RAY_DP_PACK_STRATEGY: Literal["strict", "fill", "span"] = "strict"
     VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
     VLLM_MXFP4_USE_MARLIN: bool | None = None
+    VLLM_MXFP4_USE_TRITON: bool | None = None
     VLLM_V1_USE_OUTLINES_CACHE: bool = False
     VLLM_TPU_BUCKET_PADDING_GAP: int = 0
     VLLM_TPU_MOST_MODEL_LEN: int | None = None
@@ -1039,6 +1040,10 @@ def get_vllm_port() -> int | None:
     "VLLM_MXFP4_USE_MARLIN": lambda: maybe_convert_bool(
         os.environ.get("VLLM_MXFP4_USE_MARLIN", None)
     ),
+    # Whether to use the Triton kernels for the MXFP4 quantization method
+    "VLLM_MXFP4_USE_TRITON": lambda: maybe_convert_bool(
+        os.environ.get("VLLM_MXFP4_USE_TRITON", None)
+    ),
     # Whether to turn on the outlines cache for V1
     # This cache is unbounded and on disk, so it's not safe to use in
     # an environment with potentially malicious users.

@@ -115,8 +115,13 @@ def get_mxfp4_backend():
             )
 
         # If FlashInfer is not available, try either Marlin or Triton
+        # On SM90, default to Marlin unless VLLM_MXFP4_USE_TRITON is enabled
         if (
             envs.VLLM_MXFP4_USE_MARLIN
+            or (
+                current_platform.get_device_capability()[0] == 9
+                and not envs.VLLM_MXFP4_USE_TRITON
+            )
             or current_platform.get_device_capability()[0] < 9
             or not has_triton_kernels()
             or not is_torch_equal_or_newer("2.8.0")