Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions vllm/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@
VLLM_RAY_DP_PACK_STRATEGY: Literal["strict", "fill", "span"] = "strict"
VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
VLLM_MXFP4_USE_MARLIN: bool | None = None
VLLM_MXFP4_USE_TRITON: bool | None = None
VLLM_V1_USE_OUTLINES_CACHE: bool = False
VLLM_TPU_BUCKET_PADDING_GAP: int = 0
VLLM_TPU_MOST_MODEL_LEN: int | None = None
Expand Down Expand Up @@ -1039,6 +1040,10 @@ def get_vllm_port() -> int | None:
"VLLM_MXFP4_USE_MARLIN": lambda: maybe_convert_bool(
os.environ.get("VLLM_MXFP4_USE_MARLIN", None)
),
# Whether to use the Triton kernels for the MXFP4 quantization method
"VLLM_MXFP4_USE_TRITON": lambda: maybe_convert_bool(
os.environ.get("VLLM_MXFP4_USE_TRITON", None)
),
# Whether to turn on the outlines cache for V1
# This cache is unbounded and on disk, so it's not safe to use in
# an environment with potentially malicious users.
Expand Down
5 changes: 5 additions & 0 deletions vllm/model_executor/layers/quantization/mxfp4.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,13 @@ def get_mxfp4_backend():
)

# If FlashInfer is not available, try either Marlin or Triton
# For SM90, default to Marlin unless Triton is explicitly requested
# via the VLLM_MXFP4_USE_TRITON environment variable
if (
envs.VLLM_MXFP4_USE_MARLIN
or (
current_platform.get_device_capability()[0] == 9
and not envs.VLLM_MXFP4_USE_TRITON
)
or current_platform.get_device_capability()[0] < 9
or not has_triton_kernels()
or not is_torch_equal_or_newer("2.8.0")
Expand Down