diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml
new file mode 100644
index 000000000000..199c33159fde
--- /dev/null
+++ b/.buildkite/ci_config.yaml
@@ -0,0 +1,24 @@
+name: vllm_ci
+job_dirs:
+ - ".buildkite/test_areas"
+ - ".buildkite/image_build"
+run_all_patterns:
+ - "docker/Dockerfile"
+ - "CMakeLists.txt"
+ - "requirements/common.txt"
+ - "requirements/cuda.txt"
+ - "requirements/build.txt"
+ - "requirements/test.txt"
+ - "setup.py"
+ - "csrc/"
+ - "cmake/"
+run_all_exclude_patterns:
+ - "docker/Dockerfile."
+ - "csrc/cpu/"
+ - "csrc/rocm/"
+ - "cmake/hipify.py"
+ - "cmake/cpu_extension.cmake"
+registries: public.ecr.aws/q9t5s3a7
+repositories:
+ main: "vllm-ci-postmerge-repo"
+ premerge: "vllm-ci-test-repo"
diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py
deleted file mode 100644
index bbed80ebe847..000000000000
--- a/.buildkite/generate_index.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-import os
-
-template = """
-
-
- Links for vLLM
- {x86_wheel}
- {arm_wheel}
-
-
-"""
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--wheel", help="The wheel path.", required=True)
-args = parser.parse_args()
-
-filename = os.path.basename(args.wheel)
-
-with open("index.html", "w") as f:
- print(f"Generated index.html for {args.wheel}")
- # sync the abi tag with .buildkite/scripts/upload-wheels.sh
- if "x86_64" in filename:
- x86_wheel = filename
- arm_wheel = filename.replace("x86_64", "aarch64").replace(
- "manylinux1", "manylinux2014"
- )
- elif "aarch64" in filename:
- x86_wheel = filename.replace("aarch64", "x86_64").replace(
- "manylinux2014", "manylinux1"
- )
- arm_wheel = filename
- else:
- raise ValueError(f"Unsupported wheel: {filename}")
- # cloudfront requires escaping the '+' character
- f.write(
- template.format(
- x86_wheel=x86_wheel,
- x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
- arm_wheel=arm_wheel,
- arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
- )
- )
diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh
new file mode 100755
index 000000000000..9a2384e524b6
--- /dev/null
+++ b/.buildkite/image_build/image_build.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 8 ]]; then
+ echo "Usage: $0 <registry> <repo> <buildkite_commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
+ exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+BRANCH=$4
+VLLM_USE_PRECOMPILED=$5
+VLLM_MERGE_BASE_COMMIT=$6
+CACHE_FROM=$7
+CACHE_TO=$8
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+
+# docker buildx
+docker buildx create --name vllm-builder --driver docker-container --use
+docker buildx inspect --bootstrap
+docker buildx ls
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
+ echo "Image not found, proceeding with build..."
+else
+ echo "Image found"
+ exit 0
+fi
+
+if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
+ merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
+else
+ merge_base_commit_build_args=""
+fi
+
+# build
+docker buildx build --file docker/Dockerfile \
+ --build-arg max_jobs=16 \
+ --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
+ --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
+ --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
+ ${merge_base_commit_build_args} \
+ --cache-from type=registry,ref=${CACHE_FROM},mode=max \
+ --cache-to type=registry,ref=${CACHE_TO},mode=max \
+ --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
+ $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
+ --push \
+ --target test \
+ --progress plain .
diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml
new file mode 100644
index 000000000000..d01c71dd9bec
--- /dev/null
+++ b/.buildkite/image_build/image_build.yaml
@@ -0,0 +1,57 @@
+group: Abuild
+steps:
+ - label: ":docker: Build image"
+ key: image-build
+ depends_on: []
+ commands:
+ - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 2
+ - exit_status: -10 # Agent was lost
+ limit: 2
+
+ - label: ":docker: Build CPU image"
+ key: image-build-cpu
+ depends_on: []
+ commands:
+ - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+ env:
+ DOCKER_BUILDKIT: "1"
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 2
+ - exit_status: -10 # Agent was lost
+ limit: 2
+
+ - label: ":docker: Build HPU image"
+ soft_fail: true
+ depends_on: []
+ key: image-build-hpu
+ commands:
+ - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+ env:
+ DOCKER_BUILDKIT: "1"
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 2
+ - exit_status: -10 # Agent was lost
+ limit: 2
+
+ - label: ":docker: Build CPU arm64 image"
+ key: cpu-arm64-image-build
+ depends_on: []
+ optional: true
+ commands:
+ - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+ env:
+ DOCKER_BUILDKIT: "1"
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 2
+ - exit_status: -10 # Agent was lost
+ limit: 2
diff --git a/.buildkite/image_build/image_build_cpu.sh b/.buildkite/image_build/image_build_cpu.sh
new file mode 100755
index 000000000000..a69732f43098
--- /dev/null
+++ b/.buildkite/image_build/image_build_cpu.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+ echo "Usage: $0 <registry> <repo> <buildkite_commit>"
+ exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+ echo "Image not found, proceeding with build..."
+else
+ echo "Image found"
+ exit 0
+fi
+
+# build
+docker build --file docker/Dockerfile.cpu \
+ --build-arg max_jobs=16 \
+ --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+ --build-arg VLLM_CPU_AVX512BF16=true \
+ --build-arg VLLM_CPU_AVX512VNNI=true \
+ --build-arg VLLM_CPU_AMXBF16=true \
+ --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+ --target vllm-test \
+ --progress plain .
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
diff --git a/.buildkite/image_build/image_build_cpu_arm64.sh b/.buildkite/image_build/image_build_cpu_arm64.sh
new file mode 100755
index 000000000000..615298b6555b
--- /dev/null
+++ b/.buildkite/image_build/image_build_cpu_arm64.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+ echo "Usage: $0 <registry> <repo> <buildkite_commit>"
+ exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+ echo "Image not found, proceeding with build..."
+else
+ echo "Image found"
+ exit 0
+fi
+
+# build
+docker build --file docker/Dockerfile.cpu \
+ --build-arg max_jobs=16 \
+ --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+ --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+ --target vllm-test \
+ --progress plain .
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
diff --git a/.buildkite/image_build/image_build_hpu.sh b/.buildkite/image_build/image_build_hpu.sh
new file mode 100755
index 000000000000..192447ef4577
--- /dev/null
+++ b/.buildkite/image_build/image_build_hpu.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+ echo "Usage: $0 <registry> <repo> <buildkite_commit>"
+ exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
+ echo "Image not found, proceeding with build..."
+else
+ echo "Image found"
+ exit 0
+fi
+
+# build
+docker build \
+ --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
+ --build-arg max_jobs=16 \
+ --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+ --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
+ --progress plain \
+ https://github.com/vllm-project/vllm-gaudi.git
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
index 46f1a9fbf6ff..6c0b5540cbb6 100644
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -8,3 +8,4 @@ tasks:
value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5
+rtol: 0.05
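+# Overrides the DEFAULT_RTOL (0.08) defined in .buildkite/lm-eval-harness/test_lm_eval_correctness.py.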
diff --git a/.buildkite/lm-eval-harness/configs/models-large-rocm.txt b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
new file mode 100644
index 000000000000..4fb0b84bc4d8
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index 3627b760eddc..f94d681197d2 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -9,11 +9,40 @@
--tp-size=1
"""
+import os
+from contextlib import contextmanager
+
import lm_eval
import numpy as np
import yaml
-RTOL = 0.08
+DEFAULT_RTOL = 0.08
+
+
+@contextmanager
+def scoped_env_vars(new_env: dict[str, str]):
+ if not new_env:
+ # Fast path: nothing to do
+ yield
+ return
+
+ old_values = {}
+ new_keys = []
+
+ try:
+ for key, value in new_env.items():
+ if key in os.environ:
+ old_values[key] = os.environ[key]
+ else:
+ new_keys.append(key)
+ os.environ[key] = str(value)
+ yield
+ finally:
+ # Restore / clean up
+ for key, value in old_values.items():
+ os.environ[key] = value
+ for key in new_keys:
+ os.environ.pop(key, None)
def launch_lm_eval(eval_config, tp_size):
@@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size):
f"trust_remote_code={trust_remote_code},"
f"max_model_len={max_model_len},"
)
- results = lm_eval.simple_evaluate(
- model=backend,
- model_args=model_args,
- tasks=[task["name"] for task in eval_config["tasks"]],
- num_fewshot=eval_config["num_fewshot"],
- limit=eval_config["limit"],
- # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
- # text models. however, this is regressing measured strict-match for
- # existing text models in CI, so only apply it for mm, or explicitly set
- apply_chat_template=eval_config.get(
- "apply_chat_template", backend == "vllm-vlm"
- ),
- fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
- # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
- gen_kwargs=eval_config.get("gen_kwargs"),
- batch_size=batch_size,
- )
+
+ env_vars = eval_config.get("env_vars", None)
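+ # env_vars, if present in the model config, are applied only for the duration of
+ # this evaluation via scoped_env_vars, e.g. (hypothetical config snippet):
+ #   env_vars:
+ #     VLLM_ATTENTION_BACKEND: FLASH_ATTN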
+ with scoped_env_vars(env_vars):
+ results = lm_eval.simple_evaluate(
+ model=backend,
+ model_args=model_args,
+ tasks=[task["name"] for task in eval_config["tasks"]],
+ num_fewshot=eval_config["num_fewshot"],
+ limit=eval_config["limit"],
+ # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
+ # text models. however, this is regressing measured strict-match for
+ # existing text models in CI, so only apply it for mm, or explicitly set
+ apply_chat_template=eval_config.get(
+ "apply_chat_template", backend == "vllm-vlm"
+ ),
+ fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+ # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+ gen_kwargs=eval_config.get("gen_kwargs"),
+ batch_size=batch_size,
+ )
return results
@@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
results = launch_lm_eval(eval_config, tp_size)
+ rtol = eval_config.get("rtol", DEFAULT_RTOL)
+
success = True
for task in eval_config["tasks"]:
for metric in task["metrics"]:
@@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
measured_value = results["results"][task["name"]][metric["name"]]
print(
f"{task['name']} | {metric['name']}: "
- f"ground_truth={ground_truth} | measured={measured_value}"
+ f"ground_truth={ground_truth:.3f} | "
+ f"measured={measured_value:.3f} | rtol={rtol}"
)
- success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
+ success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
assert success
diff --git a/.buildkite/performance-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md
index 6d494f64f14f..015f48c2520d 100644
--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@@ -108,6 +108,65 @@ The number of this test is less stable compared to the delay and latency benchma
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
+#### Default Parameters Field
+
+Default parameters can be specified in a top-level JSON field with the key `defaults`. Parameters defined in this field apply to all serving tests and can be overridden by the corresponding fields of individual test cases. Here is an example:
+
+<details>
+<summary>An example of the default parameters field</summary>
+
+```json
+{
+  "defaults": {
+    "qps_list": [
+      "inf"
+    ],
+    "server_environment_variables": {
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+    },
+    "server_parameters": {
+      "tensor_parallel_size": 1,
+      "dtype": "bfloat16",
+      "block_size": 128,
+      "disable_log_stats": "",
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "backend": "vllm",
+      "dataset_name": "random",
+      "random-input-len": 128,
+      "random-output-len": 128,
+      "num_prompts": 200,
+      "ignore-eos": ""
+    }
+  },
+  "tests": [
+    {
+      "test_name": "serving_llama3B_tp2_random_128_128",
+      "server_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct"
+      }
+    },
+    {
+      "test_name": "serving_qwen3_tp4_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-14B",
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-14B"
+      }
+    }
+  ]
+}
+```
+
+</details>
+
### Visualizing the results
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
diff --git a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
index 99a5a5e334f8..34ceefe0996f 100644
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -110,7 +110,8 @@ json2envs() {
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
- timeout 1200 bash -c '
+ local timeout_val="1200"
+ timeout "$timeout_val" bash -c '
until curl -X POST localhost:8000/v1/completions; do
sleep 1
done' && return 0 || return 1
@@ -316,12 +317,44 @@ run_throughput_tests() {
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
+ #
+ # Supported JSON formats:
+ # 1) Plain format: top-level array
+ # [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+ #
+ # 2) Default parameters field + plain format tests
+ # {
+ # "defaults": { ... },
+ # "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+ # }
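+ # Example of the merge semantics (hypothetical values): if "defaults" contains
+ #   "client_parameters": { "backend": "vllm", "num_prompts": 200 }
+ # and a test case contains
+ #   "client_parameters": { "num_prompts": 1000 }
+ # then the merged test runs with backend "vllm" and num_prompts 1000, because
+ # per-test values overwrite the defaults in the jq object merges below.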
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
- jq -c '.[]' "$serving_test_file" | while read -r params; do
+ jq -c '
+ if type == "array" then
+ # Plain format: test cases array
+ .[]
+ elif (type == "object" and has("tests")) then
+ # merge the default parameters into each test case
+ . as $root
+ | ($root.defaults // {}) as $d
+ | ($root.tests // [])[]
+ # default qps / max_concurrency from defaults if missing
+ | .qps_list = (.qps_list // $d.qps_list)
+ | .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
+ # merge envs / params: test overrides defaults
+ | .server_environment_variables =
+ (($d.server_environment_variables // {}) + (.server_environment_variables // {}))
+ | .server_parameters =
+ (($d.server_parameters // {}) + (.server_parameters // {}))
+ | .client_parameters =
+ (($d.client_parameters // {}) + (.client_parameters // {}))
+ else
+ error("Unsupported serving test file format: must be array or object with .tests")
+ end
+ ' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -335,20 +368,25 @@ run_serving_tests() {
continue
fi
- # get client and server arguments
+ # get client and server arguments (after merging the default parameters)
server_params=$(echo "$params" | jq -r '.server_parameters')
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters')
+
server_args=$(json2args "$server_params")
server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params")
+
+ # qps_list
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
+
+ # max_concurrency_list (fallback to num_prompts if missing)
max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
- num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
- max_concurrency_list="[$num_prompts]"
+ num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
+ max_concurrency_list="[$num_prompts]"
fi
max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
echo "Running over max concurrency list $max_concurrency_list"
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
deleted file mode 100644
index f758097e098e..000000000000
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
+++ /dev/null
@@ -1,610 +0,0 @@
-[
- {
- "test_name": "serving_llama8B_bf16_tp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- }
-]
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
deleted file mode 100644
index 0b1a42e79025..000000000000
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
+++ /dev/null
@@ -1,1023 +0,0 @@
-[
- {
- "test_name": "serving_llama8B_bf16_pp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_pp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_pp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_pp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_pp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_pp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- }
-]
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
index f792956f3947..8f7200862d20 100644
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -1,276 +1,246 @@
-[
+{
+ "defaults": {
+ "qps_list": [
+ "inf"
+ ],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "ignore-eos": "",
+ "num_prompts": 200
+ }
+ },
+ "tests": [
+ {
+ "test_name": "serving_llama8B_tp1_sharegpt",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_sharegpt",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp4_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp4_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
+ },
{
- "test_name": "serving_llama8B_tp1_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 32
- }
+ "test_name": "serving_llama8B_tp4_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
},
{
- "test_name": "serving_llama8B_tp2_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 32
- }
+ "test_name": "serving_llama3B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.2-3B-Instruct",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.2-3B-Instruct",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
},
{
- "test_name": "serving_llama8B_tp1_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 32
- }
+ "test_name": "serving_granite2B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "ibm-granite/granite-3.2-2b-instruct",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "ibm-granite/granite-3.2-2b-instruct",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
},
{
- "test_name": "serving_llama8B_tp2_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 32
- }
+ "test_name": "serving_qwen1.7B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-1.7B",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-1.7B",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
},
{
- "test_name": "serving_llama8B_tp1_random_128_2048",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 2048,
- "ignore-eos": "",
- "num_prompts": 32
- }
+ "test_name": "serving_qwen4B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-4B",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-4B",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
},
{
- "test_name": "serving_llama8B_tp2_random_128_2048",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 2048,
- "ignore-eos": "",
- "num_prompts": 32
- }
+ "test_name": "serving_qwen8B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-8B",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-8B",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
},
{
- "test_name": "serving_llama8B_tp1_random_2048_128",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 2048,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 32
- }
+ "test_name": "serving_glm9B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "zai-org/glm-4-9b-hf",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "zai-org/glm-4-9b-hf",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
},
{
- "test_name": "serving_llama8B_tp2_random_2048_128",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 2048,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 32
- }
+ "test_name": "serving_gemma7B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "google/gemma-7b",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "google/gemma-7b",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
}
-]
+ ]
+}
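
The reworked serving-tests-cpu.json above factors the shared settings into a top-level "defaults" block, with each entry under "tests" carrying only its overrides. The following is a minimal sketch of how such a file could be expanded back into full per-test configurations; the merge helper and the local file name are illustrative assumptions, not the benchmark runner's actual code.

    import json

    def merge(defaults: dict, override: dict) -> dict:
        """Recursively overlay per-test overrides on top of the shared defaults."""
        out = dict(defaults)
        for key, value in override.items():
            if isinstance(value, dict) and isinstance(out.get(key), dict):
                out[key] = merge(out[key], value)
            else:
                out[key] = value
        return out

    with open("serving-tests-cpu.json") as f:  # hypothetical local copy of the suite
        suite = json.load(f)

    expanded = [merge(suite["defaults"], test) for test in suite["tests"]]
    for test in expanded:
        # e.g. serving_llama8B_tp2_sharegpt -> tensor_parallel_size 2, sharegpt dataset
        print(test["test_name"], test["server_parameters"]["tensor_parallel_size"])
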
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 38c400ba1faf..a9d51557bd9b 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -8,41 +8,43 @@ steps:
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- # aarch64 build
- - label: "Build arm64 CPU wheel"
+ - label: "Build arm64 wheel - CUDA 13.0"
depends_on: ~
- id: build-wheel-arm64-cpu
+ id: build-wheel-arm64-cuda-13-0
agents:
queue: arm64_cpu_queue_postmerge
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+ # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
+ # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
+ - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
- # x86 + CUDA builds
- - label: "Build wheel - CUDA 12.8"
+ # aarch64 build
+ - label: "Build arm64 CPU wheel"
depends_on: ~
- id: build-wheel-cuda-12-8
+ id: build-wheel-arm64-cpu
agents:
- queue: cpu_queue_postmerge
+ queue: arm64_cpu_queue_postmerge
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
+ - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
+ # x86 + CUDA builds
- label: "Build wheel - CUDA 12.9"
depends_on: ~
id: build-wheel-cuda-12-9
@@ -52,7 +54,7 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
+ - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
env:
DOCKER_BUILDKIT: "1"
@@ -65,7 +67,21 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
+ - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+ env:
+ DOCKER_BUILDKIT: "1"
+
+ # x86 CPU wheel build
+ - label: "Build x86 CPU wheel"
+ depends_on: ~
+ id: build-wheel-x86-cpu
+ agents:
+ queue: cpu_queue_postmerge
+ commands:
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+ - "mkdir artifacts"
+ - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+ - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
@@ -109,7 +125,6 @@ steps:
- label: "Annotate release workflow"
depends_on:
- create-multi-arch-manifest
- - build-wheel-cuda-12-8
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py
new file mode 100644
index 000000000000..d0965fbd5640
--- /dev/null
+++ b/.buildkite/scripts/generate-nightly-index.py
@@ -0,0 +1,400 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# do not complain about line length (for docstring)
+# ruff: noqa: E501
+
+import argparse
+import json
+import sys
+from dataclasses import asdict, dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+from urllib.parse import quote
+
+import regex as re
+
+if not sys.version_info >= (3, 12):
+ raise RuntimeError("This script requires Python 3.12 or higher.")
+
+INDEX_HTML_TEMPLATE = """
+
+
+
+
+{items}
+
+
+"""
+
+
+@dataclass
+class WheelFileInfo:
+ package_name: str
+ version: str
+ build_tag: str | None
+ python_tag: str
+ abi_tag: str
+ platform_tag: str
+ variant: str | None
+ filename: str
+
+
+def parse_from_filename(file: str) -> WheelFileInfo:
+ """
+ Parse wheel file name to extract metadata.
+
+ The format of wheel names:
+ {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
+ Any version may carry a variant suffix such as '+cu129', '.cpu', or '.rocm' (or none at all).
+ Example:
+ vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
+ vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
+ vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
+ vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
+ """
+ wheel_file_re = re.compile(
+ r"^(?P.+)-(?P[^-]+?)(-(?P[^-]+))?-(?P[^-]+)-(?P[^-]+)-(?P[^-]+)\.whl$"
+ )
+ match = wheel_file_re.match(file)
+ if not match:
+ raise ValueError(f"Invalid wheel file name: {file}")
+
+ package_name = match.group("package_name")
+ version = match.group("version")
+ build_tag = match.group("build_tag")
+ python_tag = match.group("python_tag")
+ abi_tag = match.group("abi_tag")
+ platform_tag = match.group("platform_tag")
+
+ # extract variant from version
+ variant = None
+ if "dev" in version:
+ ver_after_dev = version.split("dev")[-1]
+ if "." in ver_after_dev:
+ variant = ver_after_dev.split(".")[-1]
+ version = version.removesuffix("." + variant)
+ else:
+ if "+" in version:
+ version, variant = version.split("+")
+
+ return WheelFileInfo(
+ package_name=package_name,
+ version=version,
+ build_tag=build_tag,
+ python_tag=python_tag,
+ abi_tag=abi_tag,
+ platform_tag=platform_tag,
+ variant=variant,
+ filename=file,
+ )
+
+
+def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
+ """
+ Generate project list HTML content linking to each project & variant sub-directory.
+ """
+ href_tags = []
+ for name in sorted(subdir_names):
+ name = name.strip("/").strip(".")
+ href_tags.append(f'    <a href="{name}/">{name}/</a><br/>')
+ return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
+
+
+def generate_package_index_and_metadata(
+ wheel_files: list[WheelFileInfo],
+ wheel_base_dir: Path,
+ index_base_dir: Path,
+ comment: str = "",
+) -> tuple[str, str]:
+ """
+ Generate package index HTML content for a specific package, linking to actual wheel files.
+ """
+ href_tags = []
+ metadata = []
+ for file in sorted(wheel_files, key=lambda x: x.filename):
+ relative_path = (
+ wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
+ )
+ # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
+ # NOTE: this is AWS S3 specific behavior!
+ file_path_quoted = quote(relative_path.as_posix(), safe=":%/")
+ href_tags.append(f'    <a href="{file_path_quoted}">{file.filename}</a><br/>')
+ file_meta = asdict(file)
+ file_meta["path"] = file_path_quoted
+ metadata.append(file_meta)
+ index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
+ metadata_str = json.dumps(metadata, indent=2)
+ return index_str, metadata_str
+
+
+def generate_index_and_metadata(
+ whl_files: list[str],
+ wheel_base_dir: Path,
+ index_base_dir: Path,
+ default_variant: str | None = None,
+ alias_to_default: str | None = None,
+ comment: str = "",
+):
+ """
+ Generate index for all wheel files.
+
+ Args:
+ whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
+ wheel_base_dir (Path): Base directory for wheel files.
+ index_base_dir (Path): Base directory to store index files.
+ default_variant (str | None): The default variant name, if any.
+ alias_to_default (str | None): Alias variant name for the default variant, if any.
+ comment (str | None): Optional comment to include in the generated HTML files.
+
+ First, parse all wheel files to extract metadata.
+ We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
+ The index for the default variant (if any) is generated in the root index directory.
+
+ If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
+ is purely a copy of the corresponding variant index, with only the links adjusted.
+ Otherwise, all wheels without variant suffixes are treated as the default variant.
+
+ If `alias_to_default` is provided, an additional alias sub-directory is created; it has the same content
+ as the default variant index, with only the links adjusted accordingly.
+
+ Index directory structure:
+ index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
+ index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories
+ vllm/
+ index.html # package index, pointing to actual files in wheel_base_dir (relative path)
+ metadata.json # machine-readable metadata for all wheels in this package
+ cpu/ # cpu variant sub-directory
+ index.html
+ vllm/
+ index.html
+ metadata.json
+ cu129/ # cu129 is actually the alias to default variant
+ index.html
+ vllm/
+ index.html
+ metadata.json
+ cu130/ # cu130 variant sub-directory
+ index.html
+ vllm/
+ index.html
+ metadata.json
+ ...
+
+ metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
+ [
+ {
+ "package_name": "vllm",
+ "version": "0.10.2rc2",
+ "build_tag": null,
+ "python_tag": "cp38",
+ "abi_tag": "abi3",
+ "platform_tag": "manylinux2014_aarch64",
+ "variant": "cu129",
+ "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
+ "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL and URL-encoded
+ },
+ ...
+ ]
+ """
+
+ parsed_files = [parse_from_filename(f) for f in whl_files]
+
+ if not parsed_files:
+ print("No wheel files found, skipping index generation.")
+ return
+
+ # Group by variant
+ variant_to_files: dict[str, list[WheelFileInfo]] = {}
+ for file in parsed_files:
+ variant = file.variant or "default"
+ if variant not in variant_to_files:
+ variant_to_files[variant] = []
+ variant_to_files[variant].append(file)
+
+ print(f"Found variants: {list(variant_to_files.keys())}")
+
+ # sanity check for default variant
+ if default_variant:
+ if "default" in variant_to_files:
+ raise ValueError(
+ "All wheel files must have variant suffixes when `default_variant` is specified."
+ )
+ if default_variant not in variant_to_files:
+ raise ValueError(
+ f"Default variant '{default_variant}' not found among wheel files."
+ )
+
+ if alias_to_default:
+ if "default" not in variant_to_files:
+ # e.g. only some wheels are uploaded to S3 currently
+ print(
+ "[WARN] Alias to default variant specified, but no default variant found."
+ )
+ elif alias_to_default in variant_to_files:
+ raise ValueError(
+ f"Alias variant name '{alias_to_default}' already exists among wheel files."
+ )
+ else:
+ variant_to_files[alias_to_default] = variant_to_files["default"].copy()
+ print(f"Alias variant '{alias_to_default}' created for default variant.")
+
+ # Generate comment in HTML header
+ comment_str = f" ({comment})" if comment else ""
+ comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"
+
+ # Generate index for each variant
+ subdir_names = set()
+ for variant, files in variant_to_files.items():
+ if variant == "default":
+ variant_dir = index_base_dir
+ else:
+ variant_dir = index_base_dir / variant
+ subdir_names.add(variant)
+
+ variant_dir.mkdir(parents=True, exist_ok=True)
+
+ # gather all package names in this variant
+ packages = set(f.package_name for f in files)
+ if variant == "default":
+ # these packages should also appear in the "project list"
+ # generate after all variants are processed
+ subdir_names = subdir_names.union(packages)
+ else:
+ # generate project list for this variant directly
+ project_list_str = generate_project_list(sorted(packages), comment_tmpl)
+ with open(variant_dir / "index.html", "w") as f:
+ f.write(project_list_str)
+
+ for package in packages:
+ # filter files belonging to this package only
+ package_files = [f for f in files if f.package_name == package]
+ package_dir = variant_dir / package
+ package_dir.mkdir(parents=True, exist_ok=True)
+ index_str, metadata_str = generate_package_index_and_metadata(
+ package_files, wheel_base_dir, package_dir, comment
+ )
+ with open(package_dir / "index.html", "w") as f:
+ f.write(index_str)
+ with open(package_dir / "metadata.json", "w") as f:
+ f.write(metadata_str)
+
+ # Generate top-level project list index
+ project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
+ with open(index_base_dir / "index.html", "w") as f:
+ f.write(project_list_str)
+
+
+if __name__ == "__main__":
+ """
+ Arguments:
+ --version : version string for the current build (e.g., commit hash)
+ --current-objects : path to JSON file containing current S3 objects listing in this version directory
+ --output-dir : directory to store generated index files
+ --alias-to-default : (optional) alias variant name for the default variant
+ --comment : (optional) comment string to include in generated HTML files
+ """
+
+ parser = argparse.ArgumentParser(
+ description="Process nightly build wheel files to generate indices."
+ )
+ parser.add_argument(
+ "--version",
+ type=str,
+ required=True,
+ help="Version string for the current build (e.g., commit hash)",
+ )
+ parser.add_argument(
+ "--current-objects",
+ type=str,
+ required=True,
+ help="Path to JSON file containing current S3 objects listing in this version directory",
+ )
+ parser.add_argument(
+ "--output-dir",
+ type=str,
+ required=True,
+ help="Directory to store generated index files",
+ )
+ parser.add_argument(
+ "--alias-to-default",
+ type=str,
+ default=None,
+ help="Alias variant name for the default variant",
+ )
+ parser.add_argument(
+ "--comment",
+ type=str,
+ default="",
+ help="Optional comment string to include in generated HTML files",
+ )
+
+ args = parser.parse_args()
+
+ version = args.version
+ if "/" in version or "\\" in version:
+ raise ValueError("Version string must not contain slashes.")
+ current_objects_path = Path(args.current_objects)
+ output_dir = Path(args.output_dir)
+ if not output_dir.exists():
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ # Read current objects JSON
+ with open(current_objects_path) as f:
+ current_objects: dict[str, list[dict[str, Any]]] = json.load(f)
+
+ # current_objects looks like the output of the list_objects_v2 S3 API:
+ """
+ "Contents": [
+ {
+ "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
+ "LastModified": "2025-11-28T14:00:32+00:00",
+ "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
+ "ChecksumAlgorithm": [
+ "CRC64NVME"
+ ],
+ "ChecksumType": "FULL_OBJECT",
+ "Size": 435649349,
+ "StorageClass": "STANDARD"
+ },
+ ...
+ ]
+ """
+
+ # Extract wheel file keys
+ wheel_files = []
+ for item in current_objects.get("Contents", []):
+ key: str = item["Key"]
+ if key.endswith(".whl"):
+ wheel_files.append(key.split("/")[-1]) # only the filename is used
+
+ print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
+
+ # keep only "official" files for a non-nightly version (specifed by cli args)
+ PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
+ if PY_VERSION_RE.match(version):
+ # upload-wheels.sh ensures no "dev" is in args.version
+ wheel_files = list(
+ filter(lambda x: version in x and "dev" not in x, wheel_files)
+ )
+ print(f"Non-nightly version detected, wheel files used: {wheel_files}")
+ else:
+ print("Nightly version detected, keeping all wheel files.")
+
+ # Generate index and metadata, assuming wheels and indices are stored as:
+ # s3://vllm-wheels/{version}/
+ # s3://vllm-wheels/<index dir>/ (indices, placed next to the wheel directory)
+ wheel_base_dir = Path(output_dir).parent / version
+ index_base_dir = Path(output_dir)
+
+ generate_index_and_metadata(
+ whl_files=wheel_files,
+ wheel_base_dir=wheel_base_dir,
+ index_base_dir=index_base_dir,
+ default_variant=None,
+ alias_to_default=args.alias_to_default,
+ comment=args.comment.strip(),
+ )
+ print(f"Successfully generated index and metadata in {output_dir}")
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
index d0036f24c8d0..b6274d698d01 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -7,53 +7,57 @@ set -ex
# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-0-16}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
-NUMA_NODE=${NUMA_NODE:-0}
-export CMAKE_BUILD_PARALLEL_LEVEL=32
+export CMAKE_BUILD_PARALLEL_LEVEL=16
# Setup cleanup
remove_docker_container() {
set -e;
- docker rm -f cpu-test-"$NUMA_NODE" || true;
+ docker rm -f cpu-test || true;
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .
-# Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+# Run the image
+docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test
function cpu_tests() {
set -e
- export NUMA_NODE=$2
- docker exec cpu-test-"$NUMA_NODE" bash -c "
+ docker exec cpu-test bash -c "
set -e
pip list"
# offline inference
- docker exec cpu-test-"$NUMA_NODE" bash -c "
+ docker exec cpu-test bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+ # Run model tests
+ docker exec cpu-test bash -c "
+ set -e
+ pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
+
# Run kernel tests
- docker exec cpu-test-"$NUMA_NODE" bash -c "
+ docker exec cpu-test bash -c "
set -e
pytest -x -v -s tests/kernels/test_onednn.py
- pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
+ pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+ pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"
# basic online serving
- docker exec cpu-test-"$NUMA_NODE" bash -c '
+ docker exec cpu-test bash -c '
set -e
- VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 &
+ VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
- --model meta-llama/Llama-3.2-3B-Instruct \
+ --model Qwen/Qwen3-0.6B \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
@@ -61,4 +65,4 @@ function cpu_tests() {
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
-timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 2h bash -c cpu_tests
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 2267718f75ca..438fe522c870 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -21,8 +21,8 @@ trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
diff --git a/.buildkite/scripts/hardware_ci/run-npu-test.sh b/.buildkite/scripts/hardware_ci/run-npu-test.sh
index 29c8f5ed5a91..0db1abe37ba1 100644
--- a/.buildkite/scripts/hardware_ci/run-npu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME}
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
+ENV SOC_VERSION="ascend910b1"
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index d49f3e2f47cf..dfc9db512d1e 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -35,9 +35,10 @@ docker run \
echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
- python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+ python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests
pytest -v -s v1/core
@@ -46,6 +47,6 @@ docker run \
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
- pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
+ pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/test_serial_utils.py
'
diff --git a/.buildkite/scripts/run-prime-rl-test.sh b/.buildkite/scripts/run-prime-rl-test.sh
index 5b25c358fc4a..3fb7c82c8d33 100755
--- a/.buildkite/scripts/run-prime-rl-test.sh
+++ b/.buildkite/scripts/run-prime-rl-test.sh
@@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
+if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
+ echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
+ exit 0
+fi
+
echo "Setting up Prime-RL integration test environment..."
# Clean up any existing Prime-RL directory
diff --git a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
new file mode 100644
index 000000000000..937a43d1a322
--- /dev/null
+++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+set -euxo pipefail
+
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+THRESHOLD=${1:-0.25}
+NUM_Q=${2:-1319}
+PORT=${3:-8040}
+OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
+mkdir -p "${OUT_DIR}"
+
+wait_for_server() {
+ local port=$1
+ timeout 600 bash -c '
+ until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
+ sleep 1
+ done'
+}
+
+MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
+
+# Set BACKENDS based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+ # ROCm platform
+ BACKENDS=("allgather_reducescatter")
+ # Disable MOE padding for ROCm since it is causing eplb to fail
+ export VLLM_ROCM_MOE_PADDING=0
+else
+ # Non-ROCm platform (CUDA/other)
+ BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+fi
+
+cleanup() {
+ if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
+ kill "${SERVER_PID}" 2>/dev/null || true
+ for _ in {1..20}; do
+ kill -0 "${SERVER_PID}" 2>/dev/null || break
+ sleep 0.5
+ done
+ kill -9 "${SERVER_PID}" 2>/dev/null || true
+ fi
+}
+trap cleanup EXIT
+
+for BACK in "${BACKENDS[@]}"; do
+ VLLM_DEEP_GEMM_WARMUP=skip \
+ VLLM_ALL2ALL_BACKEND=$BACK \
+ vllm serve "$MODEL" \
+ --enforce-eager \
+ --tensor-parallel-size 4 \
+ --enable-expert-parallel \
+ --enable-eplb \
+ --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
+ --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
+ --trust-remote-code \
+ --max-model-len 2048 \
+ --gpu-memory-utilization 0.9 \
+ --port $PORT &
+ SERVER_PID=$!
+ wait_for_server $PORT
+
+ TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
+ OUT="${OUT_DIR}/${TAG}_${BACK}.json"
+ python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
+ python3 - <<PY
+import json
+acc = json.load(open("${OUT}"))["accuracy"]
+assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
+PY
+
+ cleanup
+ SERVER_PID=
+ sleep 1
+ PORT=$((PORT+1))
+done
diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
index 945c5e48c009..3a218a4bb2e6 100644
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -2,6 +2,28 @@
set -ex
+# ======== part 0: setup ========
+
+BUCKET="vllm-wheels"
+INDICES_OUTPUT_DIR="indices"
+DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
+PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
+SUBPATH=$BUILDKITE_COMMIT
+S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
+
+# detect if python3.12+ is available (required by generate-nightly-index.py)
+has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
+if [[ "$has_new_python" -eq 0 ]]; then
+ # use new python from docker
+ docker pull python:3-slim
+ PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
+fi
+
+echo "Using python interpreter: $PYTHON"
+echo "Python version: $($PYTHON --version)"
+
+# ========= part 1: collect, rename & upload the wheel ==========
+
# Assume wheels are in artifacts/dist/*.whl
wheel_files=(artifacts/dist/*.whl)
@@ -10,74 +32,76 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
exit 1
fi
-
-# Get the single wheel file
wheel="${wheel_files[0]}"
-# Detect architecture and rename 'linux' to appropriate manylinux version
-arch=$(uname -m)
-if [[ $arch == "x86_64" ]]; then
- manylinux_version="manylinux1"
-elif [[ $arch == "aarch64" ]]; then
- manylinux_version="manylinux2014"
-else
- echo "Warning: Unknown architecture $arch, using manylinux1 as default"
- manylinux_version="manylinux1"
-fi
+# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31
+# the first positional argument can override the manylinux tag
+# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
+manylinux_version="${1:-manylinux_2_31}"
# Rename 'linux' to the appropriate manylinux version in the wheel filename
+if [[ "$wheel" != *"linux"* ]]; then
+ echo "Error: Wheel filename does not contain 'linux': $wheel"
+ exit 1
+fi
new_wheel="${wheel/linux/$manylinux_version}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"
+echo "Renamed wheel to: $wheel"
# Extract the version from the wheel
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-echo "Version: $version"
-
-normal_wheel="$wheel" # Save the original wheel filename
-
-# If the version contains "dev", rename it to v1.0.0.dev for consistency
-if [[ $version == *dev* ]]; then
- suffix="${version##*.}"
- if [[ $suffix == cu* ]]; then
- new_version="1.0.0.dev+${suffix}"
- else
- new_version="1.0.0.dev"
- fi
- new_wheel="${wheel/$version/$new_version}"
- # use cp to keep both files in the artifacts directory
- cp -- "$wheel" "$new_wheel"
- wheel="$new_wheel"
- version="$new_version"
-fi
+echo "Version in wheel: $version"
+pure_version="${version%%+*}"
+echo "Pure version (without variant): $pure_version"
-# Upload the wheel to S3
-python3 .buildkite/generate_index.py --wheel "$normal_wheel"
+# copy wheel to its own bucket
+aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
-# generate index for this commit
-aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+# ========= part 2: generate and upload indices ==========
+# generate indices for all existing wheels in the commit directory
+# this script might be run multiple times if there are multiple variants being built
+# so we need to minimize the window for "TOCTOU" issues
+# (i.e., one process generating indices while another uploads a new wheel)
+# by keeping the operations below fast
-if [[ $normal_wheel == *"cu129"* ]]; then
- # only upload index.html for cu129 wheels (default wheels) as it
- # is available on both x86 and arm64
- aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
- aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+# list all wheels in the commit directory
+echo "Existing wheels on S3:"
+aws s3 ls "$S3_COMMIT_PREFIX"
+obj_json="objects.json"
+aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
+mkdir -p "$INDICES_OUTPUT_DIR"
+
+# call the script to generate indices for all existing wheels
+# these indices use relative paths, which work as long as they sit next to the wheel directory in S3
+# i.e., the wheels are always in s3://vllm-wheels/<commit>/
+# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
+if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
+ alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
else
- echo "Skipping index files for non-cu129 wheels"
+ alias_arg=""
fi
-# generate index for nightly
-aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
+# HACK: the regex module is not needed here, but the repo's pre-commit hook mandates it.
+# To avoid any external dependency, swap it back to the stdlib re module before running.
+sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
+$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
-if [[ $normal_wheel == *"cu129"* ]]; then
- # only upload index.html for cu129 wheels (default wheels) as it
- # is available on both x86 and arm64
- aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
-else
- echo "Skipping index files for non-cu129 wheels"
+# copy indices to /<commit>/ unconditionally
+echo "Uploading indices to $S3_COMMIT_PREFIX"
+aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
+
+# copy to /nightly/ only if it is on the main branch and not a PR
+if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
+ echo "Uploading indices to overwrite /nightly/"
+ aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
fi
-aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
-aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
+# re-generate and copy to /<version>/ only if the version does not contain "dev"
+if [[ "$version" != *"dev"* ]]; then
+ echo "Re-generating indices for /$pure_version/"
+ rm -rf "$INDICES_OUTPUT_DIR/*"
+ mkdir -p "$INDICES_OUTPUT_DIR"
+ $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
+ aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
+fi
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 4ddf11c0b268..3c9b8cbedcf0 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -39,9 +39,9 @@ steps:
# if this test fails, it means the nightly torch version is not compatible with some
# of the dependencies. Please check the error message and add the package to whitelist
# in /vllm/tools/pre_commit/generate_nightly_torch_test.py
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
soft_fail: true
source_file_dependencies:
- requirements/nightly_torch_test.txt
@@ -50,9 +50,9 @@ steps:
- label: Async Engine, Inputs, Utils, Worker Test # 10min
timeout_in_minutes: 15
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
source_file_dependencies:
- vllm/
- tests/multimodal
@@ -61,17 +61,19 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
-- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
- timeout_in_minutes: 10
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
source_file_dependencies:
- vllm/
- tests/test_inputs.py
- tests/test_outputs.py
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
+ - tests/tokenizers_
+ - tests/tool_parsers
- tests/transformers_utils
- tests/config
no_gpu: true
@@ -80,6 +82,8 @@ steps:
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
+ - pytest -v -s tokenizers_
+ - pytest -v -s tool_parsers
- pytest -v -s transformers_utils
- pytest -v -s config
@@ -113,9 +117,9 @@ steps:
- pytest -v -s basic_correctness/test_cpu_offload.py
- label: Entrypoints Unit Tests # 5min
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
timeout_in_minutes: 10
working_dir: "/vllm-workspace/tests"
fast_check: true
@@ -212,6 +216,7 @@ steps:
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
@@ -250,9 +255,9 @@ steps:
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: EPLB Algorithm Test # 5min
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
@@ -308,28 +313,25 @@ steps:
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
-- label: Engine Test # 25min
- timeout_in_minutes: 40
+- label: Engine Test # 9min
+ timeout_in_minutes: 15
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- vllm/
- tests/engine
- - tests/tokenization
- tests/test_sequence
- tests/test_config
- tests/test_logger
- tests/test_vllm_port
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- # OOM in the CI unless we run this separately
- - pytest -v -s tokenization
-- label: V1 Test e2e + engine # 30min
- timeout_in_minutes: 45
+- label: V1 Test e2e + engine # 65min
+ timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
- agent_pool: mi325_1
+ agent_pool: mi325_4
# grade: Blocking
source_file_dependencies:
- vllm/
@@ -342,9 +344,9 @@ steps:
- label: V1 Test entrypoints # 35min
timeout_in_minutes: 50
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
source_file_dependencies:
- vllm/
- tests/v1
@@ -392,6 +394,21 @@ steps:
commands:
- pytest -v -s v1/attention
+- label: Batch Invariance Tests (H100) # 10min
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ timeout_in_minutes: 25
+ gpu: h100
+ source_file_dependencies:
+ - vllm/v1/attention
+ - vllm/model_executor/layers
+ - tests/v1/determinism/
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pip install pytest-timeout pytest-forked
+ - pytest -v -s v1/determinism/test_batch_invariance.py
+ - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+
- label: V1 Test attention (B200) # 10min
timeout_in_minutes: 30
gpu: b200
@@ -402,9 +419,9 @@ steps:
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
- label: V1 Test others (CPU) # 5 mins
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
source_file_dependencies:
- vllm/
- tests/v1
@@ -420,29 +437,34 @@ steps:
- label: Examples Test # 30min
timeout_in_minutes: 45
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/entrypoints
+ - vllm/multimodal
- examples/
commands:
- pip install tensorizer # for tensorizer test
+ # for basic
+ - python3 offline_inference/basic/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- - python3 offline_inference/basic/chat.py
- - python3 offline_inference/prefix_caching.py
- - python3 offline_inference/llm_engine_example.py
+ - python3 offline_inference/basic/classify.py
+ - python3 offline_inference/basic/embed.py
+ - python3 offline_inference/basic/score.py
+ # for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- - python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- - python3 offline_inference/basic/classify.py
- - python3 offline_inference/basic/embed.py
- - python3 offline_inference/basic/score.py
+ # for pooling models
+ - python3 pooling/pooling/vision_language_pooling.py --seed 0
+ # for features demo
+ - python3 offline_inference/prefix_caching.py
+ - python3 offline_inference/llm_engine_example.py
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -496,7 +518,7 @@ steps:
- label: PyTorch Compilation Unit Tests # 15min
timeout_in_minutes: 30
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
torch_nightly: true
@@ -513,7 +535,7 @@ steps:
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
torch_nightly: true
@@ -569,7 +591,7 @@ steps:
- label: Kernels Attention Test %N # 23min
timeout_in_minutes: 35
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_8
# grade: Blocking
source_file_dependencies:
@@ -596,7 +618,7 @@ steps:
- label: Kernels MoE Test %N # 40min
timeout_in_minutes: 60
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_8
# grade: Blocking
source_file_dependencies:
@@ -623,6 +645,26 @@ steps:
commands:
- pytest -v -s kernels/mamba
+- label: Kernels DeepGEMM Test (H100) # Nvidia-centric
+ # Not replicating for CUTLASS & CuTe
+ timeout_in_minutes: 45
+ gpu: h100
+ num_gpus: 1
+ source_file_dependencies:
+ - tools/install_deepgemm.sh
+ - vllm/utils/deep_gemm.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization
+ - tests/kernels/quantization/test_block_fp8.py
+ - tests/kernels/moe/test_deepgemm.py
+ - tests/kernels/moe/test_batched_deepgemm.py
+ - tests/kernels/attention/test_deepgemm_attention.py
+ commands:
+ - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+ - pytest -v -s kernels/moe/test_deepgemm.py
+ - pytest -v -s kernels/moe/test_batched_deepgemm.py
+ - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
- label: Model Executor Test # 23min
timeout_in_minutes: 35
torch_nightly: true
@@ -681,16 +723,18 @@ steps:
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0
+ - uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
-- label: LM Eval Small Models # 15min
- timeout_in_minutes: 20
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: LM Eval Small Models # 53min
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
+ autorun_on_main: true
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
@@ -703,7 +747,7 @@ steps:
- csrc/
- vllm/entrypoints/openai/
- vllm/model_executor/models/whisper.py
- commands: # LMEval
+ commands: # LMEval+Transcription WER check
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
- pytest -s entrypoints/openai/correctness/
@@ -717,19 +761,7 @@ steps:
- vllm/
- tests/tool_use
commands:
- - pytest -v -s -m 'not cpu_test' tool_use
-
-- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
- mirror_hardwares: [amdexperimental, amdproduction]
- agent_pool: mi325_1
- # grade: Blocking
- timeout_in_minutes: 10
- source_file_dependencies:
- - vllm/
- - tests/tool_use
- no_gpu: true
- commands:
- - pytest -v -s -m 'cpu_test' tool_use
+ - pytest -v -s tool_use
##### models test #####
@@ -900,6 +932,18 @@ steps:
commands:
- pytest -v -s models/language/pooling_mteb_test
+- label: Multi-Modal Processor Test (CPU)
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ no_gpu: true
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
- label: Multi-Modal Processor Test # 44min
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
@@ -927,8 +971,8 @@ steps:
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
-- label: Multi-Modal Accuracy Eval (Small Models) # 10min
- timeout_in_minutes: 70
+- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
+ timeout_in_minutes: 180
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
@@ -940,7 +984,8 @@ steps:
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
-- label: Multi-Modal Models Test (Extended) 1
+- label: Multi-Modal Models Test (Extended) 1 # 60min
+ timeout_in_minutes: 120
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
@@ -964,7 +1009,8 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
-- label: Multi-Modal Models Test (Extended) 3
+- label: Multi-Modal Models Test (Extended) 3 # 75min
+ timeout_in_minutes: 150
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
@@ -1056,6 +1102,7 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
+ - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
- label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40
@@ -1065,11 +1112,18 @@ steps:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/worker/
+ - vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
+ - tests/compile/test_fusion_attn.py
+ - tests/compile/test_silu_mul_quant_fusion.py
+ - tests/compile/distributed/test_fusion_all_reduce.py
+ - tests/compile/distributed/test_fusions_e2e.py
+ - tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py
@@ -1080,7 +1134,7 @@ steps:
# Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- - pytest -v -s tests/compile/distributed/test_full_graph.py::test_fp8_kv_scale_compile
+ - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell Fusion E2E Tests # 30 min
timeout_in_minutes: 40
@@ -1098,17 +1152,15 @@ steps:
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
- - tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
# Run all e2e fusion tests
- - pytest -v -s tests/compile/test_fusions_e2e.py
+ - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-- label: ROCm GPT-OSS Eval
+- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
- agent_pool: mi325_1
- mirror_hardwares: [amdexperimental, amdproduction]
+ gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- tests/evals/gpt_oss
@@ -1117,7 +1169,7 @@ steps:
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+ - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Blackwell Quantized MoE Test
timeout_in_minutes: 60
@@ -1217,6 +1269,7 @@ steps:
- tests/v1/worker/test_worker_memory_snapshot.py
commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
@@ -1252,7 +1305,7 @@ steps:
- label: Plugin Tests (2 GPUs) # 40min
timeout_in_minutes: 60
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_2
# grade: Blocking
working_dir: "/vllm-workspace/tests"
@@ -1321,14 +1374,14 @@ steps:
- pytest -v -s -x lora/test_llm_with_multi_loras.py
- pytest -v -s -x lora/test_olmoe_tp.py
- # Disabled for now because MXFP4 backend on non-cuda platform
+ # Disabled for now because MXFP4 backend on non-cuda platform
# doesn't support LoRA yet
#- pytest -v -s -x lora/test_gptoss_tp.py
- label: Weight Loading Multiple GPU Test # 33min
timeout_in_minutes: 45
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_2
# grade: Blocking
working_dir: "/vllm-workspace/tests"
@@ -1387,12 +1440,13 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py
+
- label: LM Eval Large Models # optional
- mirror_hardwares: [amdexperimental, amdproduction]
- agent_pool: mi325_4
- # grade: Blocking
gpu: a100
optional: true
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
@@ -1404,11 +1458,11 @@ steps:
##### H100 test #####
- label: LM Eval Large Models (H100) # optional
- mirror_hardwares: [amdexperimental, amdproduction]
- agent_pool: mi325_4
- # grade: Blocking
gpu: h100
optional: true
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
@@ -1418,6 +1472,7 @@ steps:
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
##### H200 test #####
- label: Distributed Tests (H200) # optional
mirror_hardwares: [amdexperimental]
@@ -1428,14 +1483,14 @@ steps:
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- - pytest -v -s tests/compile/distributed/test_async_tp.py
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- - pytest -v -s tests/compile/distributed/test_sequence_parallel.py
+ - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+ - HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####
@@ -1449,6 +1504,57 @@ steps:
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
- pytest -v -s tests/v1/distributed/test_dbo.py
+##### E2E Eval Tests #####
+- label: LM Eval Small Models (1 Card) # 15min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+
+- label: LM Eval Large Models (4 Card)
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_4
+ # grade: Blocking
+ gpu: a100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+- label: ROCm LM Eval Large Models (8 Card)
+ mirror_hardwares: [amdproduction]
+ agent_pool: mi325_8
+ num_gpus: 8
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
+
+- label: ROCm GPT-OSS Eval
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ agent_pool: mi325_1
+ mirror_hardwares: [amdexperimental, amdproduction]
+ optional: true # run on nightlies
+ source_file_dependencies:
+ - tests/evals/gpt_oss
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - uv pip install --system 'gpt-oss[eval]==0.0.5'
+ - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
mirror_hardwares: [amdexperimental]
@@ -1463,9 +1569,8 @@ steps:
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh
-
- label: DeepSeek V2-Lite Accuracy
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
timeout_in_minutes: 60
@@ -1476,8 +1581,8 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
-- label: Qwen3-30B-A3B-FP8-block Accuracy
- mirror_hardwares: [amdexperimental]
+- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
timeout_in_minutes: 60
@@ -1487,3 +1592,35 @@ steps:
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+ timeout_in_minutes: 60
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index e444becd9867..2dcca5711b3d 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -57,14 +57,16 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
-- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
- timeout_in_minutes: 10
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
+ timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/test_inputs.py
- tests/test_outputs.py
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
+ - tests/tokenizers_
+ - tests/tool_parsers
- tests/transformers_utils
- tests/config
no_gpu: true
@@ -73,6 +75,8 @@ steps:
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
+ - pytest -v -s tokenizers_
+ - pytest -v -s tool_parsers
- pytest -v -s transformers_utils
- pytest -v -s config
@@ -276,21 +280,18 @@ steps:
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
-- label: Engine Test # 25min
- timeout_in_minutes: 40
+- label: Engine Test # 9min
+ timeout_in_minutes: 15
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/engine
- - tests/tokenization
- tests/test_sequence
- tests/test_config
- tests/test_logger
- tests/test_vllm_port
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- # OOM in the CI unless we run this separately
- - pytest -v -s tokenization
- label: V1 Test e2e + engine # 30min
timeout_in_minutes: 45
@@ -351,7 +352,8 @@ steps:
timeout_in_minutes: 25
gpu: h100
source_file_dependencies:
- - vllm/
+ - vllm/v1/attention
+ - vllm/model_executor/layers
- tests/v1/determinism/
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -388,23 +390,28 @@ steps:
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/entrypoints
+ - vllm/multimodal
- examples/
commands:
- pip install tensorizer # for tensorizer test
+ # for basic
+ - python3 offline_inference/basic/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- - python3 offline_inference/basic/chat.py
- - python3 offline_inference/prefix_caching.py
- - python3 offline_inference/llm_engine_example.py
+ - python3 offline_inference/basic/classify.py
+ - python3 offline_inference/basic/embed.py
+ - python3 offline_inference/basic/score.py
+ # for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- - python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- - python3 offline_inference/basic/classify.py
- - python3 offline_inference/basic/embed.py
- - python3 offline_inference/basic/score.py
+ # for pooling models
+ - python3 pooling/pooling/vision_language_pooling.py --seed 0
+ # for features demo
+ - python3 offline_inference/prefix_caching.py
+ - python3 offline_inference/llm_engine_example.py
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -463,7 +470,9 @@ steps:
# tests covered elsewhere.
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
+ # However, find does not normally propagate error codes, so we combine it with xargs
+ # (using -0 for proper path handling)
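+ # Note: xargs exits with a non-zero status (typically 123) if any pytest invocation fails,
+ # so test failures still fail this step.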
+ - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
@@ -477,7 +486,9 @@ steps:
# as it is a heavy test that is covered in other steps.
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
+ # However, find does not normally propagate error codes, so we combine it with xargs
+ # (using -0 for proper path handling)
+ - "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Fullgraph Test # 27min
timeout_in_minutes: 40
@@ -632,6 +643,7 @@ steps:
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+ - uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 53min
@@ -662,16 +674,7 @@ steps:
- vllm/
- tests/tool_use
commands:
- - pytest -v -s -m 'not cpu_test' tool_use
-
-- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
- timeout_in_minutes: 10
- source_file_dependencies:
- - vllm/
- - tests/tool_use
- no_gpu: true
- commands:
- - pytest -v -s -m 'cpu_test' tool_use
+ - pytest -v -s tool_use
##### models test #####
@@ -682,6 +685,7 @@ steps:
source_file_dependencies:
- vllm/
- tests/models/test_initialization.py
+ - tests/models/registry.py
commands:
# Run a subset of model initialization tests
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
@@ -694,6 +698,7 @@ steps:
- vllm/model_executor/models/
- vllm/transformers_utils/
- tests/models/test_initialization.py
+ - tests/models/registry.py
commands:
# Only when vLLM model source is modified - test initialization of a large
# subset of supported models (the complement of the small subset in the above
@@ -819,14 +824,24 @@ steps:
commands:
- pytest -v -s models/language/pooling_mteb_test
-- label: Multi-Modal Processor Test # 44min
+- label: Multi-Modal Processor Test (CPU)
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ no_gpu: true
+ commands:
+ - "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git || echo 'Mantis installation skipped (decord not available on CPU-only environment)'"
+ - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Processor Test
timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- - pytest -v -s models/multimodal/processing
+ - pytest -v -s models/multimodal/processing/test_tensor_schema.py
- label: Multi-Modal Models Test (Standard) # 60min
timeout_in_minutes: 80
@@ -903,11 +918,12 @@ steps:
- label: Transformers Nightly Models Test
working_dir: "/vllm-workspace/"
optional: true
+ soft_fail: true
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)'
+ - pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/test_transformers.py
- # - pytest -v -s tests/models/multimodal/processing/
+ - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
@@ -971,7 +987,6 @@ steps:
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- - vllm/model_executor/layers/fused_moe/layer.py
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
@@ -1302,11 +1317,11 @@ steps:
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- - pytest -v -s tests/compile/distributed/test_async_tp.py
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- - pytest -v -s tests/distributed/test_sequence_parallel.py
+ - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py
@@ -1326,6 +1341,7 @@ steps:
- label: Prime-RL Integration Test # 15min
timeout_in_minutes: 30
optional: true
+ soft_fail: true
num_gpus: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
@@ -1359,4 +1375,4 @@ steps:
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
\ No newline at end of file
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
diff --git a/.buildkite/test_areas/attention.yaml b/.buildkite/test_areas/attention.yaml
new file mode 100644
index 000000000000..6e444eae14c7
--- /dev/null
+++ b/.buildkite/test_areas/attention.yaml
@@ -0,0 +1,21 @@
+group: Attention
+depends_on:
+ - image-build
+steps:
+- label: V1 attention (H100)
+ timeout_in_minutes: 30
+ gpu: h100
+ source_file_dependencies:
+ - vllm/v1/attention
+ - tests/v1/attention
+ commands:
+ - pytest -v -s v1/attention
+
+- label: V1 attention (B200)
+ timeout_in_minutes: 30
+ gpu: b200
+ source_file_dependencies:
+ - vllm/v1/attention
+ - tests/v1/attention
+ commands:
+ - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
diff --git a/.buildkite/test_areas/basic_correctness.yaml b/.buildkite/test_areas/basic_correctness.yaml
new file mode 100644
index 000000000000..759d2b535871
--- /dev/null
+++ b/.buildkite/test_areas/basic_correctness.yaml
@@ -0,0 +1,16 @@
+group: Basic Correctness
+depends_on:
+ - image-build
+steps:
+- label: Basic Correctness
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/
+ - tests/basic_correctness/test_basic_correctness
+ - tests/basic_correctness/test_cpu_offload
+ - tests/basic_correctness/test_cumem.py
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s basic_correctness/test_cumem.py
+ - pytest -v -s basic_correctness/test_basic_correctness.py
+ - pytest -v -s basic_correctness/test_cpu_offload.py
diff --git a/.buildkite/test_areas/benchmarks.yaml b/.buildkite/test_areas/benchmarks.yaml
new file mode 100644
index 000000000000..574b642d407b
--- /dev/null
+++ b/.buildkite/test_areas/benchmarks.yaml
@@ -0,0 +1,19 @@
+group: Benchmarks
+depends_on:
+ - image-build
+steps:
+- label: Benchmarks
+ timeout_in_minutes: 20
+ working_dir: "/vllm-workspace/.buildkite"
+ source_file_dependencies:
+ - benchmarks/
+ commands:
+ - bash scripts/run-benchmarks.sh
+
+- label: Benchmarks CLI Test
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - vllm/
+ - tests/benchmarks/
+ commands:
+ - pytest -v -s benchmarks/
diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml
new file mode 100644
index 000000000000..0ba00925a483
--- /dev/null
+++ b/.buildkite/test_areas/compile.yaml
@@ -0,0 +1,57 @@
+group: Compile
+depends_on:
+ - image-build
+steps:
+- label: Fusion and Compile Tests (B200)
+ timeout_in_minutes: 40
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/worker/
+ - vllm/v1/cudagraph_dispatcher.py
+ - vllm/compilation/
+ # can affect pattern matching
+ - vllm/model_executor/layers/layernorm.py
+ - vllm/model_executor/layers/activation.py
+ - vllm/model_executor/layers/quantization/input_quant_fp8.py
+ - tests/compile/test_fusion_attn.py
+ - tests/compile/test_silu_mul_quant_fusion.py
+ - tests/compile/distributed/test_fusion_all_reduce.py
+ - tests/compile/distributed/test_fusions_e2e.py
+ - tests/compile/fullgraph/test_full_graph.py
+ commands:
+ - nvidia-smi
+ - pytest -v -s tests/compile/test_fusion_attn.py
+ - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+ # this runner has 2 GPUs available even though num_gpus=2 is not set
+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+ # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+ # Wrap with quotes to escape yaml
+ - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+ # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+ - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+
+- label: Fusion E2E (2 GPUs)(B200)
+ timeout_in_minutes: 40
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/compilation/
+ # can affect pattern matching
+ - vllm/model_executor/layers/layernorm.py
+ - vllm/model_executor/layers/activation.py
+ - vllm/model_executor/layers/quantization/input_quant_fp8.py
+ - tests/compile/distributed/test_fusions_e2e.py
+ commands:
+ - nvidia-smi
+ # Run all e2e fusion tests
+ - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
+
diff --git a/.buildkite/test_areas/cuda.yaml b/.buildkite/test_areas/cuda.yaml
new file mode 100644
index 000000000000..50c0c338c243
--- /dev/null
+++ b/.buildkite/test_areas/cuda.yaml
@@ -0,0 +1,22 @@
+group: CUDA
+depends_on:
+ - image-build
+steps:
+- label: Platform Tests (CUDA)
+ timeout_in_minutes: 15
+ source_file_dependencies:
+ - vllm/
+ - tests/cuda
+ commands:
+ - pytest -v -s cuda/test_cuda_context.py
+
+- label: Cudagraph
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - tests/v1/cudagraph
+ - vllm/v1/cudagraph_dispatcher.py
+ - vllm/config/compilation.py
+ - vllm/compilation
+ commands:
+ - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+ - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
new file mode 100644
index 000000000000..2cc90698d916
--- /dev/null
+++ b/.buildkite/test_areas/distributed.yaml
@@ -0,0 +1,199 @@
+group: Distributed
+depends_on:
+ - image-build
+steps:
+- label: Distributed Comm Ops
+ timeout_in_minutes: 20
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/distributed
+ - tests/distributed
+ commands:
+ - pytest -v -s distributed/test_comm_ops.py
+ - pytest -v -s distributed/test_shm_broadcast.py
+ - pytest -v -s distributed/test_shm_buffer.py
+ - pytest -v -s distributed/test_shm_storage.py
+
+- label: Distributed (2 GPUs)
+ timeout_in_minutes: 90
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/compilation/
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/worker/worker_base.py
+ - vllm/v1/engine/
+ - vllm/v1/worker/
+ - tests/compile/fullgraph/test_basic_correctness.py
+ - tests/compile/test_wrapper.py
+ - tests/distributed/
+ - tests/entrypoints/llm/test_collective_rpc.py
+ - tests/v1/distributed
+ - tests/v1/entrypoints/openai/test_multi_api_servers.py
+ - tests/v1/shutdown
+ - tests/v1/worker/test_worker_memory_snapshot.py
+ commands:
+ # https://github.com/NVIDIA/nccl/issues/1838
+ - export NCCL_CUMEM_HOST_ENABLE=0
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+ - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+ - pytest -v -s entrypoints/llm/test_collective_rpc.py
+ - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
+ - pytest -v -s ./compile/test_wrapper.py
+ - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+ - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+ - pytest -v -s distributed/test_sequence_parallel.py
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+ - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+
+- label: Distributed Tests (4 GPUs)
+ timeout_in_minutes: 50
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/
+ - tests/distributed/test_utils
+ - tests/distributed/test_pynccl
+ - tests/distributed/test_events
+ - tests/compile/fullgraph/test_basic_correctness.py
+ - examples/offline_inference/rlhf.py
+ - examples/offline_inference/rlhf_colocate.py
+ - tests/examples/offline_inference/data_parallel.py
+ - tests/v1/distributed
+ - tests/v1/engine/test_engine_core_client.py
+ - tests/distributed/test_symm_mem_allreduce.py
+ commands:
+ # https://github.com/NVIDIA/nccl/issues/1838
+ - export NCCL_CUMEM_HOST_ENABLE=0
+ # test with torchrun tp=2 and external_dp=2
+ - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=2 and pp=2
+ - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=4 and dp=1
+ - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2, pp=2 and dp=1
+ - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=1 and dp=4 with ep
+ - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2 and dp=2 with ep
+ - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with internal dp
+ - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
+ - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
+ - pytest -v -s distributed/test_utils.py
+ - pytest -v -s compile/fullgraph/test_basic_correctness.py
+ - pytest -v -s distributed/test_pynccl.py
+ - pytest -v -s distributed/test_events.py
+ - pytest -v -s distributed/test_symm_mem_allreduce.py
+ # TODO: create a dedicated test section for multi-GPU example tests
+ # when we have multiple distributed example tests
+ - cd ../examples/offline_inference
+ - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+ - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+
+- label: Distributed Tests (8 GPUs)(H100)
+ timeout_in_minutes: 10
+ gpu: h100
+ num_gpus: 8
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - examples/offline_inference/torchrun_dp_example.py
+ - vllm/config/parallel.py
+ - vllm/distributed/
+ - vllm/v1/engine/llm_engine.py
+ - vllm/v1/executor/uniproc_executor.py
+ - vllm/v1/worker/gpu_worker.py
+ commands:
+ # https://github.com/NVIDIA/nccl/issues/1838
+ - export NCCL_CUMEM_HOST_ENABLE=0
+ # test with torchrun tp=2 and dp=4 with ep
+ - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
+- label: Distributed Tests (4 GPUs)(A100)
+ gpu: a100
+ optional: true
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/
+ commands:
+ # NOTE: don't test llama model here, it seems hf implementation is buggy
+ # see https://github.com/vllm-project/vllm/pull/5689 for details
+ - pytest -v -s distributed/test_custom_all_reduce.py
+ - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
+ - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+ - pytest -v -s -x lora/test_mixtral.py
+
+- label: Distributed Tests (2 GPUs)(H200)
+ gpu: h200
+ optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
+ - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+ - pytest -v -s tests/v1/distributed/test_dbo.py
+
+- label: Distributed Tests (2 GPUs)(B200)
+ gpu: b200
+ optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+ - pytest -v -s tests/v1/distributed/test_dbo.py
+
+- label: 2 Node Test (4 GPUs)
+ timeout_in_minutes: 30
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ num_nodes: 2
+ source_file_dependencies:
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/model_executor/models/
+ - tests/distributed/
+ - tests/examples/offline_inference/data_parallel.py
+ commands:
+ - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
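+ # (as far as this config shows, the script arguments are: working dir, number of nodes,
+ # GPUs per node, container image, then one command string per node)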
+
+- label: Distributed NixlConnector PD accuracy (4 GPUs)
+ timeout_in_minutes: 30
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+ - tests/v1/kv_connector/nixl_integration/
+ commands:
+ - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+ - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+
+- label: Pipeline + Context Parallelism (4 GPUs)
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/model_executor/models/
+ - tests/distributed/
+ commands:
+ - pytest -v -s distributed/test_pp_cudagraph.py
+ - pytest -v -s distributed/test_pipeline_parallel.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml
new file mode 100644
index 000000000000..93d389815eda
--- /dev/null
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -0,0 +1,59 @@
+group: E2E Integration
+depends_on:
+ - image-build
+steps:
+- label: DeepSeek V2-Lite Accuracy
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+ timeout_in_minutes: 60
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+- label: Prime-RL Integration (2 GPUs)
+ timeout_in_minutes: 30
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ source_file_dependencies:
+ - vllm/
+ - .buildkite/scripts/run-prime-rl-test.sh
+ commands:
+ - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml
new file mode 100644
index 000000000000..a028e0e4af4c
--- /dev/null
+++ b/.buildkite/test_areas/engine.yaml
@@ -0,0 +1,26 @@
+group: Engine
+depends_on:
+ - image-build
+steps:
+- label: Engine
+ timeout_in_minutes: 15
+ source_file_dependencies:
+ - vllm/
+ - tests/engine
+ - tests/test_sequence
+ - tests/test_config
+ - tests/test_logger
+ - tests/test_vllm_port
+ commands:
+ - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+
+- label: V1 e2e + engine
+ timeout_in_minutes: 45
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ # TODO: accuracy does not match, whether setting
+ # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+ - pytest -v -s v1/e2e
+ - pytest -v -s v1/engine
diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml
new file mode 100644
index 000000000000..0a789be943f3
--- /dev/null
+++ b/.buildkite/test_areas/entrypoints.yaml
@@ -0,0 +1,68 @@
+group: Entrypoints
+depends_on:
+ - image-build
+steps:
+- label: Entrypoints Unit Tests
+ timeout_in_minutes: 10
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/entrypoints
+ - tests/entrypoints/
+ commands:
+ - pytest -v -s entrypoints/openai/tool_parsers
+ - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
+
+- label: Entrypoints Integration (LLM)
+ timeout_in_minutes: 40
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/llm
+ - tests/entrypoints/offline_mode
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+ - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+ - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+
+- label: Entrypoints Integration (API Server)
+ timeout_in_minutes: 130
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/openai
+ - tests/entrypoints/test_chat_utils
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
+ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+ - pytest -v -s entrypoints/test_chat_utils.py
+
+- label: Entrypoints Integration (Pooling)
+ timeout_in_minutes: 50
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/pooling
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/pooling
+
+- label: Entrypoints V1
+ timeout_in_minutes: 50
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ - pytest -v -s v1/entrypoints
+
+- label: OpenAI API Correctness
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - csrc/
+ - vllm/entrypoints/openai/
+ - vllm/model_executor/models/whisper.py
+ commands: # LMEval+Transcription WER check
+ - pytest -s entrypoints/openai/correctness/
diff --git a/.buildkite/test_areas/expert_parallelism.yaml b/.buildkite/test_areas/expert_parallelism.yaml
new file mode 100644
index 000000000000..feb8252148c7
--- /dev/null
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@@ -0,0 +1,23 @@
+group: Expert Parallelism
+depends_on:
+ - image-build
+steps:
+- label: EPLB Algorithm
+ timeout_in_minutes: 15
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/distributed/eplb
+ - tests/distributed/test_eplb_algo.py
+ commands:
+ - pytest -v -s distributed/test_eplb_algo.py
+
+- label: EPLB Execution
+ timeout_in_minutes: 20
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/eplb
+ - tests/distributed/test_eplb_execute.py
+ commands:
+ - pytest -v -s distributed/test_eplb_execute.py
+ - pytest -v -s distributed/test_eplb_spec_decode.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml
new file mode 100644
index 000000000000..7ca099516d64
--- /dev/null
+++ b/.buildkite/test_areas/kernels.yaml
@@ -0,0 +1,117 @@
+group: Kernels
+depends_on:
+ - image-build
+steps:
+- label: Kernels Core Operation Test
+ timeout_in_minutes: 75
+ source_file_dependencies:
+ - csrc/
+ - tests/kernels/core
+ - tests/kernels/test_top_k_per_row.py
+ commands:
+ - pytest -v -s kernels/core kernels/test_top_k_per_row.py
+
+- label: Kernels Attention Test %N
+ timeout_in_minutes: 35
+ source_file_dependencies:
+ - csrc/attention/
+ - vllm/attention
+ - vllm/v1/attention
+ - tests/kernels/attention
+ commands:
+ - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
+
+- label: Kernels Quantization Test %N
+ timeout_in_minutes: 90
+ source_file_dependencies:
+ - csrc/quantization/
+ - vllm/model_executor/layers/quantization
+ - tests/kernels/quantization
+ commands:
+ - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
+
+- label: Kernels MoE Test %N
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - csrc/quantization/cutlass_w8a8/moe/
+ - csrc/moe/
+ - tests/kernels/moe
+ - vllm/model_executor/layers/fused_moe/
+ - vllm/distributed/device_communicators/
+ - vllm/envs.py
+ - vllm/config
+ commands:
+ - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
+
+- label: Kernels Mamba Test
+ timeout_in_minutes: 45
+ source_file_dependencies:
+ - csrc/mamba/
+ - tests/kernels/mamba
+ - vllm/model_executor/layers/mamba/ops
+ commands:
+ - pytest -v -s kernels/mamba
+
+- label: Kernels DeepGEMM Test (H100)
+ timeout_in_minutes: 45
+ gpu: h100
+ num_gpus: 1
+ source_file_dependencies:
+ - tools/install_deepgemm.sh
+ - vllm/utils/deep_gemm.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization
+ - tests/kernels/quantization/test_block_fp8.py
+ - tests/kernels/moe/test_deepgemm.py
+ - tests/kernels/moe/test_batched_deepgemm.py
+ - tests/kernels/attention/test_deepgemm_attention.py
+ commands:
+ - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+ - pytest -v -s kernels/moe/test_deepgemm.py
+ - pytest -v -s kernels/moe/test_batched_deepgemm.py
+ - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
+- label: Kernels (B200)
+ timeout_in_minutes: 30
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ # optional: true
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - csrc/attention/mla/
+ - csrc/quantization/cutlass_w8a8/moe/
+ - vllm/model_executor/layers/fused_moe/cutlass_moe.py
+ - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+ - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/attention/backends/mla/cutlass_mla.py
+ - vllm/v1/attention/backends/mla/flashinfer_mla.py
+ - vllm/platforms/cuda.py
+ - vllm/attention/selector.py
+ commands:
+ - nvidia-smi
+ - python3 examples/offline_inference/basic/chat.py
+ # Attention
+ # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+ - pytest -v -s tests/kernels/attention/test_attention_selector.py
+ - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
+ - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
+ - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+ - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
+ # Quantization
+ - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+ - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+ - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+ - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+ - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+ - pytest -v -s tests/kernels/moe/test_flashinfer.py
+ - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml
new file mode 100644
index 000000000000..9af43e0c375a
--- /dev/null
+++ b/.buildkite/test_areas/lm_eval.yaml
@@ -0,0 +1,46 @@
+group: LM Eval
+depends_on:
+ - image-build
+steps:
+- label: LM Eval Small Models
+ timeout_in_minutes: 75
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ autorun_on_main: true
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+
+- label: LM Eval Large Models (4 GPUs)(A100)
+ gpu: a100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+- label: LM Eval Large Models (4 GPUs)(H100)
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
+- label: LM Eval Small Models (B200)
+ timeout_in_minutes: 120
+ gpu: b200
+ optional: true
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml
new file mode 100644
index 000000000000..809b4138f44b
--- /dev/null
+++ b/.buildkite/test_areas/lora.yaml
@@ -0,0 +1,31 @@
+group: LoRA
+depends_on:
+ - image-build
+steps:
+- label: LoRA %N
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
+ parallelism: 4
+
+
+- label: LoRA TP (Distributed)
+ timeout_in_minutes: 30
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+    # FIXIT: find out which code initializes CUDA before running the test.
+    # Until that is fixed, we need to use the spawn method for these tests.
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ # There is some Tensor Parallelism related processing logic in LoRA that
+ # requires multi-GPU testing for validation.
+ - pytest -v -s -x lora/test_chatglm3_tp.py
+ - pytest -v -s -x lora/test_llama_tp.py
+ - pytest -v -s -x lora/test_llm_with_multi_loras.py
+ - pytest -v -s -x lora/test_olmoe_tp.py
+ - pytest -v -s -x lora/test_gptoss_tp.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml
new file mode 100644
index 000000000000..252af1e56a10
--- /dev/null
+++ b/.buildkite/test_areas/misc.yaml
@@ -0,0 +1,165 @@
+group: Miscellaneous
+depends_on:
+ - image-build
+steps:
+- label: V1 Others
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+ # split the test to avoid interference
+ - pytest -v -s -m 'not cpu_test' v1/core
+ - pytest -v -s v1/executor
+ - pytest -v -s v1/kv_offload
+ - pytest -v -s v1/sample
+ - pytest -v -s v1/logits_processors
+ - pytest -v -s v1/worker
+ - pytest -v -s v1/spec_decode
+ - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+ - pytest -v -s -m 'not cpu_test' v1/metrics
+ - pytest -v -s v1/test_oracle.py
+ - pytest -v -s v1/test_request.py
+ - pytest -v -s v1/test_outputs.py
+ # Integration test for streaming correctness (requires special branch).
+ - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
+ - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+
+- label: V1 Others (CPU)
+ depends_on: ~
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ no_gpu: true
+ commands:
+ # split the test to avoid interference
+ - pytest -v -s -m 'cpu_test' v1/core
+ - pytest -v -s v1/structured_output
+ - pytest -v -s v1/test_serial_utils.py
+ - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+ - pytest -v -s -m 'cpu_test' v1/metrics
+
+- label: Regression
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - vllm/
+ - tests/test_regression
+ commands:
+ - pip install modelscope
+ - pytest -v -s test_regression.py
+ working_dir: "/vllm-workspace/tests" # optional
+
+- label: Examples
+ timeout_in_minutes: 45
+ working_dir: "/vllm-workspace/examples"
+ source_file_dependencies:
+ - vllm/entrypoints
+ - vllm/multimodal
+ - examples/
+ commands:
+ - pip install tensorizer # for tensorizer test
+ - python3 offline_inference/basic/chat.py # for basic
+ - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+ - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+ - python3 offline_inference/basic/classify.py
+ - python3 offline_inference/basic/embed.py
+ - python3 offline_inference/basic/score.py
+ # for multi-modal models
+ - python3 offline_inference/audio_language.py --seed 0
+ - python3 offline_inference/vision_language.py --seed 0
+ - python3 offline_inference/vision_language_multi_image.py --seed 0
+ - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+ # for pooling models
+ - python3 pooling/pooling/vision_language_pooling.py --seed 0
+ # for features demo
+ - python3 offline_inference/prefix_caching.py
+ - python3 offline_inference/llm_engine_example.py
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+ - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+, causing this test to OOM on a 1xL4 GPU
+ - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+
+- label: Metrics, Tracing (2 GPUs)
+ timeout_in_minutes: 20
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/
+ - tests/v1/tracing
+ commands:
+ - "pip install \
+ 'opentelemetry-sdk>=1.26.0' \
+ 'opentelemetry-api>=1.26.0' \
+ 'opentelemetry-exporter-otlp>=1.26.0' \
+ 'opentelemetry-semantic-conventions-ai>=0.4.1'"
+ - pytest -v -s v1/tracing
+
+- label: Python-only Installation
+ depends_on: ~
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - tests/standalone_tests/python_only_compile.sh
+ - setup.py
+ commands:
+ - bash standalone_tests/python_only_compile.sh
+
+- label: Async Engine, Inputs, Utils, Worker
+ timeout_in_minutes: 50
+ source_file_dependencies:
+ - vllm/
+ - tests/multimodal
+ - tests/utils_
+ commands:
+ - pytest -v -s -m 'not cpu_test' multimodal
+ - pytest -v -s utils_
+
+- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
+ depends_on: ~
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/
+ - tests/test_inputs.py
+ - tests/test_outputs.py
+ - tests/multimodal
+ - tests/standalone_tests/lazy_imports.py
+ - tests/tokenizers_
+ - tests/tool_parsers
+ - tests/transformers_utils
+ - tests/config
+ no_gpu: true
+ commands:
+ - python3 standalone_tests/lazy_imports.py
+ - pytest -v -s test_inputs.py
+ - pytest -v -s test_outputs.py
+ - pytest -v -s -m 'cpu_test' multimodal
+ - pytest -v -s tokenizers_
+ - pytest -v -s tool_parsers
+ - pytest -v -s transformers_utils
+ - pytest -v -s config
+
+- label: GPT-OSS Eval (B200)
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ optional: true
+ source_file_dependencies:
+ - tests/evals/gpt_oss
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - uv pip install --system 'gpt-oss[eval]==0.0.5'
+ - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
+- label: Batch Invariance (H100)
+ timeout_in_minutes: 25
+ gpu: h100
+ source_file_dependencies:
+ - vllm/v1/attention
+ - vllm/model_executor/layers
+ - tests/v1/determinism/
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pip install pytest-timeout pytest-forked
+ - pytest -v -s v1/determinism/test_batch_invariance.py
+ - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/model_executor.yaml b/.buildkite/test_areas/model_executor.yaml
new file mode 100644
index 000000000000..996c8bb8b780
--- /dev/null
+++ b/.buildkite/test_areas/model_executor.yaml
@@ -0,0 +1,17 @@
+group: Model Executor
+depends_on:
+ - image-build
+steps:
+- label: Model Executor
+ timeout_in_minutes: 35
+ source_file_dependencies:
+ - vllm/engine/arg_utils.py
+ - vllm/config/model.py
+ - vllm/model_executor
+ - tests/model_executor
+ - tests/entrypoints/openai/test_tensorizer_entrypoint.py
+ commands:
+ - apt-get update && apt-get install -y curl libsodium23
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s model_executor
+ - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml
new file mode 100644
index 000000000000..39a5d51c4883
--- /dev/null
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -0,0 +1,62 @@
+group: Models - Basic
+depends_on:
+ - image-build
+steps:
+- label: Basic Models Tests (Initialization)
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_initialization.py
+ commands:
+ # Run a subset of model initialization tests
+ - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
+
+- label: Basic Models Tests (Extra Initialization) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/test_initialization.py
+ commands:
+    # Only run when vLLM model source is modified: test initialization of a large
+    # subset of supported models (the complement of the small subset in the above
+    # test). Also run if the model initialization test file is modified.
+ - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Basic Models Tests (Other)
+ timeout_in_minutes: 45
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_transformers.py
+ - tests/models/test_registry.py
+ commands:
+ - pytest -v -s models/test_transformers.py models/test_registry.py
+
+- label: Basic Models Test (Other CPU) # 5min
+ timeout_in_minutes: 10
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_utils.py
+ - tests/models/test_vision.py
+ no_gpu: true
+ commands:
+ - pytest -v -s models/test_utils.py models/test_vision.py
+
+- label: Transformers Nightly Models
+ working_dir: "/vllm-workspace/"
+ optional: true
+ soft_fail: true
+ commands:
+ - pip install --upgrade git+https://github.com/huggingface/transformers
+ - pytest -v -s tests/models/test_initialization.py
+ - pytest -v -s tests/models/test_transformers.py
+ - pytest -v -s tests/models/multimodal/processing/
+ - pytest -v -s tests/models/multimodal/test_mapping.py
+ - python3 examples/offline_inference/basic/chat.py
+ - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+ # Whisper needs spawn method to avoid deadlock
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
diff --git a/.buildkite/test_areas/models_distributed.yaml b/.buildkite/test_areas/models_distributed.yaml
new file mode 100644
index 000000000000..b6bfbf2ddab4
--- /dev/null
+++ b/.buildkite/test_areas/models_distributed.yaml
@@ -0,0 +1,22 @@
+group: Models - Distributed
+depends_on:
+ - image-build
+steps:
+- label: Distributed Model Tests (2 GPUs)
+ timeout_in_minutes: 50
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/model_executor/model_loader/sharded_state_loader.py
+ - vllm/model_executor/models/
+ - tests/basic_correctness/
+ - tests/model_executor/model_loader/test_sharded_state_loader.py
+ - tests/models/
+ commands:
+ - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
+ # Avoid importing model tests that cause CUDA reinitialization error
+ - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/language -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml
new file mode 100644
index 000000000000..f70192c4ebc0
--- /dev/null
+++ b/.buildkite/test_areas/models_language.yaml
@@ -0,0 +1,91 @@
+group: Models - Language
+depends_on:
+ - image-build
+steps:
+- label: Language Models Tests (Standard)
+ timeout_in_minutes: 25
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language
+ commands:
+ # Test standard language models, excluding a subset of slow tests
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/language -m 'core_model and (not slow_test)'
+
+- label: Language Models Tests (Extra Standard) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/language/pooling/test_embedding.py
+ - tests/models/language/generation/test_common.py
+ - tests/models/language/pooling/test_classification.py
+ commands:
+    # Shard the slow subset of standard language model tests. Only run when model
+    # source is modified or when the specified test files are modified.
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Language Models Tests (Hybrid) %N
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation
+ commands:
+ # Install fast path packages for testing against transformers
+ # Note: also needed to run plamo2 model in vLLM
+ - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+ - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+ # Shard hybrid language model tests
+ - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Language Models Test (Extended Generation) # 80min
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation
+ commands:
+ # Install fast path packages for testing against transformers
+ # Note: also needed to run plamo2 model in vLLM
+ - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+ - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+ - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
+
+- label: Language Models Test (PPL)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation_ppl_test
+ commands:
+ - pytest -v -s models/language/generation_ppl_test
+
+- label: Language Models Test (Extended Pooling) # 36min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/pooling
+ commands:
+ - pytest -v -s models/language/pooling -m 'not core_model'
+
+- label: Language Models Test (MTEB)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/pooling_mteb_test
+ commands:
+ - pytest -v -s models/language/pooling_mteb_test
diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml
new file mode 100644
index 000000000000..fc24068c20a4
--- /dev/null
+++ b/.buildkite/test_areas/models_multimodal.yaml
@@ -0,0 +1,79 @@
+group: Models - Multimodal
+depends_on:
+ - image-build
+steps:
+- label: Multi-Modal Models (Standard) # 60min
+ timeout_in_minutes: 80
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+ - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+
+- label: Multi-Modal Processor Test (CPU)
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ no_gpu: true
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Processor # 44min
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+ timeout_in_minutes: 70
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - vllm/multimodal/
+ - vllm/inputs/
+ - vllm/v1/core/
+ commands:
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
+- label: Multi-Modal Models (Extended) 1
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
+
+- label: Multi-Modal Models (Extended) 2
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+
+- label: Multi-Modal Models (Extended) 3
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
+
+# This test is used only in the PR development phase to test individual models and should never run on main
+- label: Custom Models
+ optional: true
+ commands:
+ - echo 'Testing custom models...'
+ # PR authors can temporarily add commands below to test individual models
+ # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+ # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml
new file mode 100644
index 000000000000..60c179aa098e
--- /dev/null
+++ b/.buildkite/test_areas/plugins.yaml
@@ -0,0 +1,34 @@
+group: Plugins
+depends_on:
+ - image-build
+steps:
+- label: Plugin Tests (2 GPUs)
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/plugins/
+ - tests/plugins/
+ commands:
+    # begin platform plugin and general plugin tests; all the code in between runs on the dummy platform
+ - pip install -e ./plugins/vllm_add_dummy_platform
+ - pytest -v -s plugins_tests/test_platform_plugins.py
+ - pip uninstall vllm_add_dummy_platform -y
+ # end platform plugin tests
+ # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+ - pip install -e ./plugins/prithvi_io_processor_plugin
+ - pytest -v -s plugins_tests/test_io_processor_plugins.py
+ - pip uninstall prithvi_io_processor_plugin -y
+ # end io_processor plugins test
+ # begin stat_logger plugins test
+ - pip install -e ./plugins/vllm_add_dummy_stat_logger
+ - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+ - pip uninstall dummy_stat_logger -y
+ # end stat_logger plugins test
+ # other tests continue here:
+ - pytest -v -s plugins_tests/test_scheduler_plugins.py
+ - pip install -e ./plugins/vllm_add_dummy_model
+ - pytest -v -s distributed/test_distributed_oot.py
+ - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+ - pytest -v -s models/test_oot_registration.py # it needs a clean process
+ - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml
new file mode 100644
index 000000000000..703c82eb1a91
--- /dev/null
+++ b/.buildkite/test_areas/pytorch.yaml
@@ -0,0 +1,50 @@
+group: PyTorch
+depends_on:
+ - image-build
+steps:
+- label: PyTorch Compilation Unit Tests
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ # Run unit tests defined directly under compile/,
+ # not including subdirectories, which are usually heavier
+ # tests covered elsewhere.
+ # Use `find` to launch multiple instances of pytest so that
+ # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+ - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"
+
+- label: PyTorch Fullgraph Smoke Test
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ # Run smoke tests under fullgraph directory, except test_full_graph.py
+ # as it is a heavy test that is covered in other steps.
+ # Use `find` to launch multiple instances of pytest so that
+ # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+ - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
+
+- label: PyTorch Fullgraph
+ timeout_in_minutes: 40
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ # fp8 kv scales not supported on sm89, tested on Blackwell instead
+ - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+ # Limit to no custom ops to reduce running time
+    # Wrap with quotes to escape YAML and avoid starting the -k string with a -
+ - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+
+- label: Pytorch Nightly Dependency Override Check # 2min
+  # If this test fails, it means the nightly torch version is not compatible with some
+  # of the dependencies. Please check the error message and add the package to the whitelist
+ # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
+ soft_fail: true
+ source_file_dependencies:
+ - requirements/nightly_torch_test.txt
+ commands:
+ - bash standalone_tests/pytorch_nightly_dependency.sh
\ No newline at end of file
diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml
new file mode 100644
index 000000000000..6e89d6af3b8d
--- /dev/null
+++ b/.buildkite/test_areas/quantization.yaml
@@ -0,0 +1,46 @@
+group: Quantization
+depends_on:
+ - image-build
+steps:
+- label: Quantization
+ timeout_in_minutes: 90
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ - tests/quantization
+ commands:
+    # Temporary install here since we need nightly; this will move to requirements/test.in
+    # after the torchao 0.12 release, pinning a working torchao nightly version here.
+
+    # Since torchao nightly is currently only compatible with torch nightly
+    # (https://github.com/pytorch/ao/issues/2919), we'll have to skip new torchao tests for now.
+    # We can only upgrade after this is resolved.
+ # TODO(jerryzh168): resolve the above comment
+ - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+ - uv pip install --system conch-triton-kernels
+ - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
+
+- label: Quantized MoE Test (B200)
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ source_file_dependencies:
+ - tests/quantization/test_blackwell_moe.py
+ - vllm/model_executor/models/deepseek_v2.py
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/models/llama4.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization/compressed_tensors
+ - vllm/model_executor/layers/quantization/modelopt.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - pytest -s -v tests/quantization/test_blackwell_moe.py
+
+- label: Quantized Models Test
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/model_executor/layers/quantization
+ - tests/models/quantization
+ commands:
+ - pytest -v -s models/quantization
diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml
new file mode 100644
index 000000000000..ad377148fd07
--- /dev/null
+++ b/.buildkite/test_areas/samplers.yaml
@@ -0,0 +1,14 @@
+group: Samplers
+depends_on:
+ - image-build
+steps:
+- label: Samplers Test
+ timeout_in_minutes: 75
+ source_file_dependencies:
+ - vllm/model_executor/layers
+ - vllm/sampling_metadata.py
+ - tests/samplers
+ - tests/conftest.py
+ commands:
+ - pytest -v -s samplers
+ - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
diff --git a/.buildkite/test_areas/tool_use.yaml b/.buildkite/test_areas/tool_use.yaml
new file mode 100644
index 000000000000..69527a121422
--- /dev/null
+++ b/.buildkite/test_areas/tool_use.yaml
@@ -0,0 +1,13 @@
+group: Tool Use
+depends_on:
+ - image-build
+steps:
+- label: OpenAI-Compatible Tool Use
+ timeout_in_minutes: 35
+ mirror_hardwares: [amdexperimental]
+ fast_check: false
+ source_file_dependencies:
+ - vllm/
+ - tests/tool_use
+ commands:
+ - pytest -v -s tool_use
diff --git a/.buildkite/test_areas/weight_loading.yaml b/.buildkite/test_areas/weight_loading.yaml
new file mode 100644
index 000000000000..cfc5bb20fe7a
--- /dev/null
+++ b/.buildkite/test_areas/weight_loading.yaml
@@ -0,0 +1,25 @@
+group: Weight Loading
+depends_on:
+ - image-build
+steps:
+- label: Weight Loading Multiple GPU # 33min
+ timeout_in_minutes: 45
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/weight_loading
+ commands:
+ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+
+- label: Weight Loading Multiple GPU - Large Models # optional
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ gpu: a100
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/weight_loading
+ commands:
+ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
diff --git a/.github/mergify.yml b/.github/mergify.yml
index 997a40e18e58..3ad79f93bc7a 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -14,6 +14,52 @@ pull_request_rules:
comment:
message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
+- name: comment-pre-commit-failure
+ description: Comment on PR when pre-commit check fails
+ conditions:
+ - status-failure=pre-commit
+ - -closed
+ - -draft
+ actions:
+ comment:
+ message: |
+ Hi @{{author}}, the pre-commit checks have failed. Please run:
+
+ ```bash
+ uv pip install pre-commit
+ pre-commit install
+ pre-commit run --all-files
+ ```
+
+ Then, commit the changes and push to your branch.
+
+ For future commits, `pre-commit` will run automatically on changed files before each commit.
+
+ > [!TIP]
+ >
+ > Is mypy or markdownlint failing?
+ >
+ > mypy and markdownlint are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
+ >
+ > ```bash
+ > # For mypy (substitute "3.10" with the failing version if needed)
+ > pre-commit run --hook-stage manual mypy-3.10
+ > # For markdownlint
+ > pre-commit run --hook-stage manual markdownlint
+ > ```
+ >
+
+- name: comment-dco-failure
+ description: Comment on PR when DCO check fails
+ conditions:
+ - status-failure=dco
+ - -closed
+ - -draft
+ actions:
+ comment:
+ message: |
+ Hi @{{author}}, the DCO check has failed. Please click on DCO in the Checks section for instructions on how to resolve this.
+
- name: label-ci-build
description: Automatically apply ci/build label
conditions:
@@ -140,7 +186,7 @@ pull_request_rules:
- files~=^tests/entrypoints/test_context.py
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
- - files~=^vllm/entrypoints/harmony_utils.py
+ - files~=^vllm/entrypoints/openai/parser/harmony_utils.py
- files~=^vllm/entrypoints/tool_server.py
- files~=^vllm/entrypoints/tool.py
- files~=^vllm/entrypoints/context.py
@@ -358,4 +404,4 @@ pull_request_rules:
actions:
label:
add:
- - kv-connector
\ No newline at end of file
+ - kv-connector
diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
index c3e132a536a4..df8910837715 100644
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -13,10 +13,10 @@ jobs:
steps:
- name: Checkout repository
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
- name: Set up Python
- uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+ uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: '3.12'
diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml
index 7d565ef9f2e4..629966b95933 100644
--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@@ -105,6 +105,31 @@ jobs:
}
],
},
+ cpu: {
+ // Keyword search - matches whole words only (with word boundaries)
+ keywords: [
+ {
+ term: "CPU Backend",
+ searchIn: "title"
+ },
+ {
+ term: "x86",
+ searchIn: "title"
+ },
+ {
+ term: "ARM",
+ searchIn: "title"
+ },
+ {
+ term: "Apple Silicon",
+ searchIn: "title"
+ },
+ {
+ term: "IBM Z",
+ searchIn: "title"
+ },
+ ],
+ },
// Add more label configurations here as needed
// example: {
// keywords: [...],
diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml
index a183033c9add..e80a5c0cc80f 100644
--- a/.github/workflows/macos-smoke-test.yml
+++ b/.github/workflows/macos-smoke-test.yml
@@ -12,7 +12,7 @@ jobs:
timeout-minutes: 30
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v6.0.1
- uses: astral-sh/setup-uv@v7
with:
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index e21d13b8161f..1041653c2f57 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -16,8 +16,8 @@ jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+ - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+ - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index dca3089f496c..44bf71db5e9d 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -7,13 +7,15 @@ on:
jobs:
close-issues-and-pull-requests:
+ # Prevents triggering on forks or other repos
+ if: github.repository == 'vllm-project/vllm'
permissions:
issues: write
pull-requests: write
actions: write
runs-on: ubuntu-latest
steps:
- - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
+ - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
with:
# Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 86746a0db4c0..cd52df86e034 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -354,8 +354,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
- # 9.0 for latest bf16 atomicAdd PTX
- cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
+
+ # marlin arches for fp16 output
+ cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+ # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
+ cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
+ # marlin arches for fp8 input
+ # - sm80 doesn't support fp8 computation
+ # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
+ # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
+ cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
+
if (MARLIN_ARCHS)
#
@@ -365,16 +374,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
+ list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
+ set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
- message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
- message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
+ message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+ message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
- if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
- OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
+ if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+ OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
- PYTHONPATH=$PYTHONPATH
- ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
+ PYTHONPATH=$ENV{PYTHONPATH}
+ ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
RESULT_VARIABLE marlin_generation_result
OUTPUT_VARIABLE marlin_generation_result
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
@@ -387,15 +398,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
else()
- set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
- CACHE STRING "Last run Marlin generate script hash" FORCE)
+ set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+ CACHE STRING "Last run Marlin generate script hash and arch" FORCE)
message(STATUS "Marlin generation completed successfully.")
endif()
else()
message(STATUS "Marlin generation script has not changed, skipping generation.")
endif()
- file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
+ file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_ARCHS}")
@@ -403,12 +414,34 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
-
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
+ file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
+ CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
+ PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+ endif()
+ list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
+
+ if (MARLIN_FP8_ARCHS)
+ file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
+ CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ set_source_files_properties(${MARLIN_TEMPLATE_FP8_KERNEL_SRC}
+ PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+ endif()
+ list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_FP8_KERNEL_SRC})
+ endif()
+
set(MARLIN_SRCS
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
+ "csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
set_gencode_flags_for_srcs(
@@ -604,12 +637,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
- "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
+ "csrc/quantization/fp4/nvfp4_experts_quant.cu"
+ "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
+ "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
+ list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else()
message(STATUS "Not building NVFP4 as no compatible archs were found.")
@@ -786,7 +822,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
- PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
+ PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$ENV{PYTHONPATH}
${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
RESULT_VARIABLE machete_generation_result
OUTPUT_VARIABLE machete_generation_output
@@ -838,7 +874,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
set(SRCS
- "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
+ "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu"
+ "csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu"
+ "csrc/quantization/cutlass_w4a8/w4a8_utils.cu"
+ )
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@@ -908,7 +947,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
"csrc/moe/moe_align_sum_kernels.cu"
- "csrc/moe/moe_lora_align_sum_kernels.cu"
"csrc/moe/topk_softmax_kernels.cu")
if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -938,8 +976,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
CUDA_ARCHS "${CUDA_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
- # 9.0 for latest bf16 atomicAdd PTX
- cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
+ # moe marlin arches
+ # note that we always set `use_atomic_add=False` for moe marlin now,
+ # so we don't need 9.0 for bf16 atomicAdd PTX
+ cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+ # moe marlin arches for fp8 input
+ # - sm80 doesn't support fp8 computation
+ # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
+ # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
+ cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
#
@@ -949,16 +994,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(MOE_MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
+ list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
+ set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MOE_MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
- message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
- message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
+ message(STATUS "Marlin MOE generation script hash with arch: ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+ message(STATUS "Last run Marlin MOE generate script hash with arch: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
- if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
- OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
+ if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+ OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
- PYTHONPATH=$PYTHONPATH
- ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
+ PYTHONPATH=$ENV{PYTHONPATH}
+ ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
RESULT_VARIABLE moe_marlin_generation_result
OUTPUT_VARIABLE moe_marlin_generation_output
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
@@ -971,7 +1018,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
else()
- set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
+ set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
message(STATUS "Marlin MOE generation completed successfully.")
endif()
@@ -979,16 +1026,28 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
endif()
- file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
+ file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
+ list(APPEND MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
set_gencode_flags_for_srcs(
- SRCS "${MOE_WNAA16_MARLIN_SRC}"
+ SRCS "${MARLIN_MOE_SRC}"
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
- set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
+ set_source_files_properties(${MARLIN_MOE_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
-
- list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
+ list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
+
+ if (MARLIN_MOE_FP8_ARCHS)
+ file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${MARLIN_MOE_FP8_SRC}"
+ CUDA_ARCHS "${MARLIN_MOE_FP8_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ set_source_files_properties(${MARLIN_MOE_FP8_SRC}
+ PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+ endif()
+ list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
+ endif()
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
else()
diff --git a/README.md b/README.md
index 033e1035d891..26222b815370 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
*Latest News* 🔥
+- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
@@ -136,16 +137,19 @@ Compute Resources:
- Alibaba Cloud
- AMD
- Anyscale
+- Arm
- AWS
- Crusoe Cloud
- Databricks
- DeepInfra
- Google Cloud
+- IBM
- Intel
- Lambda Lab
- Nebius
- Novita AI
- NVIDIA
+- Red Hat
- Replicate
- Roblox
- RunPod
diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md
index d1bdb4c43f10..9a9600e08daf 100644
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@@ -83,7 +83,7 @@ MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
```
-#### 2. Maximize Throughput with a Latency Requirement
+### 2. Maximize Throughput with a Latency Requirement
- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
- **Configuration**:
@@ -96,7 +96,7 @@ MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=500
```
-#### 3. Maximize Throughput with Prefix Caching and Latency Requirements
+### 3. Maximize Throughput with Prefix Caching and Latency Requirements
- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
- **Configuration**:
diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index 56b721cbb402..a245e2022e60 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -18,6 +18,11 @@ MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
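+# Resolve the local hostname once; the server binds to it and the health checks and benchmark clients below target the same host.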
+HOSTNAME=$(hostname)
+if [[ -z "$HOSTNAME" ]]; then
+ echo "Error: Failed to determine hostname." >&2
+ exit 1
+fi
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
@@ -82,6 +87,7 @@ start_server() {
"$MODEL"
"--disable-log-requests"
"--port" "8004"
+ "--host" "$HOSTNAME"
"--gpu-memory-utilization" "$gpu_memory_utilization"
"--max-num-seqs" "$max_num_seqs"
"--max-num-batched-tokens" "$max_num_batched_tokens"
@@ -96,8 +102,9 @@ start_server() {
# This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled
- VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
- vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
+ local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
+ VLLM_SERVER_DEV_MODE=1 \
+ vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else
# Start server without profiling
VLLM_SERVER_DEV_MODE=1 \
@@ -112,7 +119,7 @@ start_server() {
# since that we should always have permission to send signal to the server process.
kill -0 $server_pid 2> /dev/null || break
- RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
+ RESPONSE=$(curl -s -X GET "http://${HOSTNAME}:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1
@@ -172,6 +179,7 @@ run_benchmark() {
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 1000 \
--random-prefix-len $prefix_len \
+ --host "$HOSTNAME" \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -187,7 +195,7 @@ run_benchmark() {
request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do
# clear prefix cache
- curl -X POST http://0.0.0.0:8004/reset_prefix_cache
+ curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
vllm bench serve \
@@ -203,6 +211,7 @@ run_benchmark() {
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--random-prefix-len $prefix_len \
+ --host "$HOSTNAME" \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -303,6 +312,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--random-prefix-len $prefix_len \
+ --host "$HOSTNAME" \
--port 8004 \
--profile &> "$bm_log"
else
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 4021fede7215..831b76b66e09 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -620,7 +620,7 @@ def get_tokenizer(
kwargs["use_fast"] = False
if tokenizer_mode == "mistral":
try:
- from vllm.transformers_utils.tokenizer import MistralTokenizer
+ from vllm.tokenizers.mistral import MistralTokenizer
except ImportError as e:
raise ImportError(
"MistralTokenizer requires vllm package.\n"
diff --git a/benchmarks/benchmark_hash.py b/benchmarks/benchmark_hash.py
new file mode 100644
index 000000000000..08cdc012d652
--- /dev/null
+++ b/benchmarks/benchmark_hash.py
@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Micro benchmark comparing built-in hash(), SHA-256, and xxHash.
+
+This focuses on a single test payload shaped like the prefix-cache hash input:
+ (32-byte bytes object, 32-int tuple)
+
+Usage:
+    python benchmarks/benchmark_hash.py --iterations 20000
+"""
+
+from __future__ import annotations
+
+import argparse
+import random
+import statistics
+import time
+from collections.abc import Callable, Iterable
+
+from vllm.utils.hashing import sha256, xxhash
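+# sha256 / xxhash are vLLM's pickle-based hash helpers (hence the "(pickle)" labels).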
+
+
+def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]:
+ """Generate a deterministic test payload."""
+ random.seed(seed)
+ bytes_data = bytes(random.getrandbits(8) for _ in range(32))
+ int_tuple = tuple(random.randint(1, 1_000_000) for _ in range(32))
+ return (bytes_data, int_tuple)
+
+
+def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int):
+ """Return (avg_seconds, std_seconds) for hashing `data` `iterations` times."""
+ times: list[float] = []
+
+ # Warm-up to avoid first-run noise.
+ for _ in range(200):
+ func(data)
+
+ for _ in range(iterations):
+ start = time.perf_counter()
+ func(data)
+ end = time.perf_counter()
+ times.append(end - start)
+
+ avg = statistics.mean(times)
+ std = statistics.stdev(times) if len(times) > 1 else 0.0
+ return avg, std
+
+
+def _run_benchmarks(
+ benchmarks: Iterable[tuple[str, Callable[[tuple], object]]],
+ data: tuple,
+ iterations: int,
+):
+ """Yield (name, avg, std) for each benchmark, skipping unavailable ones."""
+ for name, func in benchmarks:
+ try:
+ avg, std = _benchmark_func(func, data, iterations)
+ except ModuleNotFoundError as exc:
+ print(f"Skipping {name}: {exc}")
+ continue
+ yield name, avg, std
+
+
+def builtin_hash(data: tuple) -> int:
+ """Wrapper for Python's built-in hash()."""
+ return hash(data)
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument(
+ "--iterations",
+ type=int,
+ default=10_000,
+ help="Number of measured iterations per hash function.",
+ )
+ parser.add_argument(
+ "--seed", type=int, default=42, help="Random seed for test payload."
+ )
+ args = parser.parse_args()
+
+ data = _generate_test_data(args.seed)
+ benchmarks = (
+ ("SHA256 (pickle)", sha256),
+ ("xxHash (pickle)", xxhash),
+ ("built-in hash()", builtin_hash),
+ )
+
+ print("=" * 60)
+ print("HASH FUNCTION MICRO BENCHMARK")
+ print("=" * 60)
+ print("Test data: (32-byte bytes object, 32-int tuple)")
+ print(f"Iterations: {args.iterations:,}")
+ print("=" * 60)
+
+ results = list(_run_benchmarks(benchmarks, data, args.iterations))
+ builtin_entry = next((r for r in results if r[0] == "built-in hash()"), None)
+
+ print("\nResults:")
+ for name, avg, std in results:
+ print(f" {name:16s}: {avg * 1e6:8.2f} ± {std * 1e6:6.2f} μs")
+
+ if builtin_entry:
+ _, builtin_avg, _ = builtin_entry
+ print("\n" + "=" * 60)
+ print("SUMMARY (relative to built-in hash())")
+ print("=" * 60)
+ for name, avg, _ in results:
+ if name == "built-in hash()":
+ continue
+ speed_ratio = avg / builtin_avg
+ print(f"• {name} is {speed_ratio:.1f}x slower than built-in hash()")
+ else:
+ print("\nBuilt-in hash() result missing; cannot compute speed ratios.")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py
index dedb564fffac..b5373d383b54 100644
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@@ -32,12 +32,11 @@ def benchmark_propose(args):
model_config = ModelConfig(
model="facebook/opt-125m",
- task="generate",
max_model_len=args.num_token + args.num_spec_token,
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
dtype="auto",
- seed=None,
+ seed=0,
trust_remote_code=False,
)
proposer = NgramProposer(
@@ -108,7 +107,10 @@ def benchmark_batched_propose(args):
device_config=DeviceConfig(device=current_platform.device_type),
parallel_config=ParallelConfig(),
load_config=LoadConfig(),
- scheduler_config=SchedulerConfig(),
+ scheduler_config=SchedulerConfig(
+ max_model_len=model_config.max_model_len,
+ is_encoder_decoder=model_config.is_encoder_decoder,
+ ),
)
# monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
diff --git a/benchmarks/benchmark_prefix_block_hash.py b/benchmarks/benchmark_prefix_block_hash.py
new file mode 100644
index 000000000000..8bcd8af0d310
--- /dev/null
+++ b/benchmarks/benchmark_prefix_block_hash.py
@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Simple benchmark to compare prefix-cache block hashing algorithms.
+
+Example:
+ python benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32
+"""
+
+from __future__ import annotations
+
+import argparse
+import random
+import statistics
+import sys
+import time
+from collections.abc import Callable, Iterable, Sequence
+
+from vllm.utils.hashing import get_hash_fn_by_name
+from vllm.v1.core.kv_cache_utils import BlockHash, hash_block_tokens, init_none_hash
+
+SUPPORTED_ALGOS = ("sha256", "sha256_cbor", "xxhash", "xxhash_cbor")
+
+
+def _generate_blocks(
+ num_blocks: int, block_size: int, vocab_size: int, seed: int
+) -> list[list[int]]:
+ rng = random.Random(seed)
+ return [
+ [rng.randrange(vocab_size) for _ in range(block_size)]
+ for _ in range(num_blocks)
+ ]
+
+
+def _hash_all_blocks(
+ hash_fn: Callable[[object], bytes],
+ blocks: Iterable[Sequence[int]],
+) -> float:
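+    # Chain hashes like the prefix cache: each block hash folds in its parent's hash.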
+ parent_hash: BlockHash | None = None
+ start = time.perf_counter()
+ for block in blocks:
+ parent_hash = hash_block_tokens(hash_fn, parent_hash, block, extra_keys=None)
+ end = time.perf_counter()
+ return end - start
+
+
+def _benchmark(
+ hash_algo: str,
+ blocks: list[list[int]],
+ trials: int,
+) -> tuple[float, float, float] | None:
+ try:
+ hash_fn = get_hash_fn_by_name(hash_algo)
+ init_none_hash(hash_fn)
+ timings = [_hash_all_blocks(hash_fn, blocks) for _ in range(trials)]
+ except ModuleNotFoundError as exc:
+ print(f"Skipping {hash_algo}: {exc}", file=sys.stderr)
+ return None
+
+ avg = statistics.mean(timings)
+ best = min(timings)
+ # throughput: tokens / second
+ tokens_hashed = len(blocks) * len(blocks[0])
+ throughput = tokens_hashed / best
+ return avg, best, throughput
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument("--num-blocks", type=int, default=10000, help="Block count.")
+ parser.add_argument("--block-size", type=int, default=32, help="Tokens per block.")
+ parser.add_argument(
+ "--vocab-size", type=int, default=32000, help="Token id range [0, vocab_size)."
+ )
+ parser.add_argument("--seed", type=int, default=0, help="Random seed.")
+ parser.add_argument(
+ "--trials", type=int, default=5, help="Number of timed trials per algorithm."
+ )
+ parser.add_argument(
+ "--algorithms",
+ nargs="+",
+ default=SUPPORTED_ALGOS,
+ choices=SUPPORTED_ALGOS,
+ help="Hash algorithms to benchmark.",
+ )
+ args = parser.parse_args()
+
+ blocks = _generate_blocks(
+ args.num_blocks, args.block_size, args.vocab_size, args.seed
+ )
+ print(
+ f"Benchmarking {len(args.algorithms)} algorithms on "
+ f"{args.num_blocks} blocks (block size={args.block_size})."
+ )
+
+ for algo in args.algorithms:
+ result = _benchmark(algo, blocks, args.trials)
+ if result is None:
+ continue
+
+ avg, best, throughput = result
+ print(
+ f"{algo:14s} avg: {avg:.6f}s best: {best:.6f}s "
+ f"throughput: {throughput / 1e6:.2f}M tokens/s"
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index 28fc383a318d..e6391134ff93 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -40,7 +40,7 @@
from vllm.utils.argparse_utils import FlexibleArgumentParser
try:
- from vllm.transformers_utils.tokenizer import get_tokenizer
+ from vllm.tokenizers import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index 55001cf3722a..33aca831883a 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -46,7 +46,7 @@
from transformers import PreTrainedTokenizerBase
try:
- from vllm.transformers_utils.tokenizer import get_tokenizer
+ from vllm.tokenizers import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer
@@ -574,7 +574,7 @@ async def limited_request_func(request_func_input, pbar):
)
print(
"{:<40} {:<10.2f}".format(
- "Total Token throughput (tok/s):", metrics.total_token_throughput
+ "Total token throughput (tok/s):", metrics.total_token_throughput
)
)
@@ -963,8 +963,7 @@ def create_argument_parser():
parser.add_argument(
"--profile",
action="store_true",
- help="Use Torch Profiler. The endpoint must be launched with "
- "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+ help="Use vLLM Profiling. --profiler-config must be provided on the server.",
)
parser.add_argument(
"--result-dir",
diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
index d809bf1db8cb..fb3329975cee 100644
--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@@ -14,6 +14,9 @@
import vllm._custom_ops as ops
from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+ per_token_group_quant_fp8,
+)
@dataclass
@@ -22,6 +25,7 @@ class bench_params_t:
hidden_size: int
add_residual: bool
dtype: torch.dtype
+ group_size: list[int]
def description(self):
return (
@@ -29,6 +33,7 @@ def description(self):
f"x D {self.hidden_size} "
f"x R {self.add_residual} "
f"x DT {self.dtype}"
+ f"x GS {self.group_size}"
)
@@ -38,10 +43,11 @@ def get_bench_params() -> list[bench_params_t]:
HIDDEN_SIZES = list(range(1024, 8129, 1024))
ADD_RESIDUAL = [True, False]
DTYPES = [torch.bfloat16, torch.float]
+ GROUP_SIZES = [[1, 64], [1, 128]]
- combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
+ combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES, GROUP_SIZES)
bench_params = list(
- map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
+ map(lambda x: bench_params_t(x[0], x[1], x[2], x[3], x[4]), combinations)
)
return bench_params
@@ -52,6 +58,7 @@ def unfused_int8_impl(
x: torch.Tensor,
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
+ group_size: list[int],
):
# Norm
torch_out = None
@@ -69,6 +76,7 @@ def unfused_fp8_impl(
x: torch.Tensor,
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
+ group_size: list[int],
):
# Norm
torch_out = None
@@ -81,23 +89,63 @@ def unfused_fp8_impl(
torch_out, _ = ops.scaled_fp8_quant(torch_out)
+def unfused_groupwise_fp8_impl(
+ rms_norm_layer: RMSNorm,
+ x: torch.Tensor,
+ residual: torch.Tensor | None,
+ quant_dtype: torch.dtype,
+ group_size: list[int],
+):
+ # Norm
+ torch_out = None
+ if residual is None:
+ torch_out = rms_norm_layer.forward_cuda(x, residual)
+ else:
+ torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
+
+ # Quant
+ torch_out, _ = per_token_group_quant_fp8(
+ torch_out, group_size=group_size[1], use_ue8m0=False
+ )
+
+
def fused_impl(
rms_norm_layer: RMSNorm, # this stores the weights
x: torch.Tensor,
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
+ group_size: list[int],
):
out, _ = ops.rms_norm_dynamic_per_token_quant(
x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual
)
+def fused_groupwise_impl(
+ rms_norm_layer: RMSNorm, # this stores the weights
+ x: torch.Tensor,
+ residual: torch.Tensor | None,
+ quant_dtype: torch.dtype,
+ group_size: list[int],
+):
+ out, _ = ops.rms_norm_per_block_quant(
+ x,
+ rms_norm_layer.weight,
+ 1e-6,
+ quant_dtype,
+ group_size,
+ residual=residual,
+ is_scale_transposed=True,
+ )
+
+
# Bench functions
def bench_fn(
rms_norm_layer: RMSNorm,
x: torch.Tensor,
residual: torch.Tensor,
quant_dtype: torch.dtype,
+ group_size: list[int],
label: str,
sub_label: str,
fn: Callable,
@@ -110,10 +158,11 @@ def bench_fn(
"x": x,
"residual": residual,
"quant_dtype": quant_dtype,
+ "group_size": group_size,
"fn": fn,
}
return TBenchmark.Timer(
- stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
+ stmt="fn(rms_norm_layer, x, residual, quant_dtype, group_size)",
globals=globals,
label=label,
sub_label=sub_label,
@@ -147,6 +196,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x,
residual,
torch.int8,
+ params.group_size,
label,
sub_label,
unfused_int8_impl,
@@ -161,6 +211,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x,
residual,
torch.float8_e4m3fn,
+ params.group_size,
label,
sub_label,
unfused_fp8_impl,
@@ -175,6 +226,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x,
residual,
torch.int8,
+ params.group_size,
label,
sub_label,
fused_impl,
@@ -189,6 +241,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x,
residual,
torch.float8_e4m3fn,
+ params.group_size,
label,
sub_label,
fused_impl,
@@ -196,6 +249,36 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
)
)
+ # unfused groupwise fp8 impl.
+ timers.append(
+ bench_fn(
+ layer,
+ x,
+ residual,
+ torch.float8_e4m3fn,
+ params.group_size,
+ label,
+ sub_label,
+ unfused_groupwise_fp8_impl,
+ "unfused_groupwise_fp8_impl",
+ )
+ )
+
+ # fused groupwise fp8 impl.
+ timers.append(
+ bench_fn(
+ layer,
+ x,
+ residual,
+ torch.float8_e4m3fn,
+ params.group_size,
+ label,
+ sub_label,
+ fused_groupwise_impl,
+ "fused_groupwise_fp8_impl",
+ )
+ )
+
print_timers(timers)
return timers
diff --git a/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
new file mode 100644
index 000000000000..04921dafbdbe
--- /dev/null
+++ b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
@@ -0,0 +1,244 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from dataclasses import dataclass
+from enum import Enum
+from itertools import product
+from typing import Any
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+ _per_token_group_quant_fp8_colmajor,
+ silu_mul_per_token_group_quant_fp8_colmajor,
+)
+from vllm.triton_utils import triton
+from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
+
+from .utils import ArgPool, Bench, CudaGraphBenchParams
+
+GROUP_SIZE = 128
+FLOAT8_T = torch.float8_e4m3fn
+
+
+def print_timers(timers: list[TMeasurement], cuda_graph_nops: int):
+ print(
+ f"Note : The timings reported above is for {cuda_graph_nops} "
+ "consecutive invocations of the benchmarking functions. "
+ f"Please divide by {cuda_graph_nops} for single invocation "
+ "timings."
+ )
+ compare = TBenchmark.Compare(timers)
+ compare.print()
+
+
+class ImplType(Enum):
+ SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR = 1
+ REFERENCE = 2
+
+ def get_impl(self):
+ if self == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
+ return silu_mul_per_token_group_quant_fp8_colmajor
+ elif self == ImplType.REFERENCE:
+ return reference
+ raise ValueError(f"Unrecognized ImplType {self}")
+
+
+@dataclass
+class BenchmarkTensors:
+ input: torch.Tensor
+ output: torch.Tensor
+
+ # Reference act output tensor
+ ref_act_out: torch.Tensor
+ ref_quant_out: torch.Tensor
+
+ @staticmethod
+ def make(T: int, N: int) -> "BenchmarkTensors":
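+        # silu_and_mul halves the last dim (gate/up), hence the GROUP_SIZE * 2 check.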
+ assert T % GROUP_SIZE == 0
+ assert N % (GROUP_SIZE * 2) == 0
+
+ input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
+
+ # silu_mul_per_token_group_quant_fp8_colmajor output.
+ output = torch.rand((T, N // 2), dtype=torch.bfloat16, device="cuda").to(
+ FLOAT8_T
+ )
+
+ # reference output.
+ ref_act_out = torch.empty((T, N // 2), dtype=torch.bfloat16, device="cuda")
+ ref_quant_out = torch.empty(
+ (T, N // 2), dtype=torch.bfloat16, device="cuda"
+ ).to(FLOAT8_T)
+
+ return BenchmarkTensors(
+ input=input,
+ output=output,
+ ref_act_out=ref_act_out,
+ ref_quant_out=ref_quant_out,
+ )
+
+ @property
+ def T(self):
+ return self.input.size(0)
+
+ @property
+ def N(self):
+ return self.input.size(1)
+
+ def make_impl_kwargs(self, impl_type: ImplType) -> dict[str, Any]:
+ if impl_type == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
+ return {
+ "input": self.input,
+ "output": self.output,
+ "use_ue8m0": is_deep_gemm_e8m0_used(),
+ }
+ elif impl_type == ImplType.REFERENCE:
+ return {
+ "input": self.input,
+ "act_out": self.ref_act_out,
+ "quant_out": self.ref_quant_out,
+ "use_ue8m0": is_deep_gemm_e8m0_used(),
+ }
+ raise ValueError(f"Unrecognized impl_type {impl_type}")
+
+
+def reference_quant(x: torch.Tensor, quant_out: torch.Tensor, use_ue8m0: bool):
+ """
+ Reference triton quant kernel from,
+ vllm.model_executor.layers.quantization.utils.fp8_utils
+ """
+ assert quant_out.size() == x.size()
+    # Allocate the scale tensor in column-major format.
+ shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1]
+ x_q = quant_out
+ x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)
+
+ M = x.numel() // GROUP_SIZE
+ N = GROUP_SIZE
+ BLOCK = triton.next_power_of_2(N)
+ # heuristics for number of warps
+ num_warps = min(max(BLOCK // 256, 1), 8)
+ num_stages = 1
+
+ finfo = torch.finfo(FLOAT8_T)
+ fp8_min = finfo.min
+ fp8_max = finfo.max
+
+ _per_token_group_quant_fp8_colmajor[(M,)](
+ x,
+ x_q,
+ x_s,
+ GROUP_SIZE,
+ x.shape[1],
+ x.stride(0),
+ x_s.stride(1),
+ eps=1e-10,
+ fp8_min=fp8_min,
+ fp8_max=fp8_max,
+ use_ue8m0=use_ue8m0,
+ BLOCK=BLOCK,
+ num_warps=num_warps,
+ num_stages=num_stages,
+ )
+ return x_q, x_s
+
+
+def reference(
+ input: torch.Tensor,
+ act_out: torch.Tensor,
+ quant_out: torch.Tensor,
+ use_ue8m0: bool,
+) -> tuple[torch.Tensor, torch.Tensor]:
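+    """Unfused reference: silu_and_mul activation followed by group-wise FP8 quant."""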
+ torch.ops._C.silu_and_mul(act_out, input)
+ return reference_quant(act_out, quant_out, use_ue8m0)
+
+
+def bench_impl(
+ bench_tensors: list[BenchmarkTensors], impl_type: ImplType
+) -> TMeasurement:
+ T = bench_tensors[0].T
+ N = bench_tensors[0].N
+
+ arg_pool_size = len(bench_tensors)
+ kwargs_list = [bt.make_impl_kwargs(impl_type) for bt in bench_tensors]
+
+ # warmup
+ for kwargs in kwargs_list:
+ impl_type.get_impl()(**kwargs)
+ torch.cuda.synchronize()
+
+ # Merge into a single kwargs and qualify arguments as ArgPool
+ kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
+ for _kwargs in kwargs_list:
+ for k, v in _kwargs.items():
+ kwargs[k].values.append(v)
+
+    cuda_graph_params = CudaGraphBenchParams(arg_pool_size)
+ timer = None
+ with Bench(
+ cuda_graph_params,
+ "silu-mul-quant",
+ f"num_tokens={T}, N={N}",
+ impl_type.name,
+ impl_type.get_impl(),
+ **kwargs,
+ ) as bench:
+ timer = bench.run()
+ return timer
+
+
+def test_correctness(T: int, N: int):
+ print(f"Testing num_tokens={T}, N={N} ...")
+
+ bench_tensor = BenchmarkTensors.make(T, N)
+
+ def output_from_impl(impl: ImplType) -> tuple[torch.Tensor, torch.Tensor]:
+ return impl.get_impl()(**bench_tensor.make_impl_kwargs(impl))
+
+ # reference output
+ ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
+
+    # test output
+ out_q, out_s = output_from_impl(
+ ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
+ )
+
+ torch.testing.assert_close(ref_out_q.to(torch.float32), out_q.to(torch.float32))
+ torch.testing.assert_close(ref_out_s, out_s)
+
+
+def run(Ts: list[int], Ns: list[int], arg_pool_size: int) -> list[TMeasurement]:
+ timers = []
+ for N, T in product(Ns, Ts):
+ test_correctness(T, N)
+
+ bench_tensors: list[BenchmarkTensors] = [
+ BenchmarkTensors.make(T, N) for _ in range(arg_pool_size)
+ ]
+
+ silu_mul_quant_timer = bench_impl(
+ bench_tensors, ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
+ )
+ timers.append(silu_mul_quant_timer)
+ reference_timer = bench_impl(bench_tensors, ImplType.REFERENCE)
+ timers.append(reference_timer)
+
+ print_timers(
+ [silu_mul_quant_timer, reference_timer], cuda_graph_nops=arg_pool_size
+ )
+
+ print_timers(timers, cuda_graph_nops=arg_pool_size)
+
+ return timers
+
+
+if __name__ == "__main__":
+ T = [128 * i for i in range(1, 16)] + [2048 * i for i in range(1, 65)]
+ N = [2048, 4096, 8192]
+
+ print(f"T = {T}, N = {N}")
+ run(T, N, arg_pool_size=8)
diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
index 8787724d77cf..ac78c019a59e 100644
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -237,6 +237,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
b_q_weight=w_q,
b_bias=None,
b_scales=w_s,
+ a_scales=None,
global_scale=None,
b_zeros=w_zp,
g_idx=g_idx,
diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py
index 12ca9214b1f9..48d790aec9e0 100644
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -263,7 +263,7 @@ def gen_allspark_params():
results.append(
benchmark.Timer(
- stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
+ stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
@@ -273,7 +273,7 @@ def gen_allspark_params():
results.append(
benchmark.Timer(
- stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
+ stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
diff --git a/benchmarks/kernels/benchmark_mla_k_concat.py b/benchmarks/kernels/benchmark_mla_k_concat.py
new file mode 100644
index 000000000000..fb3b6c8f1200
--- /dev/null
+++ b/benchmarks/kernels/benchmark_mla_k_concat.py
@@ -0,0 +1,150 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark script comparing torch.cat vs direct copy for k_nope/k_pe concatenation
+in MLA (Multi-head Latent Attention) prefill.
+
+This validates that the optimization from commit 8d4142bd is beneficial across
+various batch sizes, not just the originally tested batch size of 32768.
+"""
+
+import time
+from collections.abc import Callable
+
+import torch
+
+# DeepSeek-V3 MLA dimensions
+NUM_HEADS = 128
+QK_NOPE_HEAD_DIM = 128
+PE_DIM = 64
+
+
+def cat_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
+ """Original torch.cat approach with expand."""
+ return torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
+
+
+def direct_copy_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
+ """Optimized direct copy approach (avoids expand + cat overhead)."""
+ k = torch.empty(
+ (*k_nope.shape[:-1], k_nope.shape[-1] + k_pe.shape[-1]),
+ dtype=k_nope.dtype,
+ device=k_nope.device,
+ )
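+    # Write k_nope into the first slots and broadcast k_pe across heads into the rest.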
+ k[..., : k_nope.shape[-1]] = k_nope
+ k[..., k_nope.shape[-1] :] = k_pe
+ return k
+
+
+def benchmark_method(
+ method: Callable,
+ k_nope: torch.Tensor,
+ k_pe: torch.Tensor,
+ num_warmup: int = 10,
+ num_iters: int = 100,
+) -> float:
+ """Benchmark a concatenation method and return mean latency in ms."""
+ # Warmup
+ for _ in range(num_warmup):
+ _ = method(k_nope, k_pe)
+ torch.cuda.synchronize()
+
+ # Benchmark
+ start = time.perf_counter()
+ for _ in range(num_iters):
+ _ = method(k_nope, k_pe)
+ torch.cuda.synchronize()
+ end = time.perf_counter()
+
+ return (end - start) / num_iters * 1000 # Convert to ms
+
+
+@torch.inference_mode()
+def run_benchmark(dtype: torch.dtype, dtype_name: str):
+ """Run benchmark for a specific dtype."""
+ torch.set_default_device("cuda")
+
+ # Batch sizes to test (powers of 2 from 32 to 65536)
+ batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536]
+
+ print("=" * 80)
+ print("Benchmark: torch.cat vs direct copy for MLA k_nope/k_pe concatenation")
+ print("=" * 80)
+ print(
+ f"Tensor shapes: k_nope=[B, {NUM_HEADS}, {QK_NOPE_HEAD_DIM}], "
+ f"k_pe=[B, 1, {PE_DIM}]"
+ )
+ print(f"dtype: {dtype_name}")
+ print()
+ print(
+ f"{'Batch Size':>12} | {'cat (ms)':>10} | {'direct (ms)':>12} | "
+ f"{'Speedup':>8} | {'Reduction':>10}"
+ )
+ print("-" * 70)
+
+ results = []
+ for batch_size in batch_sizes:
+ # Create input tensors (generate in float32 then convert for FP8 compatibility)
+ k_nope = torch.randn(
+ batch_size, NUM_HEADS, QK_NOPE_HEAD_DIM, dtype=torch.float32, device="cuda"
+ ).to(dtype)
+ k_pe = torch.randn(
+ batch_size, 1, PE_DIM, dtype=torch.float32, device="cuda"
+ ).to(dtype)
+
+ # Benchmark both methods
+ cat_time = benchmark_method(cat_method, k_nope, k_pe)
+ direct_time = benchmark_method(direct_copy_method, k_nope, k_pe)
+
+ speedup = cat_time / direct_time
+ reduction = (1 - direct_time / cat_time) * 100
+
+ results.append((batch_size, cat_time, direct_time, speedup, reduction))
+
+ print(
+ f"{batch_size:>12} | {cat_time:>10.3f} | {direct_time:>12.3f} | "
+ f"{speedup:>7.2f}x | {reduction:>9.1f}%"
+ )
+
+ print("=" * 80)
+
+ # Summary statistics
+ speedups = [r[3] for r in results]
+ print("\nSpeedup summary:")
+ print(f" Min: {min(speedups):.2f}x")
+ print(f" Max: {max(speedups):.2f}x")
+ print(f" Mean: {sum(speedups) / len(speedups):.2f}x")
+
+ # Find crossover point
+ crossover_batch = None
+ for batch_size, _, _, speedup, _ in results:
+ if speedup >= 1.0:
+ crossover_batch = batch_size
+ break
+
+ print("\nConclusion:")
+ if crossover_batch:
+ print(f" - Direct copy becomes beneficial at batch size >= {crossover_batch}")
+ # Filter for large batches (>= 512 which is typical for prefill)
+ large_batch_speedups = [r[3] for r in results if r[0] >= 512]
+ if large_batch_speedups:
+ avg_large = sum(large_batch_speedups) / len(large_batch_speedups)
+ print(f" - For batch sizes >= 512: avg speedup = {avg_large:.2f}x")
+ print(" - MLA prefill typically uses large batches, so optimization is effective")
+
+ return results
+
+
+@torch.inference_mode()
+def main():
+ # Test bfloat16
+ print("\n")
+ run_benchmark(torch.bfloat16, "bfloat16")
+
+ # Test float8_e4m3fn
+ print("\n")
+ run_benchmark(torch.float8_e4m3fn, "float8_e4m3fn")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/kernels/benchmark_moe_align_block_size.py b/benchmarks/kernels/benchmark_moe_align_block_size.py
index f540cff6261a..5f9a131f79b0 100644
--- a/benchmarks/kernels/benchmark_moe_align_block_size.py
+++ b/benchmarks/kernels/benchmark_moe_align_block_size.py
@@ -24,12 +24,15 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
num_tokens_range = [1, 16, 256, 4096]
num_experts_range = [16, 64, 224, 256, 280, 512]
topk_range = [1, 2, 8]
-configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
+ep_size_range = [1, 8]
+configs = list(
+ itertools.product(num_tokens_range, num_experts_range, topk_range, ep_size_range)
+)
@triton.testing.perf_report(
triton.testing.Benchmark(
- x_names=["num_tokens", "num_experts", "topk"],
+ x_names=["num_tokens", "num_experts", "topk", "ep_size"],
x_vals=configs,
line_arg="provider",
line_vals=["vllm"],
@@ -38,16 +41,26 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
args={},
)
)
-def benchmark(num_tokens, num_experts, topk, provider):
+def benchmark(num_tokens, num_experts, topk, ep_size, provider):
"""Benchmark function for Triton."""
block_size = 256
+ torch.cuda.manual_seed_all(0)
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
+ e_map = None
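+    # Simulate EP: map a random subset of experts to local ids; the rest stay -1.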
+ if ep_size != 1:
+ local_e = num_experts // ep_size
+ e_ids = torch.randperm(num_experts, device="cuda", dtype=torch.int32)[:local_e]
+ e_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32)
+ e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)
+
quantiles = [0.5, 0.2, 0.8]
if provider == "vllm":
ms, min_ms, max_ms = triton.testing.do_bench(
- lambda: moe_align_block_size(topk_ids, block_size, num_experts),
+ lambda: moe_align_block_size(
+ topk_ids, block_size, num_experts, e_map, ignore_invalid_experts=True
+ ),
quantiles=quantiles,
)
diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py
index 83bd91917508..09de5fa822f8 100644
--- a/benchmarks/kernels/benchmark_mrope.py
+++ b/benchmarks/kernels/benchmark_mrope.py
@@ -99,7 +99,6 @@ def benchmark_mrope(
# the parameters to compute the q k v size based on tp_size
mrope_helper_class = get_rope(
head_size=head_dim,
- rotary_dim=head_dim,
max_position=max_position,
is_neox_style=is_neox_style,
rope_parameters=rope_parameters,
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index 074b7a440b61..7a1bc050bb33 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -32,8 +32,8 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device):
def benchmark(batch_size, seq_len, num_heads, provider):
dtype = torch.bfloat16
max_position = 8192
- base = 10000
- rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
+ rope_parameters = {"partial_rotary_factor": rotary_dim / head_size}
+ rope = get_rope(head_size, max_position, is_neox_style, rope_parameters)
rope = rope.to(dtype=dtype, device=device)
cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device)
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index fbbb03c5ed46..85b286f8d8d0 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -251,17 +251,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
endif()
# Build ACL with CMake
- set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
- set(CMAKE_BUILD_TYPE "Release")
- set(ARM_COMPUTE_ARCH "armv8.2-a")
- set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
- set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
- set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
- set(ARM_COMPUTE_ENABLE_OPENMP "ON")
- set(ARM_COMPUTE_ENABLE_WERROR "OFF")
- set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
- set(ARM_COMPUTE_BUILD_TESTING "OFF")
-
set(_cmake_config_cmd
${CMAKE_COMMAND} -G Ninja -B build
-DARM_COMPUTE_BUILD_SHARED_LIB=OFF
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index ca0062ba4fab..bdb2ba74d944 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -140,16 +140,21 @@ function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
run_python(_VLLM_TORCH_GOMP_PATH
"
import os, glob
-try:
- import torch
- torch_pkg = os.path.dirname(torch.__file__)
- site_root = os.path.dirname(torch_pkg)
- torch_libs = os.path.join(site_root, 'torch.libs')
- print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
-except:
- print('')
+import torch
+torch_pkg = os.path.dirname(torch.__file__)
+site_root = os.path.dirname(torch_pkg)
+
+# Search both torch.libs and torch/lib
+roots = [os.path.join(site_root, 'torch.libs'), os.path.join(torch_pkg, 'lib')]
+candidates = []
+for root in roots:
+ if not os.path.isdir(root):
+ continue
+ candidates.extend(glob.glob(os.path.join(root, 'libgomp*.so*')))
+
+print(candidates[0] if candidates else '')
"
- "failed to probe torch.libs for libgomp")
+ "failed to probe for libgomp")
if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
return()
@@ -495,7 +500,13 @@ function (define_extension_target MOD_NAME)
set(SOABI_KEYWORD "")
endif()
- if (ARG_USE_SABI)
+ run_python(IS_FREETHREADED_PYTHON
+ "import sysconfig; print(1 if sysconfig.get_config_var(\"Py_GIL_DISABLED\") else 0)"
+ "Failed to determine whether interpreter is free-threaded")
+
+ # Free-threaded Python doesn't yet support the stable ABI (see PEP 803/809),
+  # so skip the stable ABI only when the interpreter is free-threaded.
+ if (ARG_USE_SABI AND NOT IS_FREETHREADED_PYTHON)
Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}")
else()
Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}")
diff --git a/csrc/attention/merge_attn_states.cu b/csrc/attention/merge_attn_states.cu
index 229d9862fb67..27d1e990c611 100644
--- a/csrc/attention/merge_attn_states.cu
+++ b/csrc/attention/merge_attn_states.cu
@@ -16,7 +16,8 @@ __global__ void merge_attn_states_kernel(
scalar_t* output, float* output_lse, const scalar_t* prefix_output,
const float* prefix_lse, const scalar_t* suffix_output,
const float* suffix_lse, const uint num_tokens, const uint num_heads,
- const uint head_size) {
+ const uint head_size, const uint prefix_head_stride,
+ const uint output_head_stride) {
using pack_128b_t = uint4;
const uint pack_size = 16 / sizeof(scalar_t);
const uint threads_per_head = head_size / pack_size;
@@ -34,11 +35,13 @@ __global__ void merge_attn_states_kernel(
const uint head_idx = token_head_idx % num_heads;
const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc.
- const uint head_offset =
- token_idx * num_heads * head_size + head_idx * head_size;
- const scalar_t* prefix_head_ptr = prefix_output + head_offset;
- const scalar_t* suffix_head_ptr = suffix_output + head_offset;
- scalar_t* output_head_ptr = output + head_offset;
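+  // Use the tensors' head strides so heads need not be contiguous (stride may exceed head_size).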
+ const uint src_head_offset = token_idx * num_heads * prefix_head_stride +
+ head_idx * prefix_head_stride;
+ const uint dst_head_offset = token_idx * num_heads * output_head_stride +
+ head_idx * output_head_stride;
+ const scalar_t* prefix_head_ptr = prefix_output + src_head_offset;
+ const scalar_t* suffix_head_ptr = suffix_output + src_head_offset;
+ scalar_t* output_head_ptr = output + dst_head_offset;
float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
@@ -140,7 +143,7 @@ __global__ void merge_attn_states_kernel(
reinterpret_cast(prefix_lse.data_ptr()), \
reinterpret_cast(suffix_output.data_ptr()), \
reinterpret_cast(suffix_lse.data_ptr()), num_tokens, \
- num_heads, head_size); \
+ num_heads, head_size, prefix_head_stride, output_head_stride); \
}
/*@brief Merges the attention states from prefix and suffix
@@ -166,17 +169,11 @@ void merge_attn_states_launcher(torch::Tensor& output,
const uint num_tokens = output.size(0);
const uint num_heads = output.size(1);
const uint head_size = output.size(2);
+ const uint prefix_head_stride = prefix_output.stride(1);
+ const uint output_head_stride = output.stride(1);
const uint pack_size = 16 / sizeof(scalar_t);
TORCH_CHECK(head_size % pack_size == 0,
"headsize must be multiple of pack_size:", pack_size);
- TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
- "output heads must be contiguous in memory");
- TORCH_CHECK(
- prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
- "prefix_output heads must be contiguous in memory");
- TORCH_CHECK(
- suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
- "suffix_output heads must be contiguous in memory");
float* output_lse_ptr = nullptr;
if (output_lse.has_value()) {
output_lse_ptr = output_lse.value().data_ptr();
diff --git a/csrc/cache.h b/csrc/cache.h
index f2a5ec0acf5c..cbe44c09eb62 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -1,6 +1,7 @@
#pragma once
#include
+#include
#include