Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
bee13ba
Add a one-off workflow to benchmark gpt-oss
huydhn Aug 6, 2025
56e4451
Fix workflow syntax
huydhn Aug 6, 2025
19e9e9b
Add the models
huydhn Aug 6, 2025
96e08d6
More tweaks
huydhn Aug 6, 2025
f76c72f
Would it work?
huydhn Aug 6, 2025
ca94597
More tweaks
huydhn Aug 6, 2025
456df1a
Fix workflow syntax
huydhn Aug 6, 2025
052b329
Debug
huydhn Aug 6, 2025
180a976
KISS
huydhn Aug 6, 2025
dd34649
Ready to debug
huydhn Aug 6, 2025
fbfb2bb
Another try
huydhn Aug 7, 2025
3189a5d
Login to ECR
huydhn Aug 7, 2025
5176b11
Debug
huydhn Aug 7, 2025
3ef58c4
Add accuracy check
huydhn Aug 7, 2025
30e0138
Another tweak
huydhn Aug 7, 2025
ba28e1e
Really?
huydhn Aug 7, 2025
044f1c3
Debug
huydhn Aug 7, 2025
a17fe1a
Move the logic to a script
huydhn Aug 7, 2025
a7a1664
Better now
huydhn Aug 7, 2025
1ddea76
You're an odd one, ain't you?
huydhn Aug 7, 2025
d46a0a6
Another attempt
huydhn Aug 7, 2025
898b35e
Need newer setuptools
huydhn Aug 7, 2025
78f1493
Another try
huydhn Aug 7, 2025
577c1ab
Increase the timeout to 12h
huydhn Aug 7, 2025
73e8373
Another round
huydhn Aug 7, 2025
9cab527
It seems to work now
huydhn Aug 7, 2025
585b42b
Let's run everything now
huydhn Aug 8, 2025
9960023
Use bigger runners
huydhn Aug 8, 2025
623d293
Another round of perf benchmark
huydhn Aug 8, 2025
12375c8
Why CI not running
huydhn Aug 8, 2025
82fa731
Merge branch 'main' into one-off-gpt-oss-benchmark
huydhn Aug 8, 2025
ce29b20
Benchmark aime25
huydhn Aug 8, 2025
8cb19ee
Small bug
huydhn Aug 8, 2025
d22c815
Make upload benchmark results optional
huydhn Aug 8, 2025
0514931
Ugh
huydhn Aug 8, 2025
c2f3dc7
Darn it, I forgot to upload the results
huydhn Aug 8, 2025
d641866
Let's just keep perf run then
huydhn Aug 9, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 97 additions & 0 deletions .github/scripts/gpt-oss/run_accuracy_checks.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/bin/bash
# Run gpt-oss accuracy checks (aime25 evals at low/medium/high reasoning
# effort) against a locally served vLLM instance.
#
# Required env vars: DEVICE_TYPE, DEVICE_NAME, MODEL.

set -eux

# Device-specific vLLM tuning, per the official recipe:
# https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html
if [[ "${DEVICE_TYPE}" == *B200* ]]; then
  export VLLM_USE_TRTLLM_ATTENTION=1
  export VLLM_USE_TRTLLM_DECODE_ATTENTION=1
  export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=1
  export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=1
elif [[ "${DEVICE_NAME}" == *rocm* ]]; then
  export VLLM_ROCM_USE_AITER=1
  export VLLM_USE_AITER_UNIFIED_ATTENTION=1
  export VLLM_ROCM_USE_AITER_MHA=0
fi

# Pick the tensor parallel size for the model. Fail fast on an unknown model
# instead of falling through and passing an invalid --tensor_parallel_size 0
if [[ "${MODEL}" == "openai/gpt-oss-120b" ]]; then
  tp=4
elif [[ "${MODEL}" == "openai/gpt-oss-20b" ]]; then
  tp=1
else
  echo "Unsupported model: ${MODEL}" >&2
  exit 1
fi

# Start the vLLM server in the background and remember its PID for cleanup
vllm serve "${MODEL}" --tensor_parallel_size "${tp}" &
server_pid=$!

wait_for_server() {
  # curl exits 0 as soon as the server answers at all (any HTTP status), so
  # this polls until the endpoint is reachable or the 20-minute timeout hits
  timeout 1200 bash -c '
    until curl -X POST localhost:8000/v1/completions; do
      sleep 1
    done'
}

if wait_for_server; then
  echo "vLLM server is up and running"
else
  # Abort instead of running evals against a server that never came up
  echo "vLLM failed to start within the timeout period" >&2
  kill "${server_pid}" || true
  exit 1
fi

pushd vllm-benchmarks/gpt-oss
mkdir -p /tmp/gpqa_openai
mkdir -p /tmp/aime25_openai

# Use half the cores for eval threads, but always at least one
n_threads=$(( $(nproc) / 2 ))
(( n_threads >= 1 )) || n_threads=1

if [[ "${DEVICE_NAME}" == "rocm" ]]; then
  # Not sure why this is needed on ROCm, but the evals module is only
  # importable from inside the gpt_oss directory there
  pushd gpt_oss
  eval_module=evals
else
  eval_module=gpt_oss.evals
fi

# Run the aime25 eval at every reasoning-effort level. OPENAI_API_KEY is
# blanked on purpose: the eval talks to the local vLLM server, not OpenAI
for effort in low medium high; do
  OPENAI_API_KEY="" python3 -m "${eval_module}" --base-url http://localhost:8000/v1 \
    --model "${MODEL}" \
    --eval aime25 \
    --reasoning-effort "${effort}" \
    --n-threads "${n_threads}"
done

if [[ "${DEVICE_NAME}" == "rocm" ]]; then
  popd
fi

# Collect the result directories where the workflow expects to find them
mv /tmp/gpqa_openai .
mv /tmp/aime25_openai .
popd

# Stop the server gracefully; ignore failure if it has already exited
kill "${server_pid}" || true
30 changes: 30 additions & 0 deletions .github/scripts/gpt-oss/run_benchmarks.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash
# Run vLLM performance benchmarks for gpt-oss models inside the CI container.
#
# Required env vars: DEVICE_TYPE, DEVICE_NAME.

set -eux

# Device-specific vLLM tuning, per the official recipe:
# https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html
if [[ "${DEVICE_TYPE}" == *B200* ]]; then
  export VLLM_USE_TRTLLM_ATTENTION=1
  export VLLM_USE_TRTLLM_DECODE_ATTENTION=1
  export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=1
  export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=1
elif [[ "${DEVICE_NAME}" == *rocm* ]]; then
  export VLLM_ROCM_USE_AITER=1
  export VLLM_USE_AITER_UNIFIED_ATTENTION=1
  export VLLM_ROCM_USE_AITER_MHA=0
fi

pushd vllm-benchmarks/vllm
# Best-effort sync of the benchmark utils into the image's vLLM install; the
# destination only exists in some images, so the failure is ignored on purpose
cp vllm/benchmarks/lib/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true

if [[ "${DEVICE_NAME}" != "rocm" ]]; then
  # The ROCm image already ships a prebuilt vLLM; elsewhere install the
  # gpt-oss preview wheel and its updated dependencies
  pip install -U openai transformers setuptools
  pip install --pre vllm==0.10.1+gptoss \
    --extra-index-url https://wheels.vllm.ai/gpt-oss/ \
    --extra-index-url https://download.pytorch.org/whl/nightly/cu128
fi

# Record the exact package set for debugging, then run the perf benchmarks
pip freeze
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
popd
244 changes: 244 additions & 0 deletions .github/workflows/gpt-oss-benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
name: gpt-oss benchmark

on:
  pull_request:
    paths:
      - .github/workflows/gpt-oss-benchmark.yml

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

jobs:
  benchmarks:
    name: Run gpt-oss benchmarks
    strategy:
      matrix:
        include:
          # gpt-oss-120b
          - runner: linux.aws.h100.8
            model: openai/gpt-oss-120b
            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
          - runner: linux.dgx.b200.8
            model: openai/gpt-oss-120b
            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
          - runner: linux.rocm.gpu.gfx942.8
            model: openai/gpt-oss-120b
            docker-image: rocm/vllm-dev:open-mi300-08052025
          # gpt-oss-20b
          - runner: linux.aws.h100.4
            model: openai/gpt-oss-20b
            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
          - runner: linux.dgx.b200.8
            model: openai/gpt-oss-20b
            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
          - runner: linux.rocm.gpu.gfx942.8
            model: openai/gpt-oss-20b
            docker-image: rocm/vllm-dev:open-mi300-08052025
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    environment: pytorch-x-vllm
    permissions:
      id-token: write
      contents: read
    # Benchmarks plus accuracy evals can take a long time; allow up to 12h
    timeout-minutes: 720
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Checkout vLLM repository
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          path: vllm-benchmarks/vllm

      - name: Checkout gpt-oss repository
        uses: actions/checkout@v4
        with:
          repository: openai/gpt-oss
          path: vllm-benchmarks/gpt-oss

      - uses: actions/setup-python@v5
        # Amazon Linux fails on this step
        continue-on-error: true
        with:
          python-version: '3.12'
          cache: 'pip'

      # Detect the accelerator type and export DEVICE_NAME for later steps
      - name: Check if the device is supported
        shell: bash
        run: |
          set -eux

          if command -v nvidia-smi; then
            DEVICE_NAME=cuda
            nvidia-smi
          elif command -v rocm-smi; then
            DEVICE_NAME=rocm
            rocm-smi
          else
            DEVICE_NAME=cpu
            lscpu
          fi
          echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV

      - name: Set GPU name and type
        working-directory: vllm-benchmarks
        shell: bash
        run: |
          set -eux

          if [[ "${DEVICE_NAME}" == "cuda" ]]; then
            DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}')
          elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
            DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
            DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
          fi
          echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV

      - name: Install dependencies
        shell: bash
        run: |
          set -eux

          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
            pip install -r .github/scripts/requirements.txt \
              --extra-index-url https://download.pytorch.org/whl/rocm6.3
          else
            pip install -r .github/scripts/requirements.txt \
              --extra-index-url https://download.pytorch.org/whl/cu128
          fi

      - name: Setup CUDA GPU_FLAG for docker run
        if: env.DEVICE_NAME == 'cuda'
        run: |
          echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"

      - name: Setup ROCm
        if: env.DEVICE_NAME == 'rocm'
        uses: pytorch/pytorch/./.github/actions/setup-rocm@main

      - name: Setup benchmark tests
        env:
          MODEL: ${{ matrix.model }}
        run: |
          set -eux

          pushd vllm-benchmarks/vllm
          rm .buildkite/nightly-benchmarks/tests/*.json
          popd

          # Set the list of benchmarks we want to cover in this runner
          python3 .github/scripts/setup_vllm_benchmark.py \
            --from-benchmark-configs-dir vllm-benchmarks/benchmarks \
            --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \
            --models "${MODEL}" \
            --device "${DEVICE_NAME}"

          pushd vllm-benchmarks/vllm
          ls -lah .buildkite/nightly-benchmarks/tests
          find .buildkite/nightly-benchmarks/tests -type f -exec cat {} \;

      - name: Run vLLM gpt-oss benchmark
        env:
          # To login to public.ecr.aws
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          DOCKER_IMAGE: ${{ matrix.docker-image }}
          # vLLM-related environment variables
          ENGINE_VERSION: v1
          SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
          MODEL: ${{ matrix.model }}
        run: |
          set -eux

          if [[ "${DEVICE_TYPE}" == *B200* ]]; then
            # B200 runners have no IAM role with ECR access, so login explicitly
            aws configure set aws_access_key_id "${AWS_ACCESS_KEY_ID}"
            aws configure set aws_secret_access_key "${AWS_SECRET_ACCESS_KEY}"
            aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
          fi

          # Leaving 1GB for the runner and other things
          TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
          # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
          # comes from https://github.com/pytorch/test-infra/pull/6058
          TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))

          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e MODEL \
            -e DEVICE_NAME \
            -e DEVICE_TYPE \
            -e HF_TOKEN \
            -e ENGINE_VERSION \
            -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
            --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
            --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
            --ipc=host \
            --tty \
            --detach \
            --security-opt seccomp=unconfined \
            --shm-size=4g \
            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
            -w /tmp/workspace \
            "${DOCKER_IMAGE}"
          )

          # Run perf tests
          docker exec -t "${container_name}" bash .github/scripts/gpt-oss/run_benchmarks.sh

          # Run accuracy checks
          docker exec -t "${container_name}" bash .github/scripts/gpt-oss/run_accuracy_checks.sh

      - name: Authenticate with AWS
        # AWS CUDA runners already have access to the bucket via its runner IAM role
        if: env.DEVICE_NAME == 'rocm' || contains(env.DEVICE_TYPE, 'B200')
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
          # The max duration enforced by the server side
          role-duration-seconds: 18000
          aws-region: us-east-1

      - name: Upload the benchmark results
        continue-on-error: true
        env:
          BENCHMARK_RESULTS: vllm-benchmarks/vllm/benchmarks/results
          MODEL: ${{ matrix.model }}
        run: |
          set -eux

          sudo chown -R ${UID} "${BENCHMARK_RESULTS}"
          ls -lah "${BENCHMARK_RESULTS}"

          # Artifact names may not contain '/', spaces, or other special chars
          SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alnum:].-]/_/g")
          SANITIZED_MODEL="${MODEL//\//_}"

          echo "SANITIZED_DEVICE_TYPE=$SANITIZED_DEVICE_TYPE" >> $GITHUB_ENV
          echo "SANITIZED_MODEL=$SANITIZED_MODEL" >> $GITHUB_ENV

          python3 .github/scripts/upload_benchmark_results.py \
            --repo vllm-benchmarks/vllm \
            --benchmark-name "vLLM benchmark" \
            --benchmark-results "${BENCHMARK_RESULTS}" \
            --device-name "${DEVICE_NAME}" \
            --device-type "${SANITIZED_DEVICE_TYPE}" \
            --model "${MODEL//\//_}"

      # Keep a copy of the benchmark results on GitHub for reference
      - uses: actions/upload-artifact@v4
        with:
          name: benchmark-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODEL }}
          path: vllm-benchmarks/vllm/benchmarks/results

      # Keep a copy of the accuracy results on GitHub for reference
      - uses: actions/upload-artifact@v4
        with:
          name: accuracy-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODEL }}
          path: |
            vllm-benchmarks/gpt-oss/gpqa_openai
            vllm-benchmarks/gpt-oss/aime25_openai
4 changes: 0 additions & 4 deletions .github/workflows/vllm-benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,6 @@ on:
required: true
type: string
default: h100,rocm,spr,b200
pull_request:
paths:
- .github/workflows/vllm-benchmark.yml
- vllm-benchmarks/**

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
Expand Down
Loading
Loading