Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
bee13ba
Add a one-off workflow to benchmark gpt-oss
huydhn Aug 6, 2025
56e4451
Fix workflow syntax
huydhn Aug 6, 2025
19e9e9b
Add the models
huydhn Aug 6, 2025
96e08d6
More tweaks
huydhn Aug 6, 2025
f76c72f
Would it work?
huydhn Aug 6, 2025
ca94597
More tweaks
huydhn Aug 6, 2025
456df1a
Fix workflow syntax
huydhn Aug 6, 2025
052b329
Debug
huydhn Aug 6, 2025
180a976
KISS
huydhn Aug 6, 2025
dd34649
Ready to debug
huydhn Aug 6, 2025
fbfb2bb
Another try
huydhn Aug 7, 2025
3189a5d
Login to ECR
huydhn Aug 7, 2025
5176b11
Debug
huydhn Aug 7, 2025
3ef58c4
Add accuracy check
huydhn Aug 7, 2025
30e0138
Another tweak
huydhn Aug 7, 2025
ba28e1e
Really?
huydhn Aug 7, 2025
044f1c3
Debug
huydhn Aug 7, 2025
a17fe1a
Move the logic to a script
huydhn Aug 7, 2025
a7a1664
Better now
huydhn Aug 7, 2025
1ddea76
You're an odd one, ain't you?
huydhn Aug 7, 2025
d46a0a6
Another attempt
huydhn Aug 7, 2025
898b35e
Need newer setuptools
huydhn Aug 7, 2025
78f1493
Another try
huydhn Aug 7, 2025
577c1ab
Increase the timeout to 12h
huydhn Aug 7, 2025
73e8373
Another round
huydhn Aug 7, 2025
9cab527
It seems to work now
huydhn Aug 7, 2025
585b42b
Let's run everything now
huydhn Aug 8, 2025
9960023
Use bigger runners
huydhn Aug 8, 2025
623d293
Another round of perf benchmark
huydhn Aug 8, 2025
12375c8
Why CI not running
huydhn Aug 8, 2025
82fa731
Merge branch 'main' into one-off-gpt-oss-benchmark
huydhn Aug 8, 2025
ce29b20
Benchmark aime25
huydhn Aug 8, 2025
8cb19ee
Small bug
huydhn Aug 8, 2025
d22c815
Make upload benchmark results optional
huydhn Aug 8, 2025
0514931
Ugh
huydhn Aug 8, 2025
c2f3dc7
Darn it, I forgot to upload the results
huydhn Aug 8, 2025
d641866
Let's just keep perf run then
huydhn Aug 9, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 97 additions & 0 deletions .github/scripts/gpt-oss/run_accuracy_checks.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/bin/bash
# Run gpt-oss accuracy checks (aime25 evals at low/medium/high reasoning
# effort) against a locally served vLLM instance.
#
# Required env vars: DEVICE_TYPE, DEVICE_NAME, MODEL.

set -eux

# Device-specific vLLM tuning, per the official recipe:
# https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html
if [[ "${DEVICE_TYPE}" == *B200* ]]; then
  export VLLM_USE_TRTLLM_ATTENTION=1
  export VLLM_USE_TRTLLM_DECODE_ATTENTION=1
  export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=1
  export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=1
elif [[ "${DEVICE_NAME}" == *rocm* ]]; then
  export VLLM_ROCM_USE_AITER=1
  export VLLM_USE_AITER_UNIFIED_ATTENTION=1
  export VLLM_ROCM_USE_AITER_MHA=0
fi

# Pick the tensor parallel size for the model. Fail fast on an unknown model
# instead of falling through and passing an invalid --tensor_parallel_size 0
if [[ "${MODEL}" == "openai/gpt-oss-120b" ]]; then
  tp=4
elif [[ "${MODEL}" == "openai/gpt-oss-20b" ]]; then
  tp=1
else
  echo "Unsupported model: ${MODEL}" >&2
  exit 1
fi

# Start the vLLM server in the background and remember its PID for cleanup
vllm serve "${MODEL}" --tensor_parallel_size "${tp}" &
server_pid=$!

wait_for_server() {
  # curl exits 0 as soon as the server answers at all (any HTTP status), so
  # this polls until the endpoint is reachable or the 20-minute timeout hits
  timeout 1200 bash -c '
    until curl -X POST localhost:8000/v1/completions; do
      sleep 1
    done'
}

if wait_for_server; then
  echo "vLLM server is up and running"
else
  # Abort instead of running evals against a server that never came up
  echo "vLLM failed to start within the timeout period" >&2
  kill "${server_pid}" || true
  exit 1
fi

pushd vllm-benchmarks/gpt-oss
mkdir -p /tmp/gpqa_openai
mkdir -p /tmp/aime25_openai

# Use half the cores for eval threads, but always at least one
n_threads=$(( $(nproc) / 2 ))
(( n_threads >= 1 )) || n_threads=1

if [[ "${DEVICE_NAME}" == "rocm" ]]; then
  # Not sure why this is needed on ROCm, but the evals module is only
  # importable from inside the gpt_oss directory there
  pushd gpt_oss
  eval_module=evals
else
  eval_module=gpt_oss.evals
fi

# Run the aime25 eval at every reasoning-effort level. OPENAI_API_KEY is
# blanked on purpose: the eval talks to the local vLLM server, not OpenAI
for effort in low medium high; do
  OPENAI_API_KEY="" python3 -m "${eval_module}" --base-url http://localhost:8000/v1 \
    --model "${MODEL}" \
    --eval aime25 \
    --reasoning-effort "${effort}" \
    --n-threads "${n_threads}"
done

if [[ "${DEVICE_NAME}" == "rocm" ]]; then
  popd
fi

# Collect the result directories where the workflow expects to find them
mv /tmp/gpqa_openai .
mv /tmp/aime25_openai .
popd

# Stop the server gracefully; ignore failure if it has already exited
kill "${server_pid}" || true
30 changes: 30 additions & 0 deletions .github/scripts/gpt-oss/run_benchmarks.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash
# Run vLLM performance benchmarks for gpt-oss models inside the CI container.
#
# Required env vars: DEVICE_TYPE, DEVICE_NAME.

set -eux

# Device-specific vLLM tuning, per the official recipe:
# https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html
if [[ "${DEVICE_TYPE}" == *B200* ]]; then
  export VLLM_USE_TRTLLM_ATTENTION=1
  export VLLM_USE_TRTLLM_DECODE_ATTENTION=1
  export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=1
  export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=1
elif [[ "${DEVICE_NAME}" == *rocm* ]]; then
  export VLLM_ROCM_USE_AITER=1
  export VLLM_USE_AITER_UNIFIED_ATTENTION=1
  export VLLM_ROCM_USE_AITER_MHA=0
fi

pushd vllm-benchmarks/vllm
# Best-effort sync of the benchmark utils into the image's vLLM install; the
# destination only exists in some images, so the failure is ignored on purpose
cp vllm/benchmarks/lib/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true

if [[ "${DEVICE_NAME}" != "rocm" ]]; then
  # The ROCm image already ships a prebuilt vLLM; elsewhere install the
  # gpt-oss preview wheel and its updated dependencies
  pip install -U openai transformers setuptools
  pip install --pre vllm==0.10.1+gptoss \
    --extra-index-url https://wheels.vllm.ai/gpt-oss/ \
    --extra-index-url https://download.pytorch.org/whl/nightly/cu128
fi

# Record the exact package set for debugging, then run the perf benchmarks
pip freeze
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
popd
244 changes: 244 additions & 0 deletions .github/workflows/gpt-oss-benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
name: gpt-oss benchmark

on:
  pull_request:
    paths:
      - .github/workflows/gpt-oss-benchmark.yml

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

jobs:
  benchmarks:
    name: Run gpt-oss benchmarks
    strategy:
      matrix:
        include:
          # gpt-oss-120b
          - runner: linux.aws.h100.8
            model: openai/gpt-oss-120b
            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
          - runner: linux.dgx.b200.8
            model: openai/gpt-oss-120b
            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
          - runner: linux.rocm.gpu.gfx942.8
            model: openai/gpt-oss-120b
            docker-image: rocm/vllm-dev:open-mi300-08052025
          # gpt-oss-20b
          - runner: linux.aws.h100.4
            model: openai/gpt-oss-20b
            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
          - runner: linux.dgx.b200.8
            model: openai/gpt-oss-20b
            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
          - runner: linux.rocm.gpu.gfx942.8
            model: openai/gpt-oss-20b
            docker-image: rocm/vllm-dev:open-mi300-08052025
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    environment: pytorch-x-vllm
    permissions:
      id-token: write
      contents: read
    # Benchmarks plus accuracy evals can take a long time; allow up to 12h
    timeout-minutes: 720
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Checkout vLLM repository
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          path: vllm-benchmarks/vllm

      - name: Checkout gpt-oss repository
        uses: actions/checkout@v4
        with:
          repository: openai/gpt-oss
          path: vllm-benchmarks/gpt-oss

      - uses: actions/setup-python@v5
        # Amazon Linux fails on this step
        continue-on-error: true
        with:
          python-version: '3.12'
          cache: 'pip'

      # Detect the accelerator type and export DEVICE_NAME for later steps
      - name: Check if the device is supported
        shell: bash
        run: |
          set -eux

          if command -v nvidia-smi; then
            DEVICE_NAME=cuda
            nvidia-smi
          elif command -v rocm-smi; then
            DEVICE_NAME=rocm
            rocm-smi
          else
            DEVICE_NAME=cpu
            lscpu
          fi
          echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV

      - name: Set GPU name and type
        working-directory: vllm-benchmarks
        shell: bash
        run: |
          set -eux

          if [[ "${DEVICE_NAME}" == "cuda" ]]; then
            DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}')
          elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
            DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
            DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
          fi
          echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV

      - name: Install dependencies
        shell: bash
        run: |
          set -eux

          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
            pip install -r .github/scripts/requirements.txt \
              --extra-index-url https://download.pytorch.org/whl/rocm6.3
          else
            pip install -r .github/scripts/requirements.txt \
              --extra-index-url https://download.pytorch.org/whl/cu128
          fi

      - name: Setup CUDA GPU_FLAG for docker run
        if: env.DEVICE_NAME == 'cuda'
        run: |
          echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"

      - name: Setup ROCm
        if: env.DEVICE_NAME == 'rocm'
        uses: pytorch/pytorch/./.github/actions/setup-rocm@main

      - name: Setup benchmark tests
        env:
          MODEL: ${{ matrix.model }}
        run: |
          set -eux

          pushd vllm-benchmarks/vllm
          rm .buildkite/nightly-benchmarks/tests/*.json
          popd

          # Set the list of benchmarks we want to cover in this runner
          python3 .github/scripts/setup_vllm_benchmark.py \
            --from-benchmark-configs-dir vllm-benchmarks/benchmarks \
            --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \
            --models "${MODEL}" \
            --device "${DEVICE_NAME}"

          pushd vllm-benchmarks/vllm
          ls -lah .buildkite/nightly-benchmarks/tests
          find .buildkite/nightly-benchmarks/tests -type f -exec cat {} \;

      - name: Run vLLM gpt-oss benchmark
        env:
          # To login to public.ecr.aws
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          DOCKER_IMAGE: ${{ matrix.docker-image }}
          # vLLM-related environment variables
          ENGINE_VERSION: v1
          SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
          MODEL: ${{ matrix.model }}
        run: |
          set -eux

          if [[ "${DEVICE_TYPE}" == *B200* ]]; then
            # B200 runners have no IAM role with ECR access, so login explicitly
            aws configure set aws_access_key_id "${AWS_ACCESS_KEY_ID}"
            aws configure set aws_secret_access_key "${AWS_SECRET_ACCESS_KEY}"
            aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
          fi

          # Leaving 1GB for the runner and other things
          TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
          # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
          # comes from https://github.com/pytorch/test-infra/pull/6058
          TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))

          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e MODEL \
            -e DEVICE_NAME \
            -e DEVICE_TYPE \
            -e HF_TOKEN \
            -e ENGINE_VERSION \
            -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
            --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
            --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
            --ipc=host \
            --tty \
            --detach \
            --security-opt seccomp=unconfined \
            --shm-size=4g \
            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
            -w /tmp/workspace \
            "${DOCKER_IMAGE}"
          )

          # Run perf tests
          docker exec -t "${container_name}" bash .github/scripts/gpt-oss/run_benchmarks.sh

          # Run accuracy checks
          docker exec -t "${container_name}" bash .github/scripts/gpt-oss/run_accuracy_checks.sh

      - name: Authenticate with AWS
        # AWS CUDA runners already have access to the bucket via its runner IAM role
        if: env.DEVICE_NAME == 'rocm' || contains(env.DEVICE_TYPE, 'B200')
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
          # The max duration enforced by the server side
          role-duration-seconds: 18000
          aws-region: us-east-1

      - name: Upload the benchmark results
        continue-on-error: true
        env:
          BENCHMARK_RESULTS: vllm-benchmarks/vllm/benchmarks/results
          MODEL: ${{ matrix.model }}
        run: |
          set -eux

          sudo chown -R ${UID} "${BENCHMARK_RESULTS}"
          ls -lah "${BENCHMARK_RESULTS}"

          # Artifact names may not contain '/', spaces, or other special chars
          SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alnum:].-]/_/g")
          SANITIZED_MODEL="${MODEL//\//_}"

          echo "SANITIZED_DEVICE_TYPE=$SANITIZED_DEVICE_TYPE" >> $GITHUB_ENV
          echo "SANITIZED_MODEL=$SANITIZED_MODEL" >> $GITHUB_ENV

          python3 .github/scripts/upload_benchmark_results.py \
            --repo vllm-benchmarks/vllm \
            --benchmark-name "vLLM benchmark" \
            --benchmark-results "${BENCHMARK_RESULTS}" \
            --device-name "${DEVICE_NAME}" \
            --device-type "${SANITIZED_DEVICE_TYPE}" \
            --model "${MODEL//\//_}"

      # Keep a copy of the benchmark results on GitHub for reference
      - uses: actions/upload-artifact@v4
        with:
          name: benchmark-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODEL }}
          path: vllm-benchmarks/vllm/benchmarks/results

      # Keep a copy of the accuracy results on GitHub for reference
      - uses: actions/upload-artifact@v4
        with:
          name: accuracy-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODEL }}
          path: |
            vllm-benchmarks/gpt-oss/gpqa_openai
            vllm-benchmarks/gpt-oss/aime25_openai
4 changes: 0 additions & 4 deletions .github/workflows/vllm-benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,6 @@ on:
required: true
type: string
default: h100,rocm,spr,b200
pull_request:
paths:
- .github/workflows/vllm-benchmark.yml
- vllm-benchmarks/**

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
Expand Down
Loading
Loading