pytorch · rzabarazesh · Oct 3, 2025 · Oct 6, 2025 · Oct 6, 2025 · Oct 7, 2025
diff --git a/torchci/clickhouse_queries/vllm/ci_reliability/params.json b/torchci/clickhouse_queries/vllm/ci_reliability/params.json
@@ -0,0 +1,18 @@
+{
+  "params": {
+    "granularity": "String",
+    "repo": "String",
+    "pipelineName": "String",
+    "startTime": "DateTime64(3)",
+    "stopTime": "DateTime64(3)"
+  },
+  "tests": [
+    {
+      "granularity": "day",
+      "repo": "https://github.com/vllm-project/vllm.git",
+      "pipelineName": "CI",
+      "startTime": "2025-09-26T00:00:00.000",
+      "stopTime": "2025-10-03T00:00:00.000"
+    }
+  ]
+}
diff --git a/torchci/clickhouse_queries/vllm/ci_reliability/query.sql b/torchci/clickhouse_queries/vllm/ci_reliability/query.sql
@@ -0,0 +1,82 @@
+-- vLLM CI reliability metrics (Buildkite builds)
+--
+-- Returns one row per time bucket with the number of passed, failed and
+-- canceled builds and the success rate passed / (passed + failed).
+-- Canceled builds are reported but left out of the success rate.
+--
+-- Parameters:
+--   granularity   DATE_TRUNC unit applied to build.started_at
+--   repo          compared with pipeline.repository
+--   pipelineName  compared with pipeline.name
+--   startTime     inclusive lower bound on build.started_at
+--   stopTime      exclusive upper bound on build.started_at
+
+WITH builds AS (
+    -- One row per build. The source table is read as if it can hold several
+    -- rows per build whose build.state / build.finished_at differ
+    -- (NOTE(review): presumably one snapshot per job -- confirm). Grouping
+    -- by build identity only keeps such a build from being counted once
+    -- per distinct snapshot.
+    SELECT
+        tupleElement(pipeline, 'repository') AS repository,
+        tupleElement(pipeline, 'name') AS pipeline_name,
+        toUInt32(tupleElement(build, 'number')) AS build_number,
+        max(tupleElement(build, 'started_at')) AS build_started_at,
+        -- State of the latest snapshot: order by finished_at and fall back
+        -- to started_at (non-NULL per the WHERE clause) when it is missing.
+        argMax(
+            tupleElement(build, 'state'),
+            coalesce(
+                tupleElement(build, 'finished_at'),
+                tupleElement(build, 'started_at')
+            )
+        ) AS build_state,
+        formatDateTime(
+            DATE_TRUNC({granularity: String }, build_started_at),
+            '%Y-%m-%d'
+        ) AS bucket
+    FROM vllm.vllm_buildkite_jobs
+    WHERE
+        tupleElement(pipeline, 'repository') = {repo: String }
+        AND tupleElement(pipeline, 'name') = {pipelineName: String }
+        AND tupleElement(build, 'started_at') IS NOT NULL
+        AND tupleElement(build, 'started_at') >= {startTime: DateTime64(3) }
+        AND tupleElement(build, 'started_at') < {stopTime: DateTime64(3) }
+    GROUP BY
+        repository,
+        pipeline_name,
+        build_number
+),
+
+daily_stats AS (
+    -- Per-bucket build counts. States outside the three lists below are not
+    -- counted in any column.
+    SELECT
+        bucket,
+        countIf(lowerUTF8(build_state) IN ('passed', 'finished', 'success'))
+            AS passed_count,
+        countIf(lowerUTF8(build_state) = 'failed') AS failed_count,
+        countIf(lowerUTF8(build_state) IN ('canceled', 'cancelled'))
+            AS canceled_count,
+        passed_count + failed_count + canceled_count AS total_count,
+        passed_count + failed_count AS non_canceled_count,
+        -- NULL rather than 0 when a bucket has no passed or failed builds.
+        if(
+            non_canceled_count > 0,
+            round(passed_count / non_canceled_count, 4),
+            NULL
+        ) AS success_rate
+    FROM builds
+    GROUP BY bucket
+)
+
+SELECT
+    bucket AS granularity_bucket,
+    passed_count,
+    failed_count,
+    canceled_count,
+    total_count,
+    non_canceled_count,
+    success_rate
+FROM daily_stats
+ORDER BY granularity_bucket ASC
diff --git a/torchci/clickhouse_queries/vllm/ci_run_duration/params.json b/torchci/clickhouse_queries/vllm/ci_run_duration/params.json
@@ -0,0 +1,16 @@
+{
+  "params": {
+    "repo": "String",
+    "pipelineName": "String",
+    "startTime": "DateTime64(3)",
+    "stopTime": "DateTime64(3)"
+  },
+  "tests": [
+    {
+      "repo": "https://github.com/vllm-project/vllm.git",
+      "pipelineName": "CI",
+      "startTime": "2025-09-26T00:00:00.000",
+      "stopTime": "2025-10-03T00:00:00.000"
+    }
+  ]
+}
diff --git a/torchci/clickhouse_queries/vllm/ci_run_duration/query.sql b/torchci/clickhouse_queries/vllm/ci_run_duration/query.sql
@@ -0,0 +1,39 @@
+-- vLLM CI run durations (Buildkite builds)
+--
+-- Returns one row per (pipeline_name, build_number) whose build.started_at
+-- lies in [startTime, stopTime) and which has a build.finished_at, with the
+-- duration between the latest started_at and the latest finished_at.
+
+WITH build_rows AS (
+    -- Raw rows for the requested repository and pipeline. The outer query
+    -- collapses rows that share a pipeline name and build number.
+    SELECT
+        tupleElement(pipeline, 'name') AS pipeline_name,
+        toUInt32(tupleElement(build, 'number')) AS build_number,
+        tupleElement(build, 'started_at') AS row_started_at,
+        tupleElement(build, 'finished_at') AS row_finished_at,
+        tupleElement(build, 'state') AS row_state
+    FROM vllm.vllm_buildkite_jobs
+    WHERE
+        tupleElement(pipeline, 'repository') = {repo: String }
+        AND tupleElement(pipeline, 'name') = {pipelineName: String }
+        AND tupleElement(build, 'started_at') IS NOT NULL
+        AND tupleElement(build, 'finished_at') IS NOT NULL
+        AND tupleElement(build, 'started_at') >= {startTime: DateTime64(3) }
+        AND tupleElement(build, 'started_at') < {stopTime: DateTime64(3) }
+)
+
+SELECT
+    pipeline_name,
+    build_number,
+    max(row_started_at) AS started_at,
+    max(row_finished_at) AS finished_at,
+    -- Take the state from the row that supplies finished_at. any() would
+    -- pick an arbitrary row, so the reported state could belong to an older
+    -- snapshot and change between runs of the same query.
+    argMax(row_state, row_finished_at) AS build_state,
+    dateDiff('second', started_at, finished_at) AS duration_seconds,
+    round(duration_seconds / 3600.0, 3) AS duration_hours
+FROM build_rows
+GROUP BY pipeline_name, build_number
+ORDER BY started_at ASC
diff --git a/torchci/clickhouse_queries/vllm/job_reliability/params.json b/torchci/clickhouse_queries/vllm/job_reliability/params.json
@@ -0,0 +1,18 @@
+{
+  "params": {
+    "repo": "String",
+    "pipelineName": "String",
+    "startTime": "DateTime64(3)",
+    "stopTime": "DateTime64(3)",
+    "minRuns": "UInt32"
+  },
+  "tests": [
+    {
+      "repo": "https://github.com/vllm-project/vllm.git",
+      "pipelineName": "CI",
+      "startTime": "2025-09-26T00:00:00.000",
+      "stopTime": "2025-10-03T00:00:00.000",
+      "minRuns": 3
+    }
+  ]
+}
diff --git a/torchci/clickhouse_queries/vllm/job_reliability/query.sql b/torchci/clickhouse_queries/vllm/job_reliability/query.sql
@@ -0,0 +1,53 @@
+-- vLLM per-job reliability (Buildkite jobs)
+--
+-- For every job name, counts passed / failed / canceled runs that finished in
+-- [startTime, stopTime) and reports success_rate = passed / (passed + failed).
+-- Job names with fewer than minRuns passed-or-failed runs are dropped, and the
+-- lowest success rates are listed first.
+
+WITH finished_jobs AS (
+    SELECT
+        tupleElement(job, 'name') AS job_name,
+        lowerUTF8(tupleElement(job, 'state')) AS job_state_lc
+    FROM vllm.vllm_buildkite_jobs
+    WHERE
+        -- Soft-failed jobs are left out of the reliability calculation
+        tupleElement(job, 'soft_failed') = 'false'
+        AND tupleElement(pipeline, 'repository') = {repo: String }
+        AND tupleElement(pipeline, 'name') = {pipelineName: String }
+        AND tupleElement(job, 'finished_at') IS NOT NULL
+        AND tupleElement(job, 'finished_at') >= {startTime: DateTime64(3) }
+        AND tupleElement(job, 'finished_at') < {stopTime: DateTime64(3) }
+),
+
+job_counts AS (
+    -- Raw per-job tallies; the derived columns are added by the final SELECT
+    SELECT
+        job_name,
+        countIf(job_state_lc IN ('passed', 'finished', 'success'))
+            AS passed_count,
+        countIf(job_state_lc = 'failed') AS failed_count,
+        countIf(job_state_lc IN ('canceled', 'cancelled'))
+            AS canceled_count
+    FROM finished_jobs
+    GROUP BY job_name
+)
+
+SELECT
+    job_name,
+    passed_count,
+    failed_count,
+    canceled_count,
+    passed_count + failed_count + canceled_count AS total_count,
+    passed_count + failed_count AS non_canceled_count,
+    if(
+        non_canceled_count > 0,
+        round(passed_count / non_canceled_count, 4),
+        NULL
+    ) AS success_rate
+FROM job_counts
+-- Same cut-off as filtering on non_canceled_count, spelled out in full
+WHERE passed_count + failed_count >= {minRuns: UInt32}
+ORDER BY
+    success_rate ASC,
+    non_canceled_count DESC
diff --git a/torchci/clickhouse_queries/vllm/merges_percentage/query.sql b/torchci/clickhouse_queries/vllm/merges_percentage/query.sql
@@ -108,7 +108,7 @@ manual_merged_prs AS (
 manual_merged_prs_with_failures AS (
     SELECT
         bucket,
-        count(number) AS manual_merged_with_failures_count
+        count(DISTINCT number) AS manual_merged_with_failures_count
     FROM
         merged_prs
         LEFT JOIN latest_buildkite_jobs ON toString(merged_prs.number) = latest_buildkite_jobs.number
@@ -118,6 +118,19 @@ manual_merged_prs_with_failures AS (
     GROUP BY
         bucket
 ),
+-- Per bucket: distinct PRs with an empty merge_method and a job still in flight
+manual_merged_prs_pending AS (
+    SELECT
+        bucket,
+        count(DISTINCT number) AS manual_merged_pending_count
+    FROM merged_prs
+    LEFT JOIN latest_buildkite_jobs
+        ON toString(merged_prs.number) = latest_buildkite_jobs.number
+    WHERE
+        job_state IN ('running', 'pending', 'scheduled')
+        AND tupleElement(auto_merge, 'merge_method') = ''
+    GROUP BY bucket
+),
 auto_merged_prs AS (
     SELECT
         bucket,
@@ -137,14 +150,16 @@ results AS (
         abandon_count,
         auto_merged_count,
         manual_merged_count,
-        manual_merged_with_failures_count
+        manual_merged_with_failures_count,
+        manual_merged_pending_count
     FROM
         total_prs
         LEFT JOIN open_prs ON total_prs.bucket = open_prs.bucket
         LEFT JOIN abandon_prs ON total_prs.bucket = abandon_prs.bucket
         LEFT JOIN auto_merged_prs ON total_prs.bucket = auto_merged_prs.bucket
         LEFT JOIN manual_merged_prs ON total_prs.bucket = manual_merged_prs.bucket
         LEFT JOIN manual_merged_prs_with_failures ON total_prs.bucket = manual_merged_prs_with_failures.bucket
+        LEFT JOIN manual_merged_prs_pending ON total_prs.bucket = manual_merged_prs_pending.bucket
 )
 SELECT
     *

diff --git a/torchci/clickhouse_queries/vllm/pr_cycle_time_breakdown/params.json b/torchci/clickhouse_queries/vllm/pr_cycle_time_breakdown/params.json
@@ -0,0 +1,14 @@
+{
+  "params": {
+    "repo": "String",
+    "startTime": "DateTime64(3)",
+    "stopTime": "DateTime64(3)"
+  },
+  "tests": [
+    {
+      "repo": "vllm-project/vllm",
+      "startTime": "2025-09-22T00:00:00.000",
+      "stopTime": "2025-09-29T00:00:00.000"
+    }
+  ]
+}