18 changes: 18 additions & 0 deletions torchci/clickhouse_queries/vllm/ci_reliability/params.json
@@ -0,0 +1,18 @@
{
"params": {
"granularity": "String",
"repo": "String",
"pipelineName": "String",
"startTime": "DateTime64(3)",
"stopTime": "DateTime64(3)"
},
"tests": [
{
"granularity": "day",
"repo": "https://github.com/vllm-project/vllm.git",
"pipelineName": "CI",
"startTime": "2025-09-26T00:00:00.000",
"stopTime": "2025-10-03T00:00:00.000"
}
]
}
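For local testing, the values above map one-to-one onto the named parameters in query.sql. A minimal sketch of binding them in an interactive ClickHouse session (the SET param_* form is an assumption about how the test values would be supplied; it is not part of this PR):

-- Sketch: bind the params.json test values as ClickHouse query parameters
-- before running query.sql interactively.
SET param_granularity = 'day';
SET param_repo = 'https://github.com/vllm-project/vllm.git';
SET param_pipelineName = 'CI';
SET param_startTime = '2025-09-26 00:00:00.000';
SET param_stopTime = '2025-10-03 00:00:00.000';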
66 changes: 66 additions & 0 deletions torchci/clickhouse_queries/vllm/ci_reliability/query.sql
@@ -0,0 +1,66 @@
-- vLLM CI reliability metrics
-- Computes CI success rate, failure rate over time for Buildkite builds
-- Daily breakdown of build states (passed, failed, canceled)
-- Overall success rate and job-level reliability

WITH builds AS (
SELECT
tupleElement(pipeline, 'repository') AS repository,
tupleElement(pipeline, 'name') AS pipeline_name,
toUInt32(tupleElement(build, 'number')) AS build_number,
tupleElement(build, 'started_at') AS build_started_at,
tupleElement(build, 'finished_at') AS build_finished_at,
tupleElement(build, 'state') AS build_state,
formatDateTime(
DATE_TRUNC(
{granularity: String },
tupleElement(build, 'started_at')
),
'%Y-%m-%d'
) AS bucket
FROM vllm.vllm_buildkite_jobs
WHERE
tupleElement(pipeline, 'repository') = {repo: String }
AND tupleElement(pipeline, 'name') = {pipelineName: String }
AND tupleElement(build, 'started_at') IS NOT NULL
AND tupleElement(build, 'started_at') >= {startTime: DateTime64(3) }
AND tupleElement(build, 'started_at') < {stopTime: DateTime64(3) }
GROUP BY
repository,
pipeline_name,
build_number,
build_started_at,
build_finished_at,
build_state,
bucket
),

daily_stats AS (
SELECT
bucket,
countIf(lowerUTF8(build_state) IN ('passed', 'finished', 'success'))
AS passed_count,
countIf(lowerUTF8(build_state) = 'failed') AS failed_count,
Contributor: Do you want to split this up into actual failures and soft failures? Maybe we only care about the former category.

Author (@rzabarazesh, Oct 13, 2025): Correct. We mostly care about hard failures. I added a commit to be more explicit about soft failures.

countIf(lowerUTF8(build_state) IN ('canceled', 'cancelled'))
AS canceled_count,
passed_count + failed_count + canceled_count AS total_count,
passed_count + failed_count AS non_canceled_count,
if(
non_canceled_count > 0,
round(passed_count / non_canceled_count, 4),
NULL
) AS success_rate
FROM builds
GROUP BY bucket
)

SELECT
bucket AS granularity_bucket,
passed_count,
failed_count,
canceled_count,
total_count,
non_canceled_count,
success_rate
FROM daily_stats
ORDER BY granularity_bucket ASC
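As a quick sanity check of the success_rate definition above (canceled builds are excluded from the denominator), here is a toy calculation with made-up counts:

-- Toy example with assumed counts: 95 passed, 3 failed, 2 canceled builds.
-- Canceled builds drop out of the denominator, so the rate is 95 / 98.
SELECT
    95 AS passed_count,
    3 AS failed_count,
    2 AS canceled_count,
    passed_count + failed_count AS non_canceled_count,          -- 98
    round(passed_count / non_canceled_count, 4) AS success_rate; -- 0.9694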
16 changes: 16 additions & 0 deletions torchci/clickhouse_queries/vllm/ci_run_duration/params.json
@@ -0,0 +1,16 @@
{
"params": {
"repo": "String",
"pipelineName": "String",
"startTime": "DateTime64(3)",
"stopTime": "DateTime64(3)"
},
"tests": [
{
"repo": "vllm-project/vllm",
"pipelineName": "CI",
"startTime": "2025-09-26T00:00:00.000",
"stopTime": "2025-10-03T00:00:00.000"
}
]
}
32 changes: 32 additions & 0 deletions torchci/clickhouse_queries/vllm/ci_run_duration/query.sql
@@ -0,0 +1,32 @@
-- vLLM CI run durations (Buildkite builds)
-- Lists per-build durations based on build.started_at and build.finished_at

WITH b AS (
SELECT
tupleElement(pipeline, 'repository') AS repository,
tupleElement(pipeline, 'name') AS pipeline_name,
toUInt32(tupleElement(build, 'number')) AS build_number,
tupleElement(build, 'started_at') AS build_started_at,
tupleElement(build, 'finished_at') AS build_finished_at,
tupleElement(build, 'state') AS build_state
FROM vllm.vllm_buildkite_jobs
WHERE
tupleElement(pipeline, 'repository') = {repo: String }
AND tupleElement(pipeline, 'name') = {pipelineName: String }
AND tupleElement(build, 'started_at') IS NOT NULL
AND tupleElement(build, 'finished_at') IS NOT NULL
AND tupleElement(build, 'started_at') >= {startTime: DateTime64(3) }
AND tupleElement(build, 'started_at') < {stopTime: DateTime64(3) }
)

SELECT
pipeline_name,
build_number,
max(build_started_at) AS started_at,
max(build_finished_at) AS finished_at,
any(build_state) AS build_state,
dateDiff('second', started_at, finished_at) AS duration_seconds,
round(duration_seconds / 3600.0, 3) AS duration_hours
FROM b
GROUP BY pipeline_name, build_number
ORDER BY started_at ASC
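The query above returns one row per build. If a dashboard later needs an aggregate view, one possible rollup (a sketch only, not part of this PR) is daily duration percentiles over those rows; ci_run_durations below is a hypothetical name standing in for the result of query.sql:

-- Sketch: summarize the per-build rows from query.sql into daily p50/p90 durations.
-- "ci_run_durations" is a hypothetical stand-in for those results.
SELECT
    toDate(started_at) AS day,
    quantile(0.5)(duration_seconds) AS p50_seconds,
    quantile(0.9)(duration_seconds) AS p90_seconds,
    count() AS build_count
FROM ci_run_durations
GROUP BY day
ORDER BY day ASC;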
18 changes: 18 additions & 0 deletions torchci/clickhouse_queries/vllm/job_reliability/params.json
@@ -0,0 +1,18 @@
{
"params": {
"repo": "String",
"pipelineName": "String",
"startTime": "DateTime64(3)",
"stopTime": "DateTime64(3)",
"minRuns": "UInt32"
},
"tests": [
{
"repo": "https://github.com/vllm-project/vllm.git",
"pipelineName": "CI",
"startTime": "2025-09-26T00:00:00.000",
"stopTime": "2025-10-03T00:00:00.000",
"minRuns": 3
}
]
}
56 changes: 56 additions & 0 deletions torchci/clickhouse_queries/vllm/job_reliability/query.sql
@@ -0,0 +1,56 @@
-- vLLM per-job reliability metrics
-- Computes success rate for each individual job in the CI pipeline
-- Shows which jobs are most/least reliable

WITH jobs AS (
SELECT
tupleElement(pipeline, 'repository') AS repository,
tupleElement(pipeline, 'name') AS pipeline_name,
toUInt32(tupleElement(build, 'number')) AS build_number,
tupleElement(job, 'name') AS job_name,
tupleElement(job, 'state') AS job_state,
tupleElement(job, 'soft_failed') AS soft_failed,
tupleElement(job, 'finished_at') AS job_finished_at
FROM vllm.vllm_buildkite_jobs
WHERE
tupleElement(pipeline, 'repository') = {repo: String }
AND tupleElement(pipeline, 'name') = {pipelineName: String }
AND tupleElement(job, 'finished_at') IS NOT NULL
AND tupleElement(job, 'finished_at') >= {startTime: DateTime64(3) }
AND tupleElement(job, 'finished_at') < {stopTime: DateTime64(3) }
-- Exclude soft-failed jobs from reliability calculation
AND tupleElement(job, 'soft_failed') = 'false'
),

job_stats AS (
SELECT
job_name,
countIf(lowerUTF8(job_state) IN ('passed', 'finished', 'success'))
AS passed_count,
countIf(lowerUTF8(job_state) = 'failed') AS failed_count,
countIf(lowerUTF8(job_state) IN ('canceled', 'cancelled'))
AS canceled_count,
passed_count + failed_count + canceled_count AS total_count,
passed_count + failed_count AS non_canceled_count,
if(
non_canceled_count > 0,
round(passed_count / non_canceled_count, 4),
NULL
) AS success_rate
FROM jobs
GROUP BY job_name
HAVING non_canceled_count >= {minRuns: UInt32}
)

SELECT
job_name,
passed_count,
failed_count,
canceled_count,
total_count,
non_canceled_count,
success_rate
FROM job_stats
ORDER BY
success_rate ASC,
Contributor: A curious question: my understanding is that this query would return the worst job first. Why is it that on the preview all the jobs with a 100% success rate are shown first? I guess we want to focus on those that are not in a good state, and we should show unreliable jobs first, right?

Author: Sure. Changed it to show the worst jobs first.

non_canceled_count DESC
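Related to the earlier review thread on hard versus soft failures: the query above simply drops soft-failed jobs via the soft_failed = 'false' filter. If soft failures ever need to be reported separately instead of excluded, a sketch using the same job tuple fields (an assumption, not part of this PR) could look like:

-- Sketch: count hard and soft failures per job instead of excluding soft ones.
SELECT
    tupleElement(job, 'name') AS job_name,
    countIf(
        lowerUTF8(tupleElement(job, 'state')) = 'failed'
        AND tupleElement(job, 'soft_failed') = 'false'
    ) AS hard_failed_count,
    countIf(
        lowerUTF8(tupleElement(job, 'state')) = 'failed'
        AND tupleElement(job, 'soft_failed') = 'true'
    ) AS soft_failed_count
FROM vllm.vllm_buildkite_jobs
WHERE
    tupleElement(pipeline, 'repository') = {repo: String }
    AND tupleElement(pipeline, 'name') = {pipelineName: String }
    AND tupleElement(job, 'finished_at') >= {startTime: DateTime64(3) }
    AND tupleElement(job, 'finished_at') < {stopTime: DateTime64(3) }
GROUP BY job_name
ORDER BY hard_failed_count DESC;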
19 changes: 17 additions & 2 deletions torchci/clickhouse_queries/vllm/merges_percentage/query.sql
@@ -108,7 +108,7 @@ manual_merged_prs AS (
manual_merged_prs_with_failures AS (
SELECT
bucket,
count(number) AS manual_merged_with_failures_count
count(DISTINCT number) AS manual_merged_with_failures_count
FROM
merged_prs
LEFT JOIN latest_buildkite_jobs ON toString(merged_prs.number) = latest_buildkite_jobs.number
@@ -118,6 +118,19 @@ manual_merged_prs_with_failures AS (
GROUP BY
bucket
),
manual_merged_prs_pending AS (
Contributor (@huydhn, Oct 9, 2025): Let's chat more on this one, because I don't think it would work like this. My understanding is that job_state is a field that is updated as the job progresses, changing from scheduled to pending to running, then to succeeded, failed, cancelled, etc. A manual merge due to impatience means that the job is scheduled, pending, or running at the time the merge occurs, so it's a snapshot in time. However, the job information we have here is only the latest state, which means this query returns different results depending on when you run it.

If you agree with this, we could exclude this KPI and implement it later in a different PR, as I need to double-check whether that snapshot is even kept in the database instead of being overwritten. If it is indeed being overwritten, we need to think about a way to persist a snapshot of all jobs at the time of a merge. Just FYI, PyTorch keeps that in a table called merges, although I don't think we could reuse that one.

Author: You are right. Removed it for now.

SELECT
bucket,
count(DISTINCT number) AS manual_merged_pending_count
FROM
merged_prs
LEFT JOIN latest_buildkite_jobs ON toString(merged_prs.number) = latest_buildkite_jobs.number
WHERE
tupleElement(auto_merge, 'merge_method') = ''
AND job_state IN ('running', 'pending', 'scheduled')
GROUP BY
bucket
),
auto_merged_prs AS (
SELECT
bucket,
@@ -137,14 +150,16 @@ results AS (
abandon_count,
auto_merged_count,
manual_merged_count,
manual_merged_with_failures_count
manual_merged_with_failures_count,
manual_merged_pending_count
FROM
total_prs
LEFT JOIN open_prs ON total_prs.bucket = open_prs.bucket
LEFT JOIN abandon_prs ON total_prs.bucket = abandon_prs.bucket
LEFT JOIN auto_merged_prs ON total_prs.bucket = auto_merged_prs.bucket
LEFT JOIN manual_merged_prs ON total_prs.bucket = manual_merged_prs.bucket
LEFT JOIN manual_merged_prs_with_failures ON total_prs.bucket = manual_merged_prs_with_failures.bucket
LEFT JOIN manual_merged_prs_pending ON total_prs.bucket = manual_merged_prs_pending.bucket
)
SELECT
*
@@ -0,0 +1,14 @@
{
"params": {
"repo": "String",
"startTime": "DateTime64(3)",
"stopTime": "DateTime64(3)"
},
"tests": [
{
"repo": "vllm-project/vllm",
"startTime": "2025-09-22T00:00:00.000",
"stopTime": "2025-09-29T00:00:00.000"
}
]
}