-
Notifications
You must be signed in to change notification settings - Fork 105
vllm - Add initial set of metrics #7285
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
{ | ||
"params": { | ||
"granularity": "String", | ||
"repo": "String", | ||
"pipelineName": "String", | ||
"startTime": "DateTime64(3)", | ||
"stopTime": "DateTime64(3)" | ||
}, | ||
"tests": [ | ||
{ | ||
"granularity": "day", | ||
"repo": "https://github.com/vllm-project/vllm.git", | ||
"pipelineName": "CI", | ||
"startTime": "2025-09-26T00:00:00.000", | ||
"stopTime": "2025-10-03T00:00:00.000" | ||
} | ||
] | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
-- vLLM CI reliability metrics | ||
-- Computes CI success rate, failure rate over time for Buildkite builds | ||
-- Daily breakdown of build states (passed, failed, canceled) | ||
-- Overall success rate and job-level reliability | ||
|
||
WITH builds AS ( | ||
SELECT | ||
tupleElement(pipeline, 'repository') AS repository, | ||
tupleElement(pipeline, 'name') AS pipeline_name, | ||
toUInt32(tupleElement(build, 'number')) AS build_number, | ||
tupleElement(build, 'started_at') AS build_started_at, | ||
tupleElement(build, 'finished_at') AS build_finished_at, | ||
tupleElement(build, 'state') AS build_state, | ||
formatDateTime( | ||
DATE_TRUNC( | ||
{granularity: String }, | ||
tupleElement(build, 'started_at') | ||
), | ||
'%Y-%m-%d' | ||
) AS bucket | ||
FROM vllm.vllm_buildkite_jobs | ||
WHERE | ||
tupleElement(pipeline, 'repository') = {repo: String } | ||
AND tupleElement(pipeline, 'name') = {pipelineName: String } | ||
AND tupleElement(build, 'started_at') IS NOT NULL | ||
AND tupleElement(build, 'started_at') >= {startTime: DateTime64(3) } | ||
AND tupleElement(build, 'started_at') < {stopTime: DateTime64(3) } | ||
GROUP BY | ||
repository, | ||
pipeline_name, | ||
build_number, | ||
build_started_at, | ||
build_finished_at, | ||
build_state, | ||
bucket | ||
), | ||
|
||
daily_stats AS ( | ||
SELECT | ||
bucket, | ||
countIf(lowerUTF8(build_state) IN ('passed', 'finished', 'success')) | ||
AS passed_count, | ||
countIf(lowerUTF8(build_state) = 'failed') AS failed_count, | ||
countIf(lowerUTF8(build_state) IN ('canceled', 'cancelled')) | ||
AS canceled_count, | ||
passed_count + failed_count + canceled_count AS total_count, | ||
passed_count + failed_count AS non_canceled_count, | ||
if( | ||
non_canceled_count > 0, | ||
round(passed_count / non_canceled_count, 4), | ||
NULL | ||
) AS success_rate | ||
FROM builds | ||
GROUP BY bucket | ||
) | ||
|
||
SELECT | ||
bucket AS granularity_bucket, | ||
passed_count, | ||
failed_count, | ||
canceled_count, | ||
total_count, | ||
non_canceled_count, | ||
success_rate | ||
FROM daily_stats | ||
ORDER BY granularity_bucket ASC |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
{ | ||
"params": { | ||
"repo": "String", | ||
"pipelineName": "String", | ||
"startTime": "DateTime64(3)", | ||
"stopTime": "DateTime64(3)" | ||
}, | ||
"tests": [ | ||
{ | ||
"repo": "vllm-project/vllm", | ||
"pipelineName": "CI", | ||
"startTime": "2025-09-26T00:00:00.000", | ||
"stopTime": "2025-10-03T00:00:00.000" | ||
} | ||
] | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
-- vLLM CI run durations (Buildkite builds) | ||
-- Lists per-build durations based on build.started_at and build.finished_at | ||
|
||
WITH b AS ( | ||
SELECT | ||
tupleElement(pipeline, 'repository') AS repository, | ||
tupleElement(pipeline, 'name') AS pipeline_name, | ||
toUInt32(tupleElement(build, 'number')) AS build_number, | ||
tupleElement(build, 'started_at') AS build_started_at, | ||
tupleElement(build, 'finished_at') AS build_finished_at, | ||
tupleElement(build, 'state') AS build_state | ||
FROM vllm.vllm_buildkite_jobs | ||
WHERE | ||
tupleElement(pipeline, 'repository') = {repo: String } | ||
AND tupleElement(pipeline, 'name') = {pipelineName: String } | ||
AND tupleElement(build, 'started_at') IS NOT NULL | ||
AND tupleElement(build, 'finished_at') IS NOT NULL | ||
AND tupleElement(build, 'started_at') >= {startTime: DateTime64(3) } | ||
AND tupleElement(build, 'started_at') < {stopTime: DateTime64(3) } | ||
) | ||
|
||
SELECT | ||
pipeline_name, | ||
build_number, | ||
max(build_started_at) AS started_at, | ||
max(build_finished_at) AS finished_at, | ||
any(build_state) AS build_state, | ||
dateDiff('second', started_at, finished_at) AS duration_seconds, | ||
round(duration_seconds / 3600.0, 3) AS duration_hours | ||
FROM b | ||
GROUP BY pipeline_name, build_number | ||
ORDER BY started_at ASC |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
{ | ||
"params": { | ||
"repo": "String", | ||
"pipelineName": "String", | ||
"startTime": "DateTime64(3)", | ||
"stopTime": "DateTime64(3)", | ||
"minRuns": "UInt32" | ||
}, | ||
"tests": [ | ||
{ | ||
"repo": "https://github.com/vllm-project/vllm.git", | ||
"pipelineName": "CI", | ||
"startTime": "2025-09-26T00:00:00.000", | ||
"stopTime": "2025-10-03T00:00:00.000", | ||
"minRuns": 3 | ||
} | ||
] | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
-- vLLM per-job reliability metrics | ||
-- Computes success rate for each individual job in the CI pipeline | ||
-- Shows which jobs are most/least reliable | ||
|
||
WITH jobs AS ( | ||
SELECT | ||
tupleElement(pipeline, 'repository') AS repository, | ||
tupleElement(pipeline, 'name') AS pipeline_name, | ||
toUInt32(tupleElement(build, 'number')) AS build_number, | ||
tupleElement(job, 'name') AS job_name, | ||
tupleElement(job, 'state') AS job_state, | ||
tupleElement(job, 'soft_failed') AS soft_failed, | ||
tupleElement(job, 'finished_at') AS job_finished_at | ||
FROM vllm.vllm_buildkite_jobs | ||
WHERE | ||
tupleElement(pipeline, 'repository') = {repo: String } | ||
AND tupleElement(pipeline, 'name') = {pipelineName: String } | ||
AND tupleElement(job, 'finished_at') IS NOT NULL | ||
AND tupleElement(job, 'finished_at') >= {startTime: DateTime64(3) } | ||
AND tupleElement(job, 'finished_at') < {stopTime: DateTime64(3) } | ||
-- Exclude soft-failed jobs from reliability calculation | ||
AND tupleElement(job, 'soft_failed') = 'false' | ||
), | ||
|
||
job_stats AS ( | ||
SELECT | ||
job_name, | ||
countIf(lowerUTF8(job_state) IN ('passed', 'finished', 'success')) | ||
AS passed_count, | ||
countIf(lowerUTF8(job_state) = 'failed') AS failed_count, | ||
countIf(lowerUTF8(job_state) IN ('canceled', 'cancelled')) | ||
AS canceled_count, | ||
passed_count + failed_count + canceled_count AS total_count, | ||
passed_count + failed_count AS non_canceled_count, | ||
if( | ||
non_canceled_count > 0, | ||
round(passed_count / non_canceled_count, 4), | ||
NULL | ||
) AS success_rate | ||
FROM jobs | ||
GROUP BY job_name | ||
HAVING non_canceled_count >= {minRuns: UInt32} | ||
) | ||
|
||
SELECT | ||
job_name, | ||
passed_count, | ||
failed_count, | ||
canceled_count, | ||
total_count, | ||
non_canceled_count, | ||
success_rate | ||
FROM job_stats | ||
ORDER BY | ||
success_rate ASC, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A curious q: My understand is that this query would return the worst job first. Why is this on the preview all the jobs with 100% success rate are show first? I guess we want to focus on those that are not in a good state and we should show unreliable jobs first, right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure. Changed it to show the worst jobs first. |
||
non_canceled_count DESC |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -108,7 +108,7 @@ manual_merged_prs AS ( | |
manual_merged_prs_with_failures AS ( | ||
SELECT | ||
bucket, | ||
count(number) AS manual_merged_with_failures_count | ||
count(DISTINCT number) AS manual_merged_with_failures_count | ||
FROM | ||
merged_prs | ||
LEFT JOIN latest_buildkite_jobs ON toString(merged_prs.number) = latest_buildkite_jobs.number | ||
|
@@ -118,6 +118,19 @@ manual_merged_prs_with_failures AS ( | |
GROUP BY | ||
bucket | ||
), | ||
manual_merged_prs_pending AS ( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's chat more on this one because I don't think it would work like this. My understanding is that If you are agree with this, we could exclude this KPI to implement it later in a different PR as I need to double check if the above snapshot is even kept in the database instead of being overwritten. If it's indeed being overwritten, we need to think about a way to persist the snapshot of all jobs at the time of a merge. Just FYI, PyTorch keeps that in a table call There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You are right. Removed it for now |
||
SELECT | ||
bucket, | ||
count(DISTINCT number) AS manual_merged_pending_count | ||
FROM | ||
merged_prs | ||
LEFT JOIN latest_buildkite_jobs ON toString(merged_prs.number) = latest_buildkite_jobs.number | ||
WHERE | ||
tupleElement(auto_merge, 'merge_method') = '' | ||
AND job_state IN ('running', 'pending', 'scheduled') | ||
GROUP BY | ||
bucket | ||
), | ||
auto_merged_prs AS ( | ||
SELECT | ||
bucket, | ||
|
@@ -137,14 +150,16 @@ results AS ( | |
abandon_count, | ||
auto_merged_count, | ||
manual_merged_count, | ||
manual_merged_with_failures_count | ||
manual_merged_with_failures_count, | ||
manual_merged_pending_count | ||
FROM | ||
total_prs | ||
LEFT JOIN open_prs ON total_prs.bucket = open_prs.bucket | ||
LEFT JOIN abandon_prs ON total_prs.bucket = abandon_prs.bucket | ||
LEFT JOIN auto_merged_prs ON total_prs.bucket = auto_merged_prs.bucket | ||
LEFT JOIN manual_merged_prs ON total_prs.bucket = manual_merged_prs.bucket | ||
LEFT JOIN manual_merged_prs_with_failures ON total_prs.bucket = manual_merged_prs_with_failures.bucket | ||
LEFT JOIN manual_merged_prs_pending ON total_prs.bucket = manual_merged_prs_pending.bucket | ||
) | ||
SELECT | ||
* | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
{ | ||
"params": { | ||
"repo": "String", | ||
"startTime": "DateTime64(3)", | ||
"stopTime": "DateTime64(3)" | ||
}, | ||
"tests": [ | ||
{ | ||
"repo": "vllm-project/vllm", | ||
"startTime": "2025-09-22T00:00:00.000", | ||
"stopTime": "2025-09-29T00:00:00.000" | ||
} | ||
] | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you want to split this up into actual failures and soft failed? Maybe we only care about the former category
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Correct. We mostly care about hard failures. I added a commit to be more explicit about soft-failures