From 62bca1b37b624e37c7152e318190a27d0b5bf1d9 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Sun, 31 Aug 2025 11:15:09 -0700 Subject: [PATCH 01/27] addid --- .../params.json | 17 +++ .../query.sql | 110 ++++++++++++++++++ .../components/metrics/panels/TablePanel.tsx | 1 + .../benchmark/compiler_benmark_time_series.ts | 40 +++++++ torchci/pages/api/benchmark/group_data.ts | 1 + 5 files changed, 169 insertions(+) create mode 100644 torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json create mode 100644 torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql create mode 100644 torchci/pages/api/benchmark/compiler_benmark_time_series.ts diff --git a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json new file mode 100644 index 0000000000..95f00e1501 --- /dev/null +++ b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json @@ -0,0 +1,17 @@ +{ + "params": { + "branches": "Array(String)", + "commits": "Array(String)", + "compilers": "Array(String)", + "device": "String", + "arch": "String", + "dtype": "String", + "granularity": "String", + "mode": "String", + "startTime": "DateTime64(3)", + "stopTime": "DateTime64(3)", + "suites": "Array(String)", + "workflowId": "Int64" + }, + "tests": [] +} diff --git a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql new file mode 100644 index 0000000000..7ef1efb232 --- /dev/null +++ b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql @@ -0,0 +1,110 @@ +-- This query is used to get the PT2 benchmark results from different experiments +-- to powers the TorchInductor benchmark dashboard +WITH benchmarks AS ( + SELECT + workflow_id, + job_id, + suite, + model_name, + metric_name, + value, + metric_extra_info AS extra_info, + DATE_TRUNC( + {granularity: String }, + fromUnixTimestamp(timestamp) + ) AS granularity_bucket, + -- Filters + benchmark_dtype, + benchmark_mode, + device, + arch, + replaceOne(head_branch, 'refs/heads/', '') AS head_branch, + benchmark_extra_info['output'] AS output, + REGEXP_REPLACE( + output, + CONCAT( + '_', + suite, + '_', + { dtype: String }, + '_', + {mode: String }, + '_', + {device: String }, + '_.*' + ), + '' + ) AS temp + FROM + benchmark.oss_ci_benchmark_torchinductor + WHERE + timestamp >= toUnixTimestamp({startTime: DateTime64(3) }) + AND timestamp < toUnixTimestamp({stopTime: DateTime64(3) }) + AND ( + has({commits: Array(String) }, head_sha) + OR empty({commits: Array(String) }) + ) + AND ( + has({suites: Array(String) }, suite) + OR empty({suites: Array(String) }) + ) + AND ( + workflow_id = {workflowId: Int64} + OR {workflowId: Int64} = 0 + ) +) + +SELECT + workflow_id, + job_id, + REGEXP_REPLACE(temp, '.*/', '') AS backend, + suite, + model_name AS model, + metric_name AS metric, + value, + extra_info, + output, + granularity_bucket +FROM + benchmarks +WHERE + ( + has({branches: Array(String) }, head_branch) + OR empty({branches: Array(String) }) + ) + -- TODO (huydhn): Clean up the output field and how it's used in the query + -- in 6 months + AND ( + ( + ({arch: String } = '' OR {arch: String } = 'a100') + AND output LIKE CONCAT( + '%\_', + {dtype: String }, + '\_', + {mode: String }, + '\_', + {device: String }, + '\_%' + ) + ) + OR ( + {arch: String } != '' + AND output LIKE CONCAT( + '%\_', + {dtype: String }, + '\_', + {mode: String }, + '\_', + 
{device: String }, + '\_', + {arch: String }, + '\_%' + ) + ) + OR ( + benchmark_dtype = {dtype: String } + AND benchmark_mode = {mode: String } + AND device = {device: String } + AND arch = {arch: String } + ) + ) diff --git a/torchci/components/metrics/panels/TablePanel.tsx b/torchci/components/metrics/panels/TablePanel.tsx index d53d0ab8d6..023223d238 100644 --- a/torchci/components/metrics/panels/TablePanel.tsx +++ b/torchci/components/metrics/panels/TablePanel.tsx @@ -7,6 +7,7 @@ import useSWR from "swr"; const fetcher = (url: string) => fetch(url).then((res) => res.json()); + export default function TablePanel({ // Human-readable title for this panel. title, diff --git a/torchci/pages/api/benchmark/compiler_benmark_time_series.ts b/torchci/pages/api/benchmark/compiler_benmark_time_series.ts new file mode 100644 index 0000000000..4e8254766d --- /dev/null +++ b/torchci/pages/api/benchmark/compiler_benmark_time_series.ts @@ -0,0 +1,40 @@ + +import { queryClickhouseSaved } from "lib/clickhouse"; +import type { NextApiRequest, NextApiResponse } from "next"; + +const DEFAULT_TABLE_GROUP = [ + "device", + "backend", + "model", + "dtype", + "backend", + "arch", +]; +const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance_v2"; + +type QueryParams = { + startTime: string; // ISO timestamp + stopTime: string; // ISO timestamp + [k: string]: any; // other parameters +}; + + +export default async function handler( + req: NextApiRequest, + res: NextApiResponse +) { + + console.log("compiler_benmark_time_series."); + const inputparams = JSON.parse(req.query.parameters as string) + console.log("inputs", inputparams); + + const start = Date.now(); + const rows = await queryClickhouseSaved(BENCNMARK_TABLE_NAME, inputparams); + const end = Date.now(); + console.log(`Process took ${end - start}ms`); + + console.log("merged rows:", rows.length); + res.status(200).json({ data: rows }); +} + + diff --git a/torchci/pages/api/benchmark/group_data.ts b/torchci/pages/api/benchmark/group_data.ts index 713ed480d9..d8703d8a8b 100644 --- a/torchci/pages/api/benchmark/group_data.ts +++ b/torchci/pages/api/benchmark/group_data.ts @@ -32,6 +32,7 @@ export default async function handler( details: formatZodError(request.error), }); } + const qp = request.data; const groupTableByFields = qp.group_table_by_fields || deepClone(DEFAULT_TABLE_GROUP); From 6a5f5006a29069d2a6cbec50df2e2dfa43a5ef48 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Sun, 31 Aug 2025 18:12:51 -0700 Subject: [PATCH 02/27] addid --- .../query.sql | 155 +++++--------- .../components/metrics/panels/TablePanel.tsx | 2 +- torchci/lib/benchmark/compilerUtils.ts | 48 +++++ .../benchmark/compiler_benmark_time_series.ts | 40 ---- .../pages/api/benchmark/get_time_series.ts | 202 ++++++++++++++++++ torchci/pages/api/benchmark/group_data.ts | 2 +- 6 files changed, 305 insertions(+), 144 deletions(-) delete mode 100644 torchci/pages/api/benchmark/compiler_benmark_time_series.ts diff --git a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql index 7ef1efb232..bf233f6ff8 100644 --- a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql +++ b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql @@ -1,110 +1,61 @@ --- This query is used to get the PT2 benchmark results from different experiments --- to powers the TorchInductor benchmark dashboard WITH benchmarks AS ( - SELECT - workflow_id, - job_id, - suite, - model_name, - 
metric_name, - value, - metric_extra_info AS extra_info, - DATE_TRUNC( - {granularity: String }, - fromUnixTimestamp(timestamp) - ) AS granularity_bucket, - -- Filters - benchmark_dtype, - benchmark_mode, - device, - arch, - replaceOne(head_branch, 'refs/heads/', '') AS head_branch, - benchmark_extra_info['output'] AS output, - REGEXP_REPLACE( - output, - CONCAT( - '_', - suite, - '_', - { dtype: String }, - '_', - {mode: String }, - '_', - {device: String }, - '_.*' - ), - '' - ) AS temp - FROM - benchmark.oss_ci_benchmark_torchinductor - WHERE - timestamp >= toUnixTimestamp({startTime: DateTime64(3) }) - AND timestamp < toUnixTimestamp({stopTime: DateTime64(3) }) - AND ( - has({commits: Array(String) }, head_sha) - OR empty({commits: Array(String) }) - ) - AND ( - has({suites: Array(String) }, suite) - OR empty({suites: Array(String) }) - ) - AND ( - workflow_id = {workflowId: Int64} - OR {workflowId: Int64} = 0 - ) -) - -SELECT + SELECT workflow_id, job_id, - REGEXP_REPLACE(temp, '.*/', '') AS backend, suite, - model_name AS model, - metric_name AS metric, + model_name, + metric_name, value, - extra_info, - output, - granularity_bucket -FROM - benchmarks + metric_extra_info AS extra_info, + DATE_TRUNC({granularity:String}, fromUnixTimestamp(timestamp)) AS granularity_bucket, + benchmark_dtype, + benchmark_mode, + device, + arch, + replaceOne(head_branch, 'refs/heads/', '') AS head_branch, + + benchmark_extra_info['output'] AS output, + + REGEXP_REPLACE( + benchmark_extra_info['output'], + CONCAT('_', suite, '_', {dtype:String}, '_', {mode:String}, '_', {device:String}, '_.*'), + '' + ) AS temp + + FROM benchmark.oss_ci_benchmark_torchinductor + WHERE + timestamp >= toUnixTimestamp({startTime:DateTime64(3)}) AND + timestamp < toUnixTimestamp({stopTime:DateTime64(3)}) AND + (has({commits:Array(String)}, head_sha) OR empty({commits:Array(String)})) AND + (has({suites:Array(String)}, suite) OR empty({suites:Array(String)})) AND + (workflow_id = {workflowId:Int64} OR {workflowId:Int64} = 0) +) + +SELECT + workflow_id, + job_id, + REGEXP_REPLACE(temp, '.*/', '') AS backend, + suite, + model_name AS model, + metric_name AS metric, + value, + output, + granularity_bucket, + extra_info, +FROM benchmarks WHERE + (has({branches:Array(String)}, head_branch) OR empty({branches:Array(String)})) + AND ( ( - has({branches: Array(String) }, head_branch) - OR empty({branches: Array(String) }) - ) - -- TODO (huydhn): Clean up the output field and how it's used in the query - -- in 6 months - AND ( - ( - ({arch: String } = '' OR {arch: String } = 'a100') - AND output LIKE CONCAT( - '%\_', - {dtype: String }, - '\_', - {mode: String }, - '\_', - {device: String }, - '\_%' - ) - ) - OR ( - {arch: String } != '' - AND output LIKE CONCAT( - '%\_', - {dtype: String }, - '\_', - {mode: String }, - '\_', - {device: String }, - '\_', - {arch: String }, - '\_%' - ) - ) - OR ( - benchmark_dtype = {dtype: String } - AND benchmark_mode = {mode: String } - AND device = {device: String } - AND arch = {arch: String } - ) + ({arch:String} = '' OR {arch:String} = 'a100') AND + output LIKE CONCAT('%\_', {dtype:String}, '\_', {mode:String}, '\_', {device:String}, '\_%') + ) OR ( + {arch:String} != '' AND + output LIKE CONCAT('%\_', {dtype:String}, '\_', {mode:String}, '\_', {device:String}, '\_', {arch:String}, '\_%') + ) OR ( + benchmark_dtype = {dtype:String} AND + benchmark_mode = {mode:String} AND + device = {device:String} AND + arch = {arch:String} ) + ); diff --git a/torchci/components/metrics/panels/TablePanel.tsx 
b/torchci/components/metrics/panels/TablePanel.tsx index 023223d238..c53b29f4e3 100644 --- a/torchci/components/metrics/panels/TablePanel.tsx +++ b/torchci/components/metrics/panels/TablePanel.tsx @@ -1,13 +1,13 @@ import HelpIcon from "@mui/icons-material/Help"; import { Box, Skeleton, Typography } from "@mui/material"; import IconButton from "@mui/material/IconButton"; +import { Box } from "@mui/system"; import { DataGrid, GridColDef } from "@mui/x-data-grid"; import { CSSProperties } from "react"; import useSWR from "swr"; const fetcher = (url: string) => fetch(url).then((res) => res.json()); - export default function TablePanel({ // Human-readable title for this panel. title, diff --git a/torchci/lib/benchmark/compilerUtils.ts b/torchci/lib/benchmark/compilerUtils.ts index 06e4542269..00212177b3 100644 --- a/torchci/lib/benchmark/compilerUtils.ts +++ b/torchci/lib/benchmark/compilerUtils.ts @@ -456,3 +456,51 @@ export function convertToCompilerPerformanceData(data: BenchmarkData[]) { return Object.values(convertData); } + +export function computePassrateSimple(data: any[]) { + if (!Array.isArray(data) || data.length === 0) return []; + + const blocked = new Set(BLOCKLIST_COMPILERS); + const passingAcc = new Set(PASSING_ACCURACY); + const toDisplay = (c: string) => COMPILER_NAMES_TO_DISPLAY_NAMES[c] ?? c; + + const totalCount = new Map(); + const passCount = new Map(); + + for (const r of data) { + const compilerDisp = toDisplay(r.compiler); + if (blocked.has(compilerDisp)) continue; + + const key = `${r.granularity_bucket}+${r.workflow_id}+${r.suite}+${compilerDisp}`; + + // 计总 + totalCount.set(key, (totalCount.get(key) ?? 0) + 1); + + const acc = r.accuracy ?? ""; + const speed = r.speedup ?? 0; + const pass = + (passingAcc.has(acc) && (speed !== 0 || compilerDisp === "export")) || + acc === "pass_due_to_skip"; + + if (pass) passCount.set(key, (passCount.get(key) ?? 0) + 1); + } + const out: any[] = []; + for (const [key, tc] of totalCount) { + const pc = passCount.get(key) ?? 0; + const p = tc > 0 ? 
pc / tc : 0; + + const [bucket, wfStr, suite, compiler] = key.split("+"); + out.push({ + metirc: "passrate", + granularity_bucket: bucket, + workflow_id: Number(wfStr), + suite, + compiler, + passrate: p, + pass_count: pc, + total_count: tc, + passrate_display: `${(p * 100).toFixed(0)}%, ${pc}/${tc}`, + }); + } + return out; +} diff --git a/torchci/pages/api/benchmark/compiler_benmark_time_series.ts b/torchci/pages/api/benchmark/compiler_benmark_time_series.ts deleted file mode 100644 index 4e8254766d..0000000000 --- a/torchci/pages/api/benchmark/compiler_benmark_time_series.ts +++ /dev/null @@ -1,40 +0,0 @@ - -import { queryClickhouseSaved } from "lib/clickhouse"; -import type { NextApiRequest, NextApiResponse } from "next"; - -const DEFAULT_TABLE_GROUP = [ - "device", - "backend", - "model", - "dtype", - "backend", - "arch", -]; -const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance_v2"; - -type QueryParams = { - startTime: string; // ISO timestamp - stopTime: string; // ISO timestamp - [k: string]: any; // other parameters -}; - - -export default async function handler( - req: NextApiRequest, - res: NextApiResponse -) { - - console.log("compiler_benmark_time_series."); - const inputparams = JSON.parse(req.query.parameters as string) - console.log("inputs", inputparams); - - const start = Date.now(); - const rows = await queryClickhouseSaved(BENCNMARK_TABLE_NAME, inputparams); - const end = Date.now(); - console.log(`Process took ${end - start}ms`); - - console.log("merged rows:", rows.length); - res.status(200).json({ data: rows }); -} - - diff --git a/torchci/pages/api/benchmark/get_time_series.ts b/torchci/pages/api/benchmark/get_time_series.ts index ce069f5590..7f05d6b492 100644 --- a/torchci/pages/api/benchmark/get_time_series.ts +++ b/torchci/pages/api/benchmark/get_time_series.ts @@ -1,6 +1,28 @@ +<<<<<<< HEAD import { getCompilerBenchmarkData } from "lib/benchmark/api_helper/compilers/precompute"; import { readApiGetParams } from "lib/benchmark/api_helper/utils"; import type { NextApiRequest, NextApiResponse } from "next"; +======= +import { + computeGeomean, + computePassrate, + computePeakMemoryUsage, + convertToCompilerPerformanceData, + getPassingModels, +} from "lib/benchmark/compilerUtils"; +import { queryClickhouseSaved } from "lib/clickhouse"; +import type { NextApiRequest, NextApiResponse } from "next"; +import { getNestedField } from "./group_data"; + +type GroupInfo = Record; +type Subgroup = { group_Info: GroupInfo; data: T[] }; +type GroupedItem = { + group_Info: GroupInfo; + rows: Record>; +}; +type Params = Record; +const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance_v2"; +>>>>>>> 52d0ced7a (addid) /** * API Route: /api/benchmark/get_time_series @@ -22,12 +44,20 @@ export default async function handler( req: NextApiRequest, res: NextApiResponse ) { +<<<<<<< HEAD +======= + +>>>>>>> 52d0ced7a (addid) if (req.method !== "GET" && req.method !== "POST") { res.setHeader("Allow", "GET, POST"); return res.status(405).json({ error: "Only GET and POST allowed" }); } +<<<<<<< HEAD const params = readApiGetParams(req); +======= + const params = readParams(req); +>>>>>>> 52d0ced7a (addid) console.log("[API]get_time_series, received request:", params); // validate params @@ -39,6 +69,10 @@ export default async function handler( ) { return res.status(400).json({ error: "Missing parameters" }); } +<<<<<<< HEAD +======= + +>>>>>>> 52d0ced7a (addid) // get time series data try { const { name, query_params } = params; @@ -61,3 +95,171 @@ async function 
getBenmarkTimeSeriesData( throw new Error(`Unsupported request_name: ${request_name}`); } } +<<<<<<< HEAD +======= + +// Utility to extract params from either GET or POST +// it accepts both ?parameters= and POST with JSON body +function readParams(req: NextApiRequest): Params { + // 1) If POST with parsed JSON body + if (req.method === "POST" && req.body && typeof req.body === "object") { + return req.body as Params; + } + + // 2) If POST with raw string body + if ( + req.method === "POST" && + typeof req.body === "string" && + req.body.trim() + ) { + try { + return JSON.parse(req.body) as Params; + } catch {} + } + + // 3) If GET with ?parameters= + const raw = req.query.parameters as string | undefined; + if (raw) { + try { + return JSON.parse(raw) as Params; + } catch {} + } + + // 4) Fallback: use query params directly + const q: Params = {}; + Object.entries(req.query).forEach(([k, v]) => { + if (k !== "parameters") q[k] = Array.isArray(v) ? v[0] : v; + }); + return q; +} + +/** + * Group data by `keys`, and inside each group further subgroup by `subGroupKeys`. + */ +function groupBy( + data: T[], + keys: string[], + subGroupKeys: string[] = [] +): GroupedItem[] { + const groups = new Map>>(); + const mainInfo = new Map(); + + for (const row of data as any[]) { + // build main group key + const mainKeyParts = keys.map((k) => String(getNestedField(row, k))); + const mainKey = mainKeyParts.join("|"); + if (!mainInfo.has(mainKey)) { + const info: GroupInfo = {}; + keys.forEach((k, i) => (info[k] = mainKeyParts[i])); + mainInfo.set(mainKey, info); + } + + // build subgroup key + const subKeyParts = + subGroupKeys.length > 0 + ? subGroupKeys.map((k) => String(getNestedField(row, k))) + : ["__ALL__"]; // default single subgroup if none provided + const subKey = subKeyParts.join("|"); + const subInfo: GroupInfo = {}; + + subGroupKeys.forEach((k, i) => (subInfo[k] = subKeyParts[i])); + + if (!groups.has(mainKey)) groups.set(mainKey, new Map()); + const subMap = groups.get(mainKey)!; + + if (!subMap.has(subKey)) { + subMap.set(subKey, { group_Info: subInfo, data: [] }); + } + subMap.get(subKey)!.data.push(row as T); + } + + // build result array + const result: GroupedItem[] = []; + for (const [mainKey, subMap] of groups.entries()) { + const rowsObj = Object.fromEntries(subMap.entries()); + result.push({ + group_Info: mainInfo.get(mainKey)!, + rows: rowsObj, + }); + } + return result; +} + +async function getCompilerBenchmarkData(inputparams: any) { + const start = Date.now(); + const rows = await queryClickhouseSaved(BENCNMARK_TABLE_NAME, inputparams); + const end = Date.now(); + const result = toPrecomputeCompiler(rows, inputparams, "time_series"); + console.log("time to get data", end - start); + return result; +} + +function toPrecomputeCompiler( + rawData: any[], + inputparams: any, + type: string = "time_series" +) { + const data = convertToCompilerPerformanceData(rawData); + const models = getPassingModels(data); + + const passrate = computePassrate(data, models); + const geomean = computeGeomean(data, models); + const peakMemory = computePeakMemoryUsage(data, models); + + const all_data = [passrate, geomean, peakMemory].flat(); + + all_data.map((row) => { + row["dtype"] = inputparams["dtype"]; + row["arch"] = inputparams["arch"]; + row["device"] = inputparams["device"]; + row["mode"] = inputparams["mode"]; + }); + + let res: any[] = []; + switch (type) { + case "time_series": + // grouping data by comipler, device, arch, dtype, suite, metric, mode + // then sorted it with 
granularity_bucket in ascending order + const tsd = groupBy( + all_data, + ["dtype", "arch", "device", "suite", "compiler", "metric", "mode"], + ["workflow_id"] + ); + res = tsd.map((group) => { + const group_info = group.group_Info; + const group_data = group.rows; + + // no need for the group_info for subgroup, directly get the data + const ts_list = Object.values(group_data) + .filter((item) => item.data.length > 0) + .map((item) => item.data[0]) + .sort( + (a, b) => + new Date(a.granularity_bucket).getTime() - + new Date(b.granularity_bucket).getTime() + ); + return { + group_info, + num_of_dp: ts_list.length, + result: ts_list, + }; + }); + return res; + case "table": + res = groupBy( + all_data, + [ + "dtype", + "arch", + "device", + "mode", + "workflow_id", + "granularity_bucket", + ], + ["metric", "compiler"] + ); + } + + return res; +} +>>>>>>> 52d0ced7a (addid) diff --git a/torchci/pages/api/benchmark/group_data.ts b/torchci/pages/api/benchmark/group_data.ts index d8703d8a8b..030d79cc24 100644 --- a/torchci/pages/api/benchmark/group_data.ts +++ b/torchci/pages/api/benchmark/group_data.ts @@ -16,7 +16,7 @@ const DEFAULT_TABLE_GROUP = [ const DEFAULT_ROW_GROUP = ["workflow_id", "job_id", "metadata_info.timestamp"]; const BENCNMARK_TABLE_NAME = "oss_ci_benchmark_llms"; -function getNestedField(obj: any, path: string): any { +export function getNestedField(obj: any, path: string): any { return path.split(".").reduce((o, key) => (o && key in o ? o[key] : ""), obj); } From 9de60df4400fefffaf60050d7cfe4e49cb56db78 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Sun, 31 Aug 2025 18:15:02 -0700 Subject: [PATCH 03/27] addid --- .../params.json | 17 ------ .../query.sql | 61 ------------------- .../pages/api/benchmark/get_time_series.ts | 4 ++ 3 files changed, 4 insertions(+), 78 deletions(-) delete mode 100644 torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json delete mode 100644 torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql diff --git a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json deleted file mode 100644 index 95f00e1501..0000000000 --- a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "params": { - "branches": "Array(String)", - "commits": "Array(String)", - "compilers": "Array(String)", - "device": "String", - "arch": "String", - "dtype": "String", - "granularity": "String", - "mode": "String", - "startTime": "DateTime64(3)", - "stopTime": "DateTime64(3)", - "suites": "Array(String)", - "workflowId": "Int64" - }, - "tests": [] -} diff --git a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql deleted file mode 100644 index bf233f6ff8..0000000000 --- a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql +++ /dev/null @@ -1,61 +0,0 @@ -WITH benchmarks AS ( - SELECT - workflow_id, - job_id, - suite, - model_name, - metric_name, - value, - metric_extra_info AS extra_info, - DATE_TRUNC({granularity:String}, fromUnixTimestamp(timestamp)) AS granularity_bucket, - benchmark_dtype, - benchmark_mode, - device, - arch, - replaceOne(head_branch, 'refs/heads/', '') AS head_branch, - - benchmark_extra_info['output'] AS output, - - REGEXP_REPLACE( - benchmark_extra_info['output'], - CONCAT('_', suite, '_', {dtype:String}, '_', {mode:String}, '_', {device:String}, '_.*'), - '' - 
) AS temp - - FROM benchmark.oss_ci_benchmark_torchinductor - WHERE - timestamp >= toUnixTimestamp({startTime:DateTime64(3)}) AND - timestamp < toUnixTimestamp({stopTime:DateTime64(3)}) AND - (has({commits:Array(String)}, head_sha) OR empty({commits:Array(String)})) AND - (has({suites:Array(String)}, suite) OR empty({suites:Array(String)})) AND - (workflow_id = {workflowId:Int64} OR {workflowId:Int64} = 0) -) - -SELECT - workflow_id, - job_id, - REGEXP_REPLACE(temp, '.*/', '') AS backend, - suite, - model_name AS model, - metric_name AS metric, - value, - output, - granularity_bucket, - extra_info, -FROM benchmarks -WHERE - (has({branches:Array(String)}, head_branch) OR empty({branches:Array(String)})) - AND ( - ( - ({arch:String} = '' OR {arch:String} = 'a100') AND - output LIKE CONCAT('%\_', {dtype:String}, '\_', {mode:String}, '\_', {device:String}, '\_%') - ) OR ( - {arch:String} != '' AND - output LIKE CONCAT('%\_', {dtype:String}, '\_', {mode:String}, '\_', {device:String}, '\_', {arch:String}, '\_%') - ) OR ( - benchmark_dtype = {dtype:String} AND - benchmark_mode = {mode:String} AND - device = {device:String} AND - arch = {arch:String} - ) - ); diff --git a/torchci/pages/api/benchmark/get_time_series.ts b/torchci/pages/api/benchmark/get_time_series.ts index 7f05d6b492..5de906f554 100644 --- a/torchci/pages/api/benchmark/get_time_series.ts +++ b/torchci/pages/api/benchmark/get_time_series.ts @@ -21,8 +21,12 @@ type GroupedItem = { rows: Record>; }; type Params = Record; +<<<<<<< HEAD const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance_v2"; >>>>>>> 52d0ced7a (addid) +======= +const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance"; +>>>>>>> e259e2663 (addid) /** * API Route: /api/benchmark/get_time_series From 94fbf8da51985218a8a09d709dcc406236499459 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Sun, 31 Aug 2025 18:26:33 -0700 Subject: [PATCH 04/27] addid --- .../pages/api/benchmark/get_time_series.ts | 206 ------------------ 1 file changed, 206 deletions(-) diff --git a/torchci/pages/api/benchmark/get_time_series.ts b/torchci/pages/api/benchmark/get_time_series.ts index 5de906f554..ce069f5590 100644 --- a/torchci/pages/api/benchmark/get_time_series.ts +++ b/torchci/pages/api/benchmark/get_time_series.ts @@ -1,32 +1,6 @@ -<<<<<<< HEAD import { getCompilerBenchmarkData } from "lib/benchmark/api_helper/compilers/precompute"; import { readApiGetParams } from "lib/benchmark/api_helper/utils"; import type { NextApiRequest, NextApiResponse } from "next"; -======= -import { - computeGeomean, - computePassrate, - computePeakMemoryUsage, - convertToCompilerPerformanceData, - getPassingModels, -} from "lib/benchmark/compilerUtils"; -import { queryClickhouseSaved } from "lib/clickhouse"; -import type { NextApiRequest, NextApiResponse } from "next"; -import { getNestedField } from "./group_data"; - -type GroupInfo = Record; -type Subgroup = { group_Info: GroupInfo; data: T[] }; -type GroupedItem = { - group_Info: GroupInfo; - rows: Record>; -}; -type Params = Record; -<<<<<<< HEAD -const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance_v2"; ->>>>>>> 52d0ced7a (addid) -======= -const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance"; ->>>>>>> e259e2663 (addid) /** * API Route: /api/benchmark/get_time_series @@ -48,20 +22,12 @@ export default async function handler( req: NextApiRequest, res: NextApiResponse ) { -<<<<<<< HEAD -======= - ->>>>>>> 52d0ced7a (addid) if (req.method !== "GET" && req.method !== "POST") { res.setHeader("Allow", "GET, POST"); return 
res.status(405).json({ error: "Only GET and POST allowed" }); } -<<<<<<< HEAD const params = readApiGetParams(req); -======= - const params = readParams(req); ->>>>>>> 52d0ced7a (addid) console.log("[API]get_time_series, received request:", params); // validate params @@ -73,10 +39,6 @@ export default async function handler( ) { return res.status(400).json({ error: "Missing parameters" }); } -<<<<<<< HEAD -======= - ->>>>>>> 52d0ced7a (addid) // get time series data try { const { name, query_params } = params; @@ -99,171 +61,3 @@ async function getBenmarkTimeSeriesData( throw new Error(`Unsupported request_name: ${request_name}`); } } -<<<<<<< HEAD -======= - -// Utility to extract params from either GET or POST -// it accepts both ?parameters= and POST with JSON body -function readParams(req: NextApiRequest): Params { - // 1) If POST with parsed JSON body - if (req.method === "POST" && req.body && typeof req.body === "object") { - return req.body as Params; - } - - // 2) If POST with raw string body - if ( - req.method === "POST" && - typeof req.body === "string" && - req.body.trim() - ) { - try { - return JSON.parse(req.body) as Params; - } catch {} - } - - // 3) If GET with ?parameters= - const raw = req.query.parameters as string | undefined; - if (raw) { - try { - return JSON.parse(raw) as Params; - } catch {} - } - - // 4) Fallback: use query params directly - const q: Params = {}; - Object.entries(req.query).forEach(([k, v]) => { - if (k !== "parameters") q[k] = Array.isArray(v) ? v[0] : v; - }); - return q; -} - -/** - * Group data by `keys`, and inside each group further subgroup by `subGroupKeys`. - */ -function groupBy( - data: T[], - keys: string[], - subGroupKeys: string[] = [] -): GroupedItem[] { - const groups = new Map>>(); - const mainInfo = new Map(); - - for (const row of data as any[]) { - // build main group key - const mainKeyParts = keys.map((k) => String(getNestedField(row, k))); - const mainKey = mainKeyParts.join("|"); - if (!mainInfo.has(mainKey)) { - const info: GroupInfo = {}; - keys.forEach((k, i) => (info[k] = mainKeyParts[i])); - mainInfo.set(mainKey, info); - } - - // build subgroup key - const subKeyParts = - subGroupKeys.length > 0 - ? 
subGroupKeys.map((k) => String(getNestedField(row, k))) - : ["__ALL__"]; // default single subgroup if none provided - const subKey = subKeyParts.join("|"); - const subInfo: GroupInfo = {}; - - subGroupKeys.forEach((k, i) => (subInfo[k] = subKeyParts[i])); - - if (!groups.has(mainKey)) groups.set(mainKey, new Map()); - const subMap = groups.get(mainKey)!; - - if (!subMap.has(subKey)) { - subMap.set(subKey, { group_Info: subInfo, data: [] }); - } - subMap.get(subKey)!.data.push(row as T); - } - - // build result array - const result: GroupedItem[] = []; - for (const [mainKey, subMap] of groups.entries()) { - const rowsObj = Object.fromEntries(subMap.entries()); - result.push({ - group_Info: mainInfo.get(mainKey)!, - rows: rowsObj, - }); - } - return result; -} - -async function getCompilerBenchmarkData(inputparams: any) { - const start = Date.now(); - const rows = await queryClickhouseSaved(BENCNMARK_TABLE_NAME, inputparams); - const end = Date.now(); - const result = toPrecomputeCompiler(rows, inputparams, "time_series"); - console.log("time to get data", end - start); - return result; -} - -function toPrecomputeCompiler( - rawData: any[], - inputparams: any, - type: string = "time_series" -) { - const data = convertToCompilerPerformanceData(rawData); - const models = getPassingModels(data); - - const passrate = computePassrate(data, models); - const geomean = computeGeomean(data, models); - const peakMemory = computePeakMemoryUsage(data, models); - - const all_data = [passrate, geomean, peakMemory].flat(); - - all_data.map((row) => { - row["dtype"] = inputparams["dtype"]; - row["arch"] = inputparams["arch"]; - row["device"] = inputparams["device"]; - row["mode"] = inputparams["mode"]; - }); - - let res: any[] = []; - switch (type) { - case "time_series": - // grouping data by comipler, device, arch, dtype, suite, metric, mode - // then sorted it with granularity_bucket in ascending order - const tsd = groupBy( - all_data, - ["dtype", "arch", "device", "suite", "compiler", "metric", "mode"], - ["workflow_id"] - ); - res = tsd.map((group) => { - const group_info = group.group_Info; - const group_data = group.rows; - - // no need for the group_info for subgroup, directly get the data - const ts_list = Object.values(group_data) - .filter((item) => item.data.length > 0) - .map((item) => item.data[0]) - .sort( - (a, b) => - new Date(a.granularity_bucket).getTime() - - new Date(b.granularity_bucket).getTime() - ); - return { - group_info, - num_of_dp: ts_list.length, - result: ts_list, - }; - }); - return res; - case "table": - res = groupBy( - all_data, - [ - "dtype", - "arch", - "device", - "mode", - "workflow_id", - "granularity_bucket", - ], - ["metric", "compiler"] - ); - } - - return res; -} ->>>>>>> 52d0ced7a (addid) From 5127ca62423c3dd44510d5c1d7c4b2c39d026f7e Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 2 Sep 2025 11:00:06 -0700 Subject: [PATCH 05/27] addid --- torchci/pages/api/benchmark/group_data.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torchci/pages/api/benchmark/group_data.ts b/torchci/pages/api/benchmark/group_data.ts index 030d79cc24..713ed480d9 100644 --- a/torchci/pages/api/benchmark/group_data.ts +++ b/torchci/pages/api/benchmark/group_data.ts @@ -16,7 +16,7 @@ const DEFAULT_TABLE_GROUP = [ const DEFAULT_ROW_GROUP = ["workflow_id", "job_id", "metadata_info.timestamp"]; const BENCNMARK_TABLE_NAME = "oss_ci_benchmark_llms"; -export function getNestedField(obj: any, path: string): any { +function getNestedField(obj: any, path: string): any { 
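  // Illustrative example (row shape assumed from DEFAULT_ROW_GROUP above): for
  //   { workflow_id: 123, job_id: 456, metadata_info: { timestamp: "2025-09-02" } }
  // getNestedField(row, "metadata_info.timestamp") follows the dot-separated path
  // and returns "2025-09-02"; if any segment is missing, the reduce falls back to "".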
return path.split(".").reduce((o, key) => (o && key in o ? o[key] : ""), obj); } @@ -32,7 +32,6 @@ export default async function handler( details: formatZodError(request.error), }); } - const qp = request.data; const groupTableByFields = qp.group_table_by_fields || deepClone(DEFAULT_TABLE_GROUP); From 0b8e72cd2b1500e64cb0523244d6b98586354901 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 2 Sep 2025 13:02:53 -0700 Subject: [PATCH 06/27] addid --- torchci/lib/benchmark/api_helper/compilers/precompute.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/torchci/lib/benchmark/api_helper/compilers/precompute.ts b/torchci/lib/benchmark/api_helper/compilers/precompute.ts index 0a11ab44b3..9dbc888aa2 100644 --- a/torchci/lib/benchmark/api_helper/compilers/precompute.ts +++ b/torchci/lib/benchmark/api_helper/compilers/precompute.ts @@ -60,6 +60,7 @@ function toPrecomputeCompiler( type: string = "time_series" ) { const data = convertToCompilerPerformanceData(rawData); + const models = getPassingModels(data); const passrate = computePassrate(data, models); From 0ec6dcbefddc3bce215ce76528d3858a88a68c85 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 2 Sep 2025 17:49:05 -0700 Subject: [PATCH 07/27] addid --- torchci/lib/benchmark/compilerUtils.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/torchci/lib/benchmark/compilerUtils.ts b/torchci/lib/benchmark/compilerUtils.ts index 00212177b3..68a4c322a4 100644 --- a/torchci/lib/benchmark/compilerUtils.ts +++ b/torchci/lib/benchmark/compilerUtils.ts @@ -4,6 +4,7 @@ import { PASSING_ACCURACY, SCALE, } from "components/benchmark/compilers/common"; +import { number } from "echarts"; import { BenchmarkData, CompilerPerformanceData } from "lib/types"; export function getPassingModels(data: CompilerPerformanceData[]) { From 96233a32638e6e13c2c9b299d586d68f07bcb85e Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 2 Sep 2025 17:49:36 -0700 Subject: [PATCH 08/27] addid --- torchci/lib/benchmark/compilerUtils.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/torchci/lib/benchmark/compilerUtils.ts b/torchci/lib/benchmark/compilerUtils.ts index 68a4c322a4..00212177b3 100644 --- a/torchci/lib/benchmark/compilerUtils.ts +++ b/torchci/lib/benchmark/compilerUtils.ts @@ -4,7 +4,6 @@ import { PASSING_ACCURACY, SCALE, } from "components/benchmark/compilers/common"; -import { number } from "echarts"; import { BenchmarkData, CompilerPerformanceData } from "lib/types"; export function getPassingModels(data: CompilerPerformanceData[]) { From b1c79ce3991062d04d0b43f728b78c3dcefd509e Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 3 Sep 2025 02:01:45 -0700 Subject: [PATCH 09/27] addid --- .gitignore | 1 + torchci/components/benchmark/compilers/SummaryGraphPanel.tsx | 2 ++ torchci/lib/benchmark/api_helper/compilers/precompute.ts | 1 - 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 3be92dafed..e324f591ec 100644 --- a/.gitignore +++ b/.gitignore @@ -72,3 +72,4 @@ aws/tools/cleanup-ssm/**/*.rs.bk # Remove the python version file from pyenv .python-version + diff --git a/torchci/components/benchmark/compilers/SummaryGraphPanel.tsx b/torchci/components/benchmark/compilers/SummaryGraphPanel.tsx index 3faab6f3cd..dcdcbf4a15 100644 --- a/torchci/components/benchmark/compilers/SummaryGraphPanel.tsx +++ b/torchci/components/benchmark/compilers/SummaryGraphPanel.tsx @@ -88,6 +88,8 @@ function SuiteGraphPanel({ JSON.stringify(queryParamsWithSuite) )}`; + console.log("url params", queryParamsWithSuite); + let { data, error } = 
useSWR(url, fetcher, { refreshInterval: 60 * 60 * 1000, // refresh every hour }); diff --git a/torchci/lib/benchmark/api_helper/compilers/precompute.ts b/torchci/lib/benchmark/api_helper/compilers/precompute.ts index 9dbc888aa2..0a11ab44b3 100644 --- a/torchci/lib/benchmark/api_helper/compilers/precompute.ts +++ b/torchci/lib/benchmark/api_helper/compilers/precompute.ts @@ -60,7 +60,6 @@ function toPrecomputeCompiler( type: string = "time_series" ) { const data = convertToCompilerPerformanceData(rawData); - const models = getPassingModels(data); const passrate = computePassrate(data, models); From a21f91adfe4ab7e6e265222651b81a888c14f194 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 3 Sep 2025 10:40:55 -0700 Subject: [PATCH 10/27] addid --- .../api_helper/compilers/precompute.ts | 2 +- torchci/lib/benchmark/compilerUtils.ts | 48 ------------------- 2 files changed, 1 insertion(+), 49 deletions(-) diff --git a/torchci/lib/benchmark/api_helper/compilers/precompute.ts b/torchci/lib/benchmark/api_helper/compilers/precompute.ts index 0a11ab44b3..e9a9f89799 100644 --- a/torchci/lib/benchmark/api_helper/compilers/precompute.ts +++ b/torchci/lib/benchmark/api_helper/compilers/precompute.ts @@ -163,7 +163,7 @@ function toPrecomputeCompiler( } const response: BenchmarkTimeSeriesResponse = { - time_series: res, + time_series: rawData, time_range: { start: new Date(earliest_timestamp).toISOString(), end: new Date(latest_timestamp).toISOString(), diff --git a/torchci/lib/benchmark/compilerUtils.ts b/torchci/lib/benchmark/compilerUtils.ts index 00212177b3..06e4542269 100644 --- a/torchci/lib/benchmark/compilerUtils.ts +++ b/torchci/lib/benchmark/compilerUtils.ts @@ -456,51 +456,3 @@ export function convertToCompilerPerformanceData(data: BenchmarkData[]) { return Object.values(convertData); } - -export function computePassrateSimple(data: any[]) { - if (!Array.isArray(data) || data.length === 0) return []; - - const blocked = new Set(BLOCKLIST_COMPILERS); - const passingAcc = new Set(PASSING_ACCURACY); - const toDisplay = (c: string) => COMPILER_NAMES_TO_DISPLAY_NAMES[c] ?? c; - - const totalCount = new Map(); - const passCount = new Map(); - - for (const r of data) { - const compilerDisp = toDisplay(r.compiler); - if (blocked.has(compilerDisp)) continue; - - const key = `${r.granularity_bucket}+${r.workflow_id}+${r.suite}+${compilerDisp}`; - - // 计总 - totalCount.set(key, (totalCount.get(key) ?? 0) + 1); - - const acc = r.accuracy ?? ""; - const speed = r.speedup ?? 0; - const pass = - (passingAcc.has(acc) && (speed !== 0 || compilerDisp === "export")) || - acc === "pass_due_to_skip"; - - if (pass) passCount.set(key, (passCount.get(key) ?? 0) + 1); - } - const out: any[] = []; - for (const [key, tc] of totalCount) { - const pc = passCount.get(key) ?? 0; - const p = tc > 0 ? 
pc / tc : 0; - - const [bucket, wfStr, suite, compiler] = key.split("+"); - out.push({ - metirc: "passrate", - granularity_bucket: bucket, - workflow_id: Number(wfStr), - suite, - compiler, - passrate: p, - pass_count: pc, - total_count: tc, - passrate_display: `${(p * 100).toFixed(0)}%, ${pc}/${tc}`, - }); - } - return out; -} From d0b325eab6e1db88813106162facae267a709246 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 3 Sep 2025 10:42:29 -0700 Subject: [PATCH 11/27] addid --- torchci/lib/benchmark/api_helper/compilers/precompute.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchci/lib/benchmark/api_helper/compilers/precompute.ts b/torchci/lib/benchmark/api_helper/compilers/precompute.ts index e9a9f89799..0a11ab44b3 100644 --- a/torchci/lib/benchmark/api_helper/compilers/precompute.ts +++ b/torchci/lib/benchmark/api_helper/compilers/precompute.ts @@ -163,7 +163,7 @@ function toPrecomputeCompiler( } const response: BenchmarkTimeSeriesResponse = { - time_series: rawData, + time_series: res, time_range: { start: new Date(earliest_timestamp).toISOString(), end: new Date(latest_timestamp).toISOString(), From 7347feeaf5c1954a2f9348b94380d78d931fad60 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 3 Sep 2025 10:54:06 -0700 Subject: [PATCH 12/27] addid --- .gitignore | 1 - torchci/components/benchmark/compilers/SummaryGraphPanel.tsx | 2 -- 2 files changed, 3 deletions(-) diff --git a/.gitignore b/.gitignore index e324f591ec..3be92dafed 100644 --- a/.gitignore +++ b/.gitignore @@ -72,4 +72,3 @@ aws/tools/cleanup-ssm/**/*.rs.bk # Remove the python version file from pyenv .python-version - diff --git a/torchci/components/benchmark/compilers/SummaryGraphPanel.tsx b/torchci/components/benchmark/compilers/SummaryGraphPanel.tsx index dcdcbf4a15..3faab6f3cd 100644 --- a/torchci/components/benchmark/compilers/SummaryGraphPanel.tsx +++ b/torchci/components/benchmark/compilers/SummaryGraphPanel.tsx @@ -88,8 +88,6 @@ function SuiteGraphPanel({ JSON.stringify(queryParamsWithSuite) )}`; - console.log("url params", queryParamsWithSuite); - let { data, error } = useSWR(url, fetcher, { refreshInterval: 60 * 60 * 1000, // refresh every hour }); From 986178e6ccade372d2d02f45ddebafca70ab9914 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 3 Sep 2025 12:57:16 -0700 Subject: [PATCH 13/27] addid --- torchci/components/metrics/panels/TablePanel.tsx | 1 - 1 file changed, 1 deletion(-) diff --git a/torchci/components/metrics/panels/TablePanel.tsx b/torchci/components/metrics/panels/TablePanel.tsx index c53b29f4e3..d53d0ab8d6 100644 --- a/torchci/components/metrics/panels/TablePanel.tsx +++ b/torchci/components/metrics/panels/TablePanel.tsx @@ -1,7 +1,6 @@ import HelpIcon from "@mui/icons-material/Help"; import { Box, Skeleton, Typography } from "@mui/material"; import IconButton from "@mui/material/IconButton"; -import { Box } from "@mui/system"; import { DataGrid, GridColDef } from "@mui/x-data-grid"; import { CSSProperties } from "react"; import useSWR from "swr"; From ee4732790c86cafce88ec002a3f2d782c169ccdc Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Sun, 31 Aug 2025 11:15:09 -0700 Subject: [PATCH 14/27] addid --- .../params.json | 17 +++ .../query.sql | 110 ++++++++++++++++++ .../components/metrics/panels/TablePanel.tsx | 1 + .../benchmark/compiler_benmark_time_series.ts | 40 +++++++ torchci/pages/api/benchmark/group_data.ts | 1 + 5 files changed, 169 insertions(+) create mode 100644 torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json create mode 100644 
torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql create mode 100644 torchci/pages/api/benchmark/compiler_benmark_time_series.ts diff --git a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json new file mode 100644 index 0000000000..95f00e1501 --- /dev/null +++ b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json @@ -0,0 +1,17 @@ +{ + "params": { + "branches": "Array(String)", + "commits": "Array(String)", + "compilers": "Array(String)", + "device": "String", + "arch": "String", + "dtype": "String", + "granularity": "String", + "mode": "String", + "startTime": "DateTime64(3)", + "stopTime": "DateTime64(3)", + "suites": "Array(String)", + "workflowId": "Int64" + }, + "tests": [] +} diff --git a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql new file mode 100644 index 0000000000..7ef1efb232 --- /dev/null +++ b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql @@ -0,0 +1,110 @@ +-- This query is used to get the PT2 benchmark results from different experiments +-- to powers the TorchInductor benchmark dashboard +WITH benchmarks AS ( + SELECT + workflow_id, + job_id, + suite, + model_name, + metric_name, + value, + metric_extra_info AS extra_info, + DATE_TRUNC( + {granularity: String }, + fromUnixTimestamp(timestamp) + ) AS granularity_bucket, + -- Filters + benchmark_dtype, + benchmark_mode, + device, + arch, + replaceOne(head_branch, 'refs/heads/', '') AS head_branch, + benchmark_extra_info['output'] AS output, + REGEXP_REPLACE( + output, + CONCAT( + '_', + suite, + '_', + { dtype: String }, + '_', + {mode: String }, + '_', + {device: String }, + '_.*' + ), + '' + ) AS temp + FROM + benchmark.oss_ci_benchmark_torchinductor + WHERE + timestamp >= toUnixTimestamp({startTime: DateTime64(3) }) + AND timestamp < toUnixTimestamp({stopTime: DateTime64(3) }) + AND ( + has({commits: Array(String) }, head_sha) + OR empty({commits: Array(String) }) + ) + AND ( + has({suites: Array(String) }, suite) + OR empty({suites: Array(String) }) + ) + AND ( + workflow_id = {workflowId: Int64} + OR {workflowId: Int64} = 0 + ) +) + +SELECT + workflow_id, + job_id, + REGEXP_REPLACE(temp, '.*/', '') AS backend, + suite, + model_name AS model, + metric_name AS metric, + value, + extra_info, + output, + granularity_bucket +FROM + benchmarks +WHERE + ( + has({branches: Array(String) }, head_branch) + OR empty({branches: Array(String) }) + ) + -- TODO (huydhn): Clean up the output field and how it's used in the query + -- in 6 months + AND ( + ( + ({arch: String } = '' OR {arch: String } = 'a100') + AND output LIKE CONCAT( + '%\_', + {dtype: String }, + '\_', + {mode: String }, + '\_', + {device: String }, + '\_%' + ) + ) + OR ( + {arch: String } != '' + AND output LIKE CONCAT( + '%\_', + {dtype: String }, + '\_', + {mode: String }, + '\_', + {device: String }, + '\_', + {arch: String }, + '\_%' + ) + ) + OR ( + benchmark_dtype = {dtype: String } + AND benchmark_mode = {mode: String } + AND device = {device: String } + AND arch = {arch: String } + ) + ) diff --git a/torchci/components/metrics/panels/TablePanel.tsx b/torchci/components/metrics/panels/TablePanel.tsx index d53d0ab8d6..023223d238 100644 --- a/torchci/components/metrics/panels/TablePanel.tsx +++ b/torchci/components/metrics/panels/TablePanel.tsx @@ -7,6 +7,7 @@ import useSWR from "swr"; const fetcher 
= (url: string) => fetch(url).then((res) => res.json()); + export default function TablePanel({ // Human-readable title for this panel. title, diff --git a/torchci/pages/api/benchmark/compiler_benmark_time_series.ts b/torchci/pages/api/benchmark/compiler_benmark_time_series.ts new file mode 100644 index 0000000000..4e8254766d --- /dev/null +++ b/torchci/pages/api/benchmark/compiler_benmark_time_series.ts @@ -0,0 +1,40 @@ + +import { queryClickhouseSaved } from "lib/clickhouse"; +import type { NextApiRequest, NextApiResponse } from "next"; + +const DEFAULT_TABLE_GROUP = [ + "device", + "backend", + "model", + "dtype", + "backend", + "arch", +]; +const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance_v2"; + +type QueryParams = { + startTime: string; // ISO timestamp + stopTime: string; // ISO timestamp + [k: string]: any; // other parameters +}; + + +export default async function handler( + req: NextApiRequest, + res: NextApiResponse +) { + + console.log("compiler_benmark_time_series."); + const inputparams = JSON.parse(req.query.parameters as string) + console.log("inputs", inputparams); + + const start = Date.now(); + const rows = await queryClickhouseSaved(BENCNMARK_TABLE_NAME, inputparams); + const end = Date.now(); + console.log(`Process took ${end - start}ms`); + + console.log("merged rows:", rows.length); + res.status(200).json({ data: rows }); +} + + diff --git a/torchci/pages/api/benchmark/group_data.ts b/torchci/pages/api/benchmark/group_data.ts index 713ed480d9..d8703d8a8b 100644 --- a/torchci/pages/api/benchmark/group_data.ts +++ b/torchci/pages/api/benchmark/group_data.ts @@ -32,6 +32,7 @@ export default async function handler( details: formatZodError(request.error), }); } + const qp = request.data; const groupTableByFields = qp.group_table_by_fields || deepClone(DEFAULT_TABLE_GROUP); From 86b69990cb8a259748494b011ef5efbb3e61d674 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Sun, 31 Aug 2025 18:12:51 -0700 Subject: [PATCH 15/27] addid --- .../query.sql | 155 +++++--------- .../components/metrics/panels/TablePanel.tsx | 2 +- torchci/lib/benchmark/compilerUtils.ts | 54 +++++ .../benchmark/compiler_benmark_time_series.ts | 40 ---- .../pages/api/benchmark/get_time_series.ts | 190 ++++++++++++++++++ torchci/pages/api/benchmark/group_data.ts | 2 +- 6 files changed, 299 insertions(+), 144 deletions(-) delete mode 100644 torchci/pages/api/benchmark/compiler_benmark_time_series.ts diff --git a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql index 7ef1efb232..bf233f6ff8 100644 --- a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql +++ b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql @@ -1,110 +1,61 @@ --- This query is used to get the PT2 benchmark results from different experiments --- to powers the TorchInductor benchmark dashboard WITH benchmarks AS ( - SELECT - workflow_id, - job_id, - suite, - model_name, - metric_name, - value, - metric_extra_info AS extra_info, - DATE_TRUNC( - {granularity: String }, - fromUnixTimestamp(timestamp) - ) AS granularity_bucket, - -- Filters - benchmark_dtype, - benchmark_mode, - device, - arch, - replaceOne(head_branch, 'refs/heads/', '') AS head_branch, - benchmark_extra_info['output'] AS output, - REGEXP_REPLACE( - output, - CONCAT( - '_', - suite, - '_', - { dtype: String }, - '_', - {mode: String }, - '_', - {device: String }, - '_.*' - ), - '' - ) AS temp - FROM - 
benchmark.oss_ci_benchmark_torchinductor - WHERE - timestamp >= toUnixTimestamp({startTime: DateTime64(3) }) - AND timestamp < toUnixTimestamp({stopTime: DateTime64(3) }) - AND ( - has({commits: Array(String) }, head_sha) - OR empty({commits: Array(String) }) - ) - AND ( - has({suites: Array(String) }, suite) - OR empty({suites: Array(String) }) - ) - AND ( - workflow_id = {workflowId: Int64} - OR {workflowId: Int64} = 0 - ) -) - -SELECT + SELECT workflow_id, job_id, - REGEXP_REPLACE(temp, '.*/', '') AS backend, suite, - model_name AS model, - metric_name AS metric, + model_name, + metric_name, value, - extra_info, - output, - granularity_bucket -FROM - benchmarks + metric_extra_info AS extra_info, + DATE_TRUNC({granularity:String}, fromUnixTimestamp(timestamp)) AS granularity_bucket, + benchmark_dtype, + benchmark_mode, + device, + arch, + replaceOne(head_branch, 'refs/heads/', '') AS head_branch, + + benchmark_extra_info['output'] AS output, + + REGEXP_REPLACE( + benchmark_extra_info['output'], + CONCAT('_', suite, '_', {dtype:String}, '_', {mode:String}, '_', {device:String}, '_.*'), + '' + ) AS temp + + FROM benchmark.oss_ci_benchmark_torchinductor + WHERE + timestamp >= toUnixTimestamp({startTime:DateTime64(3)}) AND + timestamp < toUnixTimestamp({stopTime:DateTime64(3)}) AND + (has({commits:Array(String)}, head_sha) OR empty({commits:Array(String)})) AND + (has({suites:Array(String)}, suite) OR empty({suites:Array(String)})) AND + (workflow_id = {workflowId:Int64} OR {workflowId:Int64} = 0) +) + +SELECT + workflow_id, + job_id, + REGEXP_REPLACE(temp, '.*/', '') AS backend, + suite, + model_name AS model, + metric_name AS metric, + value, + output, + granularity_bucket, + extra_info, +FROM benchmarks WHERE + (has({branches:Array(String)}, head_branch) OR empty({branches:Array(String)})) + AND ( ( - has({branches: Array(String) }, head_branch) - OR empty({branches: Array(String) }) - ) - -- TODO (huydhn): Clean up the output field and how it's used in the query - -- in 6 months - AND ( - ( - ({arch: String } = '' OR {arch: String } = 'a100') - AND output LIKE CONCAT( - '%\_', - {dtype: String }, - '\_', - {mode: String }, - '\_', - {device: String }, - '\_%' - ) - ) - OR ( - {arch: String } != '' - AND output LIKE CONCAT( - '%\_', - {dtype: String }, - '\_', - {mode: String }, - '\_', - {device: String }, - '\_', - {arch: String }, - '\_%' - ) - ) - OR ( - benchmark_dtype = {dtype: String } - AND benchmark_mode = {mode: String } - AND device = {device: String } - AND arch = {arch: String } - ) + ({arch:String} = '' OR {arch:String} = 'a100') AND + output LIKE CONCAT('%\_', {dtype:String}, '\_', {mode:String}, '\_', {device:String}, '\_%') + ) OR ( + {arch:String} != '' AND + output LIKE CONCAT('%\_', {dtype:String}, '\_', {mode:String}, '\_', {device:String}, '\_', {arch:String}, '\_%') + ) OR ( + benchmark_dtype = {dtype:String} AND + benchmark_mode = {mode:String} AND + device = {device:String} AND + arch = {arch:String} ) + ); diff --git a/torchci/components/metrics/panels/TablePanel.tsx b/torchci/components/metrics/panels/TablePanel.tsx index 023223d238..c53b29f4e3 100644 --- a/torchci/components/metrics/panels/TablePanel.tsx +++ b/torchci/components/metrics/panels/TablePanel.tsx @@ -1,13 +1,13 @@ import HelpIcon from "@mui/icons-material/Help"; import { Box, Skeleton, Typography } from "@mui/material"; import IconButton from "@mui/material/IconButton"; +import { Box } from "@mui/system"; import { DataGrid, GridColDef } from "@mui/x-data-grid"; import { CSSProperties } from "react"; 
import useSWR from "swr"; const fetcher = (url: string) => fetch(url).then((res) => res.json()); - export default function TablePanel({ // Human-readable title for this panel. title, diff --git a/torchci/lib/benchmark/compilerUtils.ts b/torchci/lib/benchmark/compilerUtils.ts index 06e4542269..6d3e0902d9 100644 --- a/torchci/lib/benchmark/compilerUtils.ts +++ b/torchci/lib/benchmark/compilerUtils.ts @@ -100,7 +100,10 @@ export function computePassrate( const [bucket, workflowId, suite, compiler] = key.split("+"); passrate.push({ metric: "passrate", +<<<<<<< HEAD value: p, +======= +>>>>>>> 556c0ef04 (addid) granularity_bucket: bucket, workflow_id: workflowId, suite: suite, @@ -166,7 +169,10 @@ export function computeGeomean( const [bucket, workflowId, suite, compiler] = key.split("+"); returnedGeomean.push({ metric: "geomean", +<<<<<<< HEAD value: Number(gm), +======= +>>>>>>> 556c0ef04 (addid) granularity_bucket: bucket, workflow_id: workflowId, suite: suite, @@ -456,3 +462,51 @@ export function convertToCompilerPerformanceData(data: BenchmarkData[]) { return Object.values(convertData); } + +export function computePassrateSimple(data: any[]) { + if (!Array.isArray(data) || data.length === 0) return []; + + const blocked = new Set(BLOCKLIST_COMPILERS); + const passingAcc = new Set(PASSING_ACCURACY); + const toDisplay = (c: string) => COMPILER_NAMES_TO_DISPLAY_NAMES[c] ?? c; + + const totalCount = new Map(); + const passCount = new Map(); + + for (const r of data) { + const compilerDisp = toDisplay(r.compiler); + if (blocked.has(compilerDisp)) continue; + + const key = `${r.granularity_bucket}+${r.workflow_id}+${r.suite}+${compilerDisp}`; + + // 计总 + totalCount.set(key, (totalCount.get(key) ?? 0) + 1); + + const acc = r.accuracy ?? ""; + const speed = r.speedup ?? 0; + const pass = + (passingAcc.has(acc) && (speed !== 0 || compilerDisp === "export")) || + acc === "pass_due_to_skip"; + + if (pass) passCount.set(key, (passCount.get(key) ?? 0) + 1); + } + const out: any[] = []; + for (const [key, tc] of totalCount) { + const pc = passCount.get(key) ?? 0; + const p = tc > 0 ? 
pc / tc : 0; + + const [bucket, wfStr, suite, compiler] = key.split("+"); + out.push({ + metirc: "passrate", + granularity_bucket: bucket, + workflow_id: Number(wfStr), + suite, + compiler, + passrate: p, + pass_count: pc, + total_count: tc, + passrate_display: `${(p * 100).toFixed(0)}%, ${pc}/${tc}`, + }); + } + return out; +} diff --git a/torchci/pages/api/benchmark/compiler_benmark_time_series.ts b/torchci/pages/api/benchmark/compiler_benmark_time_series.ts deleted file mode 100644 index 4e8254766d..0000000000 --- a/torchci/pages/api/benchmark/compiler_benmark_time_series.ts +++ /dev/null @@ -1,40 +0,0 @@ - -import { queryClickhouseSaved } from "lib/clickhouse"; -import type { NextApiRequest, NextApiResponse } from "next"; - -const DEFAULT_TABLE_GROUP = [ - "device", - "backend", - "model", - "dtype", - "backend", - "arch", -]; -const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance_v2"; - -type QueryParams = { - startTime: string; // ISO timestamp - stopTime: string; // ISO timestamp - [k: string]: any; // other parameters -}; - - -export default async function handler( - req: NextApiRequest, - res: NextApiResponse -) { - - console.log("compiler_benmark_time_series."); - const inputparams = JSON.parse(req.query.parameters as string) - console.log("inputs", inputparams); - - const start = Date.now(); - const rows = await queryClickhouseSaved(BENCNMARK_TABLE_NAME, inputparams); - const end = Date.now(); - console.log(`Process took ${end - start}ms`); - - console.log("merged rows:", rows.length); - res.status(200).json({ data: rows }); -} - - diff --git a/torchci/pages/api/benchmark/get_time_series.ts b/torchci/pages/api/benchmark/get_time_series.ts index ce069f5590..a51a4f77cd 100644 --- a/torchci/pages/api/benchmark/get_time_series.ts +++ b/torchci/pages/api/benchmark/get_time_series.ts @@ -1,6 +1,28 @@ +<<<<<<< HEAD import { getCompilerBenchmarkData } from "lib/benchmark/api_helper/compilers/precompute"; import { readApiGetParams } from "lib/benchmark/api_helper/utils"; import type { NextApiRequest, NextApiResponse } from "next"; +======= +import { + computeGeomean, + computePassrate, + computePeakMemoryUsage, + convertToCompilerPerformanceData, + getPassingModels, +} from "lib/benchmark/compilerUtils"; +import { queryClickhouseSaved } from "lib/clickhouse"; +import type { NextApiRequest, NextApiResponse } from "next"; +import { getNestedField } from "./group_data"; + +type GroupInfo = Record; +type Subgroup = { group_Info: GroupInfo; data: T[] }; +type GroupedItem = { + group_Info: GroupInfo; + rows: Record>; +}; +type Params = Record; +const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance_v2"; +>>>>>>> 556c0ef04 (addid) /** * API Route: /api/benchmark/get_time_series @@ -61,3 +83,171 @@ async function getBenmarkTimeSeriesData( throw new Error(`Unsupported request_name: ${request_name}`); } } +<<<<<<< HEAD +======= + +// Utility to extract params from either GET or POST +// it accepts both ?parameters= and POST with JSON body +function readParams(req: NextApiRequest): Params { + // 1) If POST with parsed JSON body + if (req.method === "POST" && req.body && typeof req.body === "object") { + return req.body as Params; + } + + // 2) If POST with raw string body + if ( + req.method === "POST" && + typeof req.body === "string" && + req.body.trim() + ) { + try { + return JSON.parse(req.body) as Params; + } catch {} + } + + // 3) If GET with ?parameters= + const raw = req.query.parameters as string | undefined; + if (raw) { + try { + return JSON.parse(raw) as Params; + } catch 
{} + } + + // 4) Fallback: use query params directly + const q: Params = {}; + Object.entries(req.query).forEach(([k, v]) => { + if (k !== "parameters") q[k] = Array.isArray(v) ? v[0] : v; + }); + return q; +} + +/** + * Group data by `keys`, and inside each group further subgroup by `subGroupKeys`. + */ +function groupBy( + data: T[], + keys: string[], + subGroupKeys: string[] = [] +): GroupedItem[] { + const groups = new Map>>(); + const mainInfo = new Map(); + + for (const row of data as any[]) { + // build main group key + const mainKeyParts = keys.map((k) => String(getNestedField(row, k))); + const mainKey = mainKeyParts.join("|"); + if (!mainInfo.has(mainKey)) { + const info: GroupInfo = {}; + keys.forEach((k, i) => (info[k] = mainKeyParts[i])); + mainInfo.set(mainKey, info); + } + + // build subgroup key + const subKeyParts = + subGroupKeys.length > 0 + ? subGroupKeys.map((k) => String(getNestedField(row, k))) + : ["__ALL__"]; // default single subgroup if none provided + const subKey = subKeyParts.join("|"); + const subInfo: GroupInfo = {}; + + subGroupKeys.forEach((k, i) => (subInfo[k] = subKeyParts[i])); + + if (!groups.has(mainKey)) groups.set(mainKey, new Map()); + const subMap = groups.get(mainKey)!; + + if (!subMap.has(subKey)) { + subMap.set(subKey, { group_Info: subInfo, data: [] }); + } + subMap.get(subKey)!.data.push(row as T); + } + + // build result array + const result: GroupedItem[] = []; + for (const [mainKey, subMap] of groups.entries()) { + const rowsObj = Object.fromEntries(subMap.entries()); + result.push({ + group_Info: mainInfo.get(mainKey)!, + rows: rowsObj, + }); + } + return result; +} + +async function getCompilerBenchmarkData(inputparams: any) { + const start = Date.now(); + const rows = await queryClickhouseSaved(BENCNMARK_TABLE_NAME, inputparams); + const end = Date.now(); + const result = toPrecomputeCompiler(rows, inputparams, "time_series"); + console.log("time to get data", end - start); + return result; +} + +function toPrecomputeCompiler( + rawData: any[], + inputparams: any, + type: string = "time_series" +) { + const data = convertToCompilerPerformanceData(rawData); + const models = getPassingModels(data); + + const passrate = computePassrate(data, models); + const geomean = computeGeomean(data, models); + const peakMemory = computePeakMemoryUsage(data, models); + + const all_data = [passrate, geomean, peakMemory].flat(); + + all_data.map((row) => { + row["dtype"] = inputparams["dtype"]; + row["arch"] = inputparams["arch"]; + row["device"] = inputparams["device"]; + row["mode"] = inputparams["mode"]; + }); + + let res: any[] = []; + switch (type) { + case "time_series": + // grouping data by comipler, device, arch, dtype, suite, metric, mode + // then sorted it with granularity_bucket in ascending order + const tsd = groupBy( + all_data, + ["dtype", "arch", "device", "suite", "compiler", "metric", "mode"], + ["workflow_id"] + ); + res = tsd.map((group) => { + const group_info = group.group_Info; + const group_data = group.rows; + + // no need for the group_info for subgroup, directly get the data + const ts_list = Object.values(group_data) + .filter((item) => item.data.length > 0) + .map((item) => item.data[0]) + .sort( + (a, b) => + new Date(a.granularity_bucket).getTime() - + new Date(b.granularity_bucket).getTime() + ); + return { + group_info, + num_of_dp: ts_list.length, + result: ts_list, + }; + }); + return res; + case "table": + res = groupBy( + all_data, + [ + "dtype", + "arch", + "device", + "mode", + "workflow_id", + 
"granularity_bucket", + ], + ["metric", "compiler"] + ); + } + + return res; +} +>>>>>>> 556c0ef04 (addid) diff --git a/torchci/pages/api/benchmark/group_data.ts b/torchci/pages/api/benchmark/group_data.ts index d8703d8a8b..030d79cc24 100644 --- a/torchci/pages/api/benchmark/group_data.ts +++ b/torchci/pages/api/benchmark/group_data.ts @@ -16,7 +16,7 @@ const DEFAULT_TABLE_GROUP = [ const DEFAULT_ROW_GROUP = ["workflow_id", "job_id", "metadata_info.timestamp"]; const BENCNMARK_TABLE_NAME = "oss_ci_benchmark_llms"; -function getNestedField(obj: any, path: string): any { +export function getNestedField(obj: any, path: string): any { return path.split(".").reduce((o, key) => (o && key in o ? o[key] : ""), obj); } From c70d56c6aad541adef905177d3a9410c7c057bd8 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 2 Sep 2025 12:29:30 -0700 Subject: [PATCH 16/27] addid --- .../.gitignore | 3 + .../Makefile | 19 + .../lambda_function.py | 385 ++++++++++++++++++ .../requirements.txt | 5 + 4 files changed, 412 insertions(+) create mode 100644 aws/lambda/benchmark_regression_summary_report/.gitignore create mode 100644 aws/lambda/benchmark_regression_summary_report/Makefile create mode 100644 aws/lambda/benchmark_regression_summary_report/lambda_function.py create mode 100644 aws/lambda/benchmark_regression_summary_report/requirements.txt diff --git a/aws/lambda/benchmark_regression_summary_report/.gitignore b/aws/lambda/benchmark_regression_summary_report/.gitignore new file mode 100644 index 0000000000..bd92f6376a --- /dev/null +++ b/aws/lambda/benchmark_regression_summary_report/.gitignore @@ -0,0 +1,3 @@ +*.zip +deployment/ +venv/ diff --git a/aws/lambda/benchmark_regression_summary_report/Makefile b/aws/lambda/benchmark_regression_summary_report/Makefile new file mode 100644 index 0000000000..478548770a --- /dev/null +++ b/aws/lambda/benchmark_regression_summary_report/Makefile @@ -0,0 +1,19 @@ +all: run-local + +clean: + rm -rf deployment + rm -rf venv + rm -rf deployment.zip + +venv/bin/python: + virtualenv venv + venv/bin/pip install -r requirements.txt + +deployment.zip: + mkdir -p deployment + cp lambda_function.py ./deployment/. + pip3.10 install -r requirements.txt -t ./deployment/. --platform manylinux2014_x86_64 --only-binary=:all: --implementation cp --python-version 3.10 --upgrade + cd ./deployment && zip -q -r ../deployment.zip . 
+ +.PHONY: create-deployment-package +create-deployment-package: deployment.zip diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py new file mode 100644 index 0000000000..5e57d49e99 --- /dev/null +++ b/aws/lambda/benchmark_regression_summary_report/lambda_function.py @@ -0,0 +1,385 @@ +#!/usr/bin/env python +import argparse +import json +import logging +import os +import threading +from collections import defaultdict +from concurrent.futures import as_completed, ThreadPoolExecutor +from datetime import datetime, timedelta, timezone + +# Local imports +from typing import Any, Dict, Iterable, List, Optional, Set + +import clickhouse_connect +import yaml +from dateutil.parser import parse +from github import Auth, Github + + +logging.basicConfig( + level=logging.INFO, +) +logger = logging.getLogger() +logger.setLevel("INFO") + +ENVS = { + "GITHUB_ACCESS_TOKEN": os.getenv("GITHUB_ACCESS_TOKEN", ""), + "CLICKHOUSE_ENDPOINT": os.getenv("CLICKHOUSE_ENDPOINT", ""), + "CLICKHOUSE_PASSWORD": os.getenv("CLICKHOUSE_PASSWORD", ""), + "CLICKHOUSE_USERNAME": os.getenv("CLICKHOUSE_USERNAME", ""), +} + + +def get_clickhouse_client( + host: str, user: str, password: str +) -> clickhouse_connect.driver.client.Client: + # for local testing only, disable SSL verification + # return clickhouse_connect.get_client(host=host, user=user, password=password,secure=True, verify=False) + + return clickhouse_connect.get_client( + host=host, user=user, password=password, secure=True + ) + + +def get_clickhouse_client_environment() -> clickhouse_connect.driver.client.Client: + for name, env_val in ENVS.items(): + if not env_val: + raise ValueError(f"Missing environment variable {name}") + return get_clickhouse_client( + host=ENVS["CLICKHOUSE_ENDPOINT"], + user=ENVS["CLICKHOUSE_USERNAME"], + password=ENVS["CLICKHOUSE_PASSWORD"], + ) + + +def is_unix_timestamp(value: str) -> bool: + """Check if the string is a valid Unix timestamp.""" + if value.isdigit(): # Ensure it's numeric + try: + timestamp = int(value) + # Check if it's within a reasonable range (1970 to 2100) + datetime.fromtimestamp(timestamp) + return True + except (ValueError, OSError): + return False + return False + + +def to_timestap_str(time: datetime) -> str: + return str(int(time.timestamp())) + + +def write_to_file(data: Any, filename="", path=""): + """ + Writes data to a specified file. If no path is provided, writes to the current directory. + + :param data: The content to write to the file. + :param filename: The name of the file (default: 'output.txt'). + :param path: The directory where the file should be saved (default: current directory). + """ + + if not filename: + filename = "output_snapshot.json" + if not path: + path = "." 
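For illustration, a minimal usage sketch of the helper above; how it is wired into the dry-run path is not shown in this patch, and the file name and records below are made up for the example:

    import json

    records = [{"metric": "passrate", "value": 0.91}]
    # creates ./snapshots/ if needed and writes the serialized records into
    # ./snapshots/summary_report_snapshot.json
    write_to_file(json.dumps(records), filename="summary_report_snapshot.json", path="./snapshots")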
+ + # Ensure the path exists + os.makedirs(path, exist_ok=True) + + # Construct full file path + file_path = os.path.join(path, filename) + + # Write data to file + with open(file_path, "w", encoding="utf-8") as file: + file.write(data) + logger.info(f"File written to: {os.path.abspath(file_path)}") + + + +class BenchmarkSummaryProcessor: + """ + """ + + def __init__( + self, + is_dry_run: bool = False, + local_output: bool = False, + output_snapshot_file_name: str = "summary_report_snapshot", + output_snapshot_file_path: str = "", + ) -> None: + self.is_dry_run = is_dry_run + self.is_dry_run = is_dry_run + self.output_snapshot_file_name = output_snapshot_file_name + self.output_snapshot_file_path = output_snapshot_file_path + self.local_output = local_output and is_dry_run + + def process( + self, + start_time: datetime, + end_time: datetime, + cc: Optional[clickhouse_connect.driver.client.Client] = None, + args: Optional[argparse.Namespace] = None, + ) -> Dict[str, Any]: + # ensure each thread has its own clickhouse client. clickhouse client + # is not thread-safe. + if cc is None: + tlocal = threading.local() + if not hasattr(tlocal, "cc") or tlocal.cc is None: + if args: + tlocal.cc = get_clickhouse_client( + args.clickhouse_endpoint, + args.clickhouse_username, + args.clickhouse_password, + ) + else: + tlocal.cc = get_clickhouse_client_environment() + cc = tlocal.cc + + # fetches config to get time series from api + + + + queued_jobs = self._fetch_snapshot_from_db(cc, start_time, end_time, repo) + + if len(queued_jobs) == 0: + logger.info( + f" [QueueTimeProcessor][Snapshot {to_timestap_str(end_time)}] " + + f"No jobs in queue in time range: [{start_time},{end_time}]" + ) + + # add runner labels to each job based on machine type + self._add_runner_labels( + queued_jobs, + start_time, + meta_runner_config_retriever, + lf_runner_config_retriever, + old_lf_lf_runner_config_retriever, + ) + + if len(queued_jobs) == 0: + logger.info( + f" [QueueTimeProcessor][Snapshot {to_timestap_str(end_time)}] " + + "No queued jobs, skipping generating histogram records.." + ) + + records = QueuedJobHistogramGenerator().generate_histogram_records( + queued_jobs, + datetime.now(timezone.utc), + "half-hour-mark-queue-time-histogram", + end_time, + ) + + if len(records) == 0: + logger.info( + f" [QueueTimeProcessor][Snapshot {to_timestap_str(end_time)}] " + + "No histogram records, skipping writing.." + ) + + if self.is_dry_run: + logger.info( + f" [Dry Run Mode][Snapshot {to_timestap_str(end_time)}] " + + "Writing results to terminal/local file ..." + ) + self._output_record(queued_jobs, end_time, type="queued_jobs") + self._output_record(records, end_time, type="records") + logger.info( + f" [Dry Run Mode][Snapshot {to_timestap_str(end_time)}] " + + "Done. Write results to terminal/local file ." + ) + else: + self._write_to_db_table(cc, records) + + return { + "start_time": to_timestap_str(start_time), + "end_time": to_timestap_str(end_time), + "jobs_count": len(queued_jobs), + "records_count": len(records), + } + + +class WorkerPoolHandler: + """ + WorkerPoolHandler runs workers in parallel to generate benchmark regression report + and writes the results to the target destination. 
+ + """ + + def __init__( + self, + benchmark_summary_processor: BenchmarkSummaryProcessor, + max_workers: int = 4, + ): + self.benchmark_summary_processor = benchmark_summary_processor + self.max_workers = max_workers + + def start( + self, + config: Dict[str, Any], + args: Optional[argparse.Namespace] = None, + ) -> None: + logger.info( + "[WorkerPoolHandler] start to process benchmark summary data with config %s", config["name"] + ) + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + futures = [] + for interval in time_intervals: + future = executor.submit( + self.benchmark_summary_processor.process, + config, + cc=None, + args=args, + ) + futures.append(future) + results = [] + errors = [] + + # handle results from parallel processing + for future in as_completed(futures): + try: + result = future.result() + # This will raise an exception if one occurred + results.append(result) + except Exception as e: + logger.warning(f"Error processing future: {e}") + errors.append({"error": str(e)}) + +def main( + args: Optional[argparse.Namespace] = None, + github_access_token: str = "", + is_dry_run: bool = False, + local_output: bool = False, + output_snapshot_file_name: str = "job_queue_times_snapshot", + output_snapshot_file_path: str = "", +): + """ + Main method to run in both local environment and lambda handler. + 1. generate intervals[start_time,end_time] using latest timestamp from source table and target table + 2. call WorkerPoolHandler to geneterate and write histogram data for each interval in parallel + """ + # gets config retrievers, this is used to generate runner labels for histgram + if not github_access_token: + raise ValueError("Missing environment variable GITHUB_ACCESS_TOKEN") + config_retrievers = get_config_retrievers(github_access_token) + + # get time intervals. + logger.info(" [Main] generating time intervals ....") + if args: + cc = get_clickhouse_client( + args.clickhouse_endpoint, args.clickhouse_username, args.clickhouse_password + ) + else: + cc = get_clickhouse_client_environment() + time_intervals = TimeIntervalGenerator().generate(cc) + + + # get jobs in queue from clickhouse for list of time intervals, in parallel + handler = WorkerPoolHandler( + config_retrievers, + BenchmarkSummaryProcessor( + is_dry_run=is_dry_run, + local_output=local_output, + output_snapshot_file_name=output_snapshot_file_name, + output_snapshot_file_path=output_snapshot_file_path, + ), + ) + handler.start(time_intervals, args) + logger.info(" [Main] Done. work completed.") + + +def lambda_handler(event: Any, context: Any) -> None: + """ + Main method to run in aws lambda environment + """ + main( + None, + github_access_token=ENVS["GITHUB_ACCESS_TOKEN"], + ) + return + + +def parse_args() -> argparse.Namespace: + """ + Parse command line args, this is mainly used for local test environment. 
+ """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--clickhouse-endpoint", + default=ENVS["CLICKHOUSE_ENDPOINT"], + type=str, + help="the clickhouse endpoint, the clickhouse_endpoint " + + "name is https://{clickhouse_endpoint}:{port} for full url ", + ) + parser.add_argument( + "--clickhouse-username", + type=str, + default=ENVS["CLICKHOUSE_USERNAME"], + help="the clickhouse username", + ) + parser.add_argument( + "--clickhouse-password", + type=str, + default=ENVS["CLICKHOUSE_PASSWORD"], + help="the clickhouse password for the user name", + ) + parser.add_argument( + "--github-access-token", + type=str, + default=ENVS["GITHUB_ACCESS_TOKEN"], + help="the github access token to access github api", + ) + parser.add_argument( + "--local-output", + action="store_true", + help="when set, generate json result in local environment. " + + "this is only used for local test environment when dry-run is enabled", + ) + parser.add_argument( + "--not-dry-run", + action="store_true", + help="when set, writing results to destination from local " + + "environment. By default, we run in dry-run mode for local " + + "environment", + ) + parser.add_argument( + "--output-file-name", + type=str, + default="job_queue_times_snapshot.json", + help="the name of output file for local environment. this " + + "is only used for local test environment when local-output is enabled", + ) + parser.add_argument( + "--output-file-path", + type=str, + default="", + help="the path of output file for local environment. this is " + + "only used for local test environment when local-output is enabled", + ) + args, _ = parser.parse_known_args() + return args + + +def local_run() -> None: + """ + method to run in local test environment + """ + + args = parse_args() + + # update environment variables for input parameters + + # always run in dry-run mode in local environment, unless it's disabled. 
+ is_dry_run = not args.not_dry_run + + main( + args, + args.github_access_token, + is_dry_run=is_dry_run, + local_output=args.local_output, + output_snapshot_file_name=args.output_file_name, + output_snapshot_file_path=args.output_file_path, + ) + + +if __name__ == "__main__": + local_run() diff --git a/aws/lambda/benchmark_regression_summary_report/requirements.txt b/aws/lambda/benchmark_regression_summary_report/requirements.txt new file mode 100644 index 0000000000..87c33c2e7f --- /dev/null +++ b/aws/lambda/benchmark_regression_summary_report/requirements.txt @@ -0,0 +1,5 @@ +clickhouse_connect==0.8.5 +boto3==1.35.33 +PyGithub==1.59.0 +python-dateutil==2.8.2 +PyYAML==6.0.1 From 6c17c24bdc29ca650fd8eb184c22236bd77d138b Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 2 Sep 2025 14:24:21 -0700 Subject: [PATCH 17/27] addid --- .../Makefile | 3 +- .../lambda_function.py | 38 +++--- .../lib/benchmark_time_series_api_model.py | 27 ++++ .../lib/config.py | 55 ++++++++ .../lib/config_model.py | 128 ++++++++++++++++++ 5 files changed, 232 insertions(+), 19 deletions(-) create mode 100644 aws/lambda/benchmark_regression_summary_report/lib/benchmark_time_series_api_model.py create mode 100644 aws/lambda/benchmark_regression_summary_report/lib/config.py create mode 100644 aws/lambda/benchmark_regression_summary_report/lib/config_model.py diff --git a/aws/lambda/benchmark_regression_summary_report/Makefile b/aws/lambda/benchmark_regression_summary_report/Makefile index 478548770a..3db1a588ca 100644 --- a/aws/lambda/benchmark_regression_summary_report/Makefile +++ b/aws/lambda/benchmark_regression_summary_report/Makefile @@ -11,7 +11,8 @@ venv/bin/python: deployment.zip: mkdir -p deployment - cp lambda_function.py ./deployment/. + cp lambda_function.py lib ./deployment/. + pip3.10 install -r requirements.txt -t ./deployment/. --platform manylinux2014_x86_64 --only-binary=:all: --implementation cp --python-version 3.10 --upgrade cd ./deployment && zip -q -r ../deployment.zip . diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py index 5e57d49e99..8b7ae7cd5e 100644 --- a/aws/lambda/benchmark_regression_summary_report/lambda_function.py +++ b/aws/lambda/benchmark_regression_summary_report/lambda_function.py @@ -7,6 +7,8 @@ from collections import defaultdict from concurrent.futures import as_completed, ThreadPoolExecutor from datetime import datetime, timedelta, timezone +from lib.config import BENCHMARK_REGRESSION_CONFIG +from jinja2 import Template # Local imports from typing import Any, Dict, Iterable, List, Optional, Set @@ -69,7 +71,6 @@ def is_unix_timestamp(value: str) -> bool: def to_timestap_str(time: datetime) -> str: return str(int(time.timestamp())) - def write_to_file(data: Any, filename="", path=""): """ Writes data to a specified file. If no path is provided, writes to the current directory. 
@@ -95,6 +96,21 @@ def write_to_file(data: Any, filename="", path=""): file.write(data) logger.info(f"File written to: {os.path.abspath(file_path)}") +BENCHMARK_REGRESSION_SUMMARY_REPORT_TABLE = "benchmark_regression_summary_report" + + +def get_runtime_config(name: str, start: datetime, end: datetime): + + try: + config = BENCHMARK_REGRESSION_CONFIG[name] + tmpl = Template(config.source.api_endpoint_params_template) + rendered = tmpl.render( + startTime=start.isoformat(timespec="milliseconds") + "Z", + stopTime=end.isoformat(timespec="milliseconds") + "Z", + ) + cfg = dict(config) # shallow copy + cfg["api_endpoint_params"] = json.loads(rendered) + return cfg class BenchmarkSummaryProcessor: @@ -109,15 +125,10 @@ def __init__( output_snapshot_file_path: str = "", ) -> None: self.is_dry_run = is_dry_run - self.is_dry_run = is_dry_run - self.output_snapshot_file_name = output_snapshot_file_name - self.output_snapshot_file_path = output_snapshot_file_path - self.local_output = local_output and is_dry_run def process( self, - start_time: datetime, - end_time: datetime, + config: Dict[str, Any], cc: Optional[clickhouse_connect.driver.client.Client] = None, args: Optional[argparse.Namespace] = None, ) -> Dict[str, Any]: @@ -138,8 +149,6 @@ def process( # fetches config to get time series from api - - queued_jobs = self._fetch_snapshot_from_db(cc, start_time, end_time, repo) if len(queued_jobs) == 0: @@ -148,15 +157,6 @@ def process( + f"No jobs in queue in time range: [{start_time},{end_time}]" ) - # add runner labels to each job based on machine type - self._add_runner_labels( - queued_jobs, - start_time, - meta_runner_config_retriever, - lf_runner_config_retriever, - old_lf_lf_runner_config_retriever, - ) - if len(queued_jobs) == 0: logger.info( f" [QueueTimeProcessor][Snapshot {to_timestap_str(end_time)}] " @@ -190,6 +190,7 @@ def process( else: self._write_to_db_table(cc, records) + return { "start_time": to_timestap_str(start_time), "end_time": to_timestap_str(end_time), @@ -198,6 +199,7 @@ def process( } + class WorkerPoolHandler: """ WorkerPoolHandler runs workers in parallel to generate benchmark regression report diff --git a/aws/lambda/benchmark_regression_summary_report/lib/benchmark_time_series_api_model.py b/aws/lambda/benchmark_regression_summary_report/lib/benchmark_time_series_api_model.py new file mode 100644 index 0000000000..93b8131a3a --- /dev/null +++ b/aws/lambda/benchmark_regression_summary_report/lib/benchmark_time_series_api_model.py @@ -0,0 +1,27 @@ + +from dataclasses import dataclass, field +from os import error +from typing import Any, Dict, List, Optional + + +@dataclass +class TimeRange: + start: str + end: str + +@dataclass +class TimeSeriesItem: + group_info: Dict[str, Any] # flexible, could make a stricter dataclass if schema is known + num_of_dp: int + data: List[Dict[str, Any]] = field(default_factory=list) + +@dataclass +class ApiData: + time_series: List[TimeSeriesItem] + time_range: TimeRange + + +@dataclass +class ApiResponse: + data: Optional[ApiData] = None # present if success + error: Optional[str] = None # present if failure diff --git a/aws/lambda/benchmark_regression_summary_report/lib/config.py b/aws/lambda/benchmark_regression_summary_report/lib/config.py new file mode 100644 index 0000000000..e5465b0419 --- /dev/null +++ b/aws/lambda/benchmark_regression_summary_report/lib/config.py @@ -0,0 +1,55 @@ + + +from lib.config_model import BenchmarkApiSource, BenchmarkConfig, BenchmarkRegressionConfigBook, DayRangeWindow, Frequency, RegressionPolicy, 
Policy, RangeConfig + + +# compiler benchmark regression config +COMPILER_BENCHMARK_CONFIG = BenchmarkConfig( + name="Compiler Benchmark Regression", + id = "compiler_regression", + source=BenchmarkApiSource( + api_query_url="http://localhost:3000/api/benchmark/get_time_series", + # currently we only detect the regression for h100 with dtype bfloat16, and mode inference + # we can extend this to other devices, dtypes and mode in the future + api_endpoint_params_template=""" + { + "name": "compiler_precompute", + "query_params": { + "commits": [], + "compilers": [], + "arch": "h100", + "device": "cuda", + "dtype": "bfloat16", + "granularity": "hour", + "mode": "inference", + "startTime": "{{ startTime }}", + "stopTime": "{{ stopTime }}", + "suites": ["torchbench", "huggingface", "timm_models"], + "workflowId": 0, + "branches": ["main"] + } + } + """ + ), + # set baseline from past 7 days using avg, and compare with the last 1 day + policy=Policy( + frequency=Frequency(value=7, unit="days"), + range=RangeConfig( + baseline=DayRangeWindow(value=7), + comparison=DayRangeWindow(value=1), + ), + metrics={ + "passrate": RegressionPolicy(name="passrate",condition="greater_than", threshold=0.9), + "geomean": RegressionPolicy(name="geomean",condition="greater_than", threshold=0.95), + "dynamo_peak_mem": RegressionPolicy( + name="dynamo_peak_mem",condition="greater_than", threshold=0.9 + ), + }, + ), + ) + +BENCHMARK_REGRESSION_CONFIG = BenchmarkRegressionConfigBook( + configs={ + "compiler_regression":COMPILER_BENCHMARK_CONFIG, + } +) diff --git a/aws/lambda/benchmark_regression_summary_report/lib/config_model.py b/aws/lambda/benchmark_regression_summary_report/lib/config_model.py new file mode 100644 index 0000000000..774a136ed6 --- /dev/null +++ b/aws/lambda/benchmark_regression_summary_report/lib/config_model.py @@ -0,0 +1,128 @@ +from __future__ import annotations +from dataclasses import dataclass, field +from typing import Any, Dict, Literal, Optional, Set +from datetime import datetime, timedelta +from jinja2 import Environment, Template, meta +import json + + +# -------- Frequency -------- +@dataclass +class Frequency: + value: int + unit: Literal["days", "weeks"] + + def to_timedelta(self) -> timedelta: + """Convert frequency into a datetime.timedelta.""" + if self.unit == "days": + return timedelta(days=self.value) + elif self.unit == "weeks": + return timedelta(weeks=self.value) + else: + raise ValueError(f"Unsupported unit: {self.unit}") + + +# -------- Source -------- +_JINJA_ENV = Environment(autoescape=False) + +@dataclass +class BenchmarkApiSource: + api_query_url: str + api_endpoint_params_template: str + default_ctx: Dict[str, Any] = field(default_factory=dict) + + def required_template_vars(self) -> set[str]: + ast = _JINJA_ENV.parse(self.api_endpoint_params_template) + return set(meta.find_undeclared_variables(ast)) + + def render(self, ctx: Dict[str, Any], strict: bool = True) -> dict: + """Render with caller-supplied context (no special casing for start/end).""" + merged = {**self.default_ctx, **ctx} + + if strict: + required = self.required_template_vars() + missing = required - merged.keys() + if missing: + raise ValueError(f"Missing required vars: {missing}") + rendered = Template(self.api_endpoint_params_template).render(**merged) + return json.loads(rendered) + + +# -------- Policy: range windows -------- +@dataclass +class DayRangeWindow: + value: int + # raw indicates fetch from the source data + source: Literal["raw"] = "raw" + + + + + + + +@dataclass +class 
RangeConfig: + baseline: RangeWindow + comparison: RangeWindow + + +# -------- Policy: metrics -------- +@dataclass +class RegressionPolicy: + name: str + # Meaning: + # - "greater_than": higher is better; violation if value < baseline * threshold + # - "less_than": lower is better; violation if value > baseline * threshold + # - "equal_to": value should be ~= baseline * threshold within rel_tol + condition: Literal["greater_than", "less_than", "equal_to"] + threshold: float + rel_tol: float = 1e-3 # used only for "equal_to" + + def is_violation(self, value: float, baseline: float) -> bool: + target = baseline * self.threshold + + if self.condition == "greater_than": + # value should be >= target + return value < target + + if self.condition == "less_than": + # value should be <= target + return value > target + + # equal_to: |value - target| should be within rel_tol * max(1, |target|) + denom = max(1.0, abs(target)) + return abs(value - target) > self.rel_tol * denom + +@dataclass +class Policy: + frequency: Frequency + range: RangeConfig + metrics: Dict[str, RegressionPolicy] + + +# -------- Top-level benchmark regression config -------- +@dataclass +class BenchmarkConfig: + """ + BenchmarkConfig defines the benchmark regression config for a given benchmark. + source: defines the source of the benchmark data we want to query_params + policy: defines the policy for the benchmark regressions + name: the name of the benchmark + id: the id of the benchmark, this must be unique for each benchmark, and cannot be changed once set + """ + name: str + id: str + source: Source + policy: Policy + + +@dataclass +class BenchmarkRegressionConfigBook: + configs: Dict[str, BenchmarkConfig] = field(default_factory=dict) + + def __getitem__(self, key: str) -> BenchmarkConfig: + config = self.configs.get(key, None) + if not config: + raise KeyError(f"Config {key} not found") + return config From 118a7167fb1edcb9631cf7edffaa3216b5c29f81 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 2 Sep 2025 17:41:44 -0700 Subject: [PATCH 18/27] addid --- .../common/benchmark_time_series_api_model.py | 54 +++++ .../{lib => common}/config.py | 14 +- .../common/config_model.py | 209 +++++++++++++++++ .../lambda_function.py | 211 +++++++++++------- .../lib/benchmark_time_series_api_model.py | 27 --- .../lib/config_model.py | 128 ----------- 6 files changed, 407 insertions(+), 236 deletions(-) create mode 100644 aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py rename aws/lambda/benchmark_regression_summary_report/{lib => common}/config.py (78%) create mode 100644 aws/lambda/benchmark_regression_summary_report/common/config_model.py delete mode 100644 aws/lambda/benchmark_regression_summary_report/lib/benchmark_time_series_api_model.py delete mode 100644 aws/lambda/benchmark_regression_summary_report/lib/config_model.py diff --git a/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py b/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py new file mode 100644 index 0000000000..3a68648dd3 --- /dev/null +++ b/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py @@ -0,0 +1,54 @@ + +from dataclasses import dataclass, field +from typing import Optional, List, Dict, Any +import requests + +@dataclass +class TimeRange: + start: str + end: str + +@dataclass +class BenchmarkTimeSeriesItem: + group_info: Dict[str, Any] + num_of_dp: int + data: List[Dict[str, Any]] = 
field(default_factory=list) + +@dataclass +class BenchmarkTimeSeriesApiData: + time_series: List[BenchmarkTimeSeriesItem] + time_range: TimeRange + +@dataclass +class BenchmarkTimeSeriesApiResponse: + data: BenchmarkTimeSeriesApiData + + @classmethod + def from_request(cls, url: str, query: dict, timeout: int = 60) -> "BenchmarkTimeSeriesApiResponse": + """ + Send a POST request and parse into BenchmarkTimeSeriesApiResponse. + + Args: + url: API endpoint + query: JSON payload must + timeout: max seconds to wait for connect + response (default: 30) + Returns: + ApiResponse + Raises: + requests.exceptions.RequestException if network/timeout/HTTP error + RuntimeError if the API returns an "error" field or malformed data + """ + resp = requests.post(url, json=query, timeout=timeout) + resp.raise_for_status() + payload = resp.json() + + if "error" in payload: + raise RuntimeError(f"API error: {payload['error']}") + + try: + tr = TimeRange(**payload["data"]["time_range"]) + ts = [BenchmarkTimeSeriesItem(**item) for item in payload["data"]["time_series"]] + except Exception as e: + raise RuntimeError(f"Malformed API payload: {e}") + + return cls(data=BenchmarkTimeSeriesApiData(time_series=ts, time_range=tr)) diff --git a/aws/lambda/benchmark_regression_summary_report/lib/config.py b/aws/lambda/benchmark_regression_summary_report/common/config.py similarity index 78% rename from aws/lambda/benchmark_regression_summary_report/lib/config.py rename to aws/lambda/benchmark_regression_summary_report/common/config.py index e5465b0419..b2677b3ce9 100644 --- a/aws/lambda/benchmark_regression_summary_report/lib/config.py +++ b/aws/lambda/benchmark_regression_summary_report/common/config.py @@ -1,14 +1,14 @@ -from lib.config_model import BenchmarkApiSource, BenchmarkConfig, BenchmarkRegressionConfigBook, DayRangeWindow, Frequency, RegressionPolicy, Policy, RangeConfig - - +from common.config_model import BenchmarkApiSource, BenchmarkConfig, BenchmarkRegressionConfigBook, DayRangeWindow, Frequency, RegressionPolicy, Policy, RangeConfig # compiler benchmark regression config +# todo(elainewy): eventually each team should configure their own benchmark regression config, currenlty place here for lambda COMPILER_BENCHMARK_CONFIG = BenchmarkConfig( name="Compiler Benchmark Regression", id = "compiler_regression", source=BenchmarkApiSource( api_query_url="http://localhost:3000/api/benchmark/get_time_series", + type="benchmark_time_series_api", # currently we only detect the regression for h100 with dtype bfloat16, and mode inference # we can extend this to other devices, dtypes and mode in the future api_endpoint_params_template=""" @@ -33,7 +33,7 @@ ), # set baseline from past 7 days using avg, and compare with the last 1 day policy=Policy( - frequency=Frequency(value=7, unit="days"), + frequency=Frequency(value=1, unit="days"), range=RangeConfig( baseline=DayRangeWindow(value=7), comparison=DayRangeWindow(value=1), @@ -45,9 +45,13 @@ name="dynamo_peak_mem",condition="greater_than", threshold=0.9 ), }, + notification_config={ + "type":"github", + "repo":"pytorch/test-infra", + "issue": "7081" + } ), ) - BENCHMARK_REGRESSION_CONFIG = BenchmarkRegressionConfigBook( configs={ "compiler_regression":COMPILER_BENCHMARK_CONFIG, diff --git a/aws/lambda/benchmark_regression_summary_report/common/config_model.py b/aws/lambda/benchmark_regression_summary_report/common/config_model.py new file mode 100644 index 0000000000..ffc09c8980 --- /dev/null +++ b/aws/lambda/benchmark_regression_summary_report/common/config_model.py @@ 
-0,0 +1,209 @@ +from __future__ import annotations +from dataclasses import dataclass, field, fields +from typing import Any, ClassVar, Dict, Literal, Optional, Set, Type, Union +from datetime import datetime, timedelta +from jinja2 import Environment, Template, meta +import requests +import json + + +# -------- Frequency -------- +@dataclass(frozen=True) +class Frequency: + """ + The frequency of how often the report should be generated. + The minimum frequency we support is 1 day. + Attributes: + value: Number of units (e.g., 7 for 7 days). + unit: Unit of time, either "days" or "weeks". + + Methods: + to_timedelta: Convert frequency into a datetime.timedelta. + get_text: return the frequency in text format + """ + value: int + unit: Literal["days", "weeks"] + def to_timedelta(self) -> timedelta: + """Convert frequency N days or M weeks into a datetime.timedelta.""" + if self.unit == "days": + return timedelta(days=self.value) + elif self.unit == "weeks": + return timedelta(weeks=self.value) + else: + raise ValueError(f"Unsupported unit: {self.unit}") + + def get_text(self): + return f"{self.value} {self.unit}" + + +# -------- Source -------- +_JINJA_ENV = Environment(autoescape=False) + +@dataclass +class BenchmarkApiSource: + """ + Defines the source of the benchmark data we want to query + api_query_url: the url of the api to query + api_endpoint_params_template: the jinjia2 template of the api endpoint's query params + default_ctx: the default context to use when rendering the api_endpoint_params_template + """ + api_query_url: str + api_endpoint_params_template: str + type: Literal["benchmark_time_series_api", "other"] = "benchmark_time_series_api" + default_ctx: Dict[str, Any] = field(default_factory=dict) + + def required_template_vars(self) -> set[str]: + ast = _JINJA_ENV.parse(self.api_endpoint_params_template) + return set(meta.find_undeclared_variables(ast)) + + def render(self, ctx: Dict[str, Any], strict: bool = True) -> dict: + """Render with caller-supplied context (no special casing for start/end).""" + merged = {**self.default_ctx, **ctx} + + if strict: + required = self.required_template_vars() + missing = required - merged.keys() + if missing: + raise ValueError(f"Missing required vars: {missing}") + rendered = Template(self.api_endpoint_params_template).render(**merged) + return json.loads(rendered) + + +# -------- Policy: range windows -------- +@dataclass +class DayRangeWindow: + value: int + # raw indicates fetch from the source data + source: Literal["raw"] = "raw" + +@dataclass +class RangeConfig: + """ + Defines the range of baseline and comparison windows for a given policy. 
+ - baseline: the baseline window that build the baseline value + - comparison: the comparison window that we fetch data from to compare against the baseline value + """ + baseline: DayRangeWindow + comparison: DayRangeWindow + + def total_timedelta(self) -> timedelta: + return timedelta(days=self.baseline.value + self.comparison.value) + def comparison_timedelta(self) -> timedelta: + return timedelta(days=self.comparison.value) + def baseline_timedelta(self) -> timedelta: + return timedelta(days=self.baseline.value) + +# -------- Policy: metrics -------- +@dataclass +class RegressionPolicy: + name: str + # Meaning: + # - "greater_than": higher is better; violation if value < baseline * threshold + # - "less_than": lower is better; violation if value > baseline * threshold + # - "equal_to": value should be ~= baseline * threshold within rel_tol + condition: Literal["greater_than", "less_than", "equal_to"] + threshold: float + rel_tol: float = 1e-3 # used only for "equal_to" + + def is_violation(self, value: float, baseline: float) -> bool: + target = baseline * self.threshold + + if self.condition == "greater_than": + # value should be >= target + return value < target + + if self.condition == "less_than": + # value should be <= target + return value > target + + # equal_to: |value - target| should be within rel_tol * max(1, |target|) + denom = max(1.0, abs(target)) + return abs(value - target) > self.rel_tol * denom + +class BaseNotificationConfig: + # every subclass must override this + type_tag: ClassVar[str] + + @classmethod + def from_dict(cls: Type[T], d: Dict[str, Any]) -> T: + # pick only known fields for this dataclass + kwargs = {f.name: d.get(f.name) for f in fields(cls)} + return cls(**kwargs) # type: ignore + + @classmethod + def matches(cls, d: Dict[str, Any]) -> bool: + return d.get("type") == cls.type_tag + + +@dataclass +class GitHubNotificationConfig(BaseNotificationConfig): + type: str = "github" + repo: str = "" + issue_number: str = "" + type_tag: ClassVar[str] = "github" + + def create_github_comment(self, body: str, github_token: str) -> Dict[str, Any]: + """ + Create a new comment on a GitHub issue. + Args: + notification_config: dict with keys: + - type: must be "github" + - repo: "owner/repo" + - issue: issue number (string or int) + body: text of the comment + token: GitHub personal access token or GitHub Actions token + + Returns: + The GitHub API response as a dict (JSON). + """ + url = f"https://api.github.com/repos/{self.repo}/issues/{self.issue_number}/comments" + headers = { + "Authorization": f"token {github_token}", + "Accept": "application/vnd.github+json", + "User-Agent": "bench-reporter/1.0", + } + resp = requests.post(url, headers=headers, json={"body": body}) + resp.raise_for_status() + return resp.json() +@dataclass +class Policy: + frequency: Frequency + range: RangeConfig + metrics: Dict[str, RegressionPolicy] + notification_config: Optional[Dict[str, Any]] = None + + def get_github_notification_config(self) -> Optional[GitHubNotificationConfig]: + if not self.notification_config: + return None + if self.notification_config and self.notification_config.get("type") == "github": + return notification_from_dict(self.notification_config) # type: ignore + + + +# -------- Top-level benchmark regression config -------- +@dataclass +class BenchmarkConfig: + """ + Represents a single benchmark regression configuration. + + - BenchmarkConfig defines the benchmark regression config for a given benchmark. 
+ - source: defines the source of the benchmark data we want to query + - policy: defines the policy for the benchmark regressions + - name: the name of the benchmark + - id: the id of the benchmark, this must be unique for each benchmark, and cannot be changed once set + """ + name: str + id: str + source: BenchmarkApiSource + policy: Policy + + +@dataclass +class BenchmarkRegressionConfigBook: + configs: Dict[str, BenchmarkConfig] = field(default_factory=dict) + + def __getitem__(self, key: str) -> BenchmarkConfig: + config = self.configs.get(key, None) + if not config: + raise KeyError(f"Config {key} not found") + return config diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py index 8b7ae7cd5e..2962e77d0a 100644 --- a/aws/lambda/benchmark_regression_summary_report/lambda_function.py +++ b/aws/lambda/benchmark_regression_summary_report/lambda_function.py @@ -6,16 +6,17 @@ import threading from collections import defaultdict from concurrent.futures import as_completed, ThreadPoolExecutor -from datetime import datetime, timedelta, timezone -from lib.config import BENCHMARK_REGRESSION_CONFIG +import datetime as dt +from common.benchmark_time_series_api_model import BenchmarkTimeSeriesApiData, BenchmarkTimeSeriesApiResponse, TimeRange +from common.config_model import BenchmarkApiSource, BenchmarkConfig, Frequency, Policy, RangeConfig +from common.config import BENCHMARK_REGRESSION_CONFIG from jinja2 import Template +import requests +from dateutil.parser import isoparse -# Local imports from typing import Any, Dict, Iterable, List, Optional, Set - import clickhouse_connect import yaml -from dateutil.parser import parse from github import Auth, Github @@ -32,6 +33,11 @@ "CLICKHOUSE_USERNAME": os.getenv("CLICKHOUSE_USERNAME", ""), } +BENMARK_REGRESSION_REPORT_DB="fortesting.benchmark_regression_report" + +def truncate_to_hour(ts: dt.datetime) -> dt.datetime: + return ts.replace(minute=0, second=0, microsecond=0) + def get_clickhouse_client( host: str, user: str, password: str @@ -67,9 +73,8 @@ def is_unix_timestamp(value: str) -> bool: return False return False - -def to_timestap_str(time: datetime) -> str: - return str(int(time.timestamp())) +def to_hour_str(ts: dt.datetime) -> str: + return truncate_to_hour(ts).isoformat().replace("+00:00", "Z") def write_to_file(data: Any, filename="", path=""): """ @@ -98,20 +103,14 @@ def write_to_file(data: Any, filename="", path=""): BENCHMARK_REGRESSION_SUMMARY_REPORT_TABLE = "benchmark_regression_summary_report" - -def get_runtime_config(name: str, start: datetime, end: datetime): - +def get_config(config_id: str)-> BenchmarkConfig: try: - config = BENCHMARK_REGRESSION_CONFIG[name] - tmpl = Template(config.source.api_endpoint_params_template) - rendered = tmpl.render( - startTime=start.isoformat(timespec="milliseconds") + "Z", - stopTime=end.isoformat(timespec="milliseconds") + "Z", - ) - cfg = dict(config) # shallow copy - cfg["api_endpoint_params"] = json.loads(rendered) - return cfg - + config: BenchmarkConfig = BENCHMARK_REGRESSION_CONFIG[config_id] + except KeyError: + raise ValueError(f"Invalid config id: {config_id}") + except Exception as e: + raise e + return config class BenchmarkSummaryProcessor: """ @@ -120,18 +119,47 @@ class BenchmarkSummaryProcessor: def __init__( self, is_dry_run: bool = False, - local_output: bool = False, - output_snapshot_file_name: str = "summary_report_snapshot", - output_snapshot_file_path: str = "", ) -> None: 
self.is_dry_run = is_dry_run + def should_generate_report( + self, + cc: clickhouse_connect.driver.client.Client, + end_time: dt.datetime, + config_id: str, + f: Frequency + ) -> bool: + """ + decide wether should generate the report based on the frequency in policy + """ + def get_latest_regression_report( + cc: clickhouse_connect.driver.Client, + config_id: str, + ): + result = cc.query( + "SELECT max(report_date) FROM benchmark_regression_report WHERE report_id = {config_id:String}", + parameters={"config_id": config_id}, + ) + if not result.result_rows or result.result_rows[0][0] is None: + return None + return result.result_rows[0][0] + freq_delta = f.to_timedelta() + latest_date = get_latest_regression_report(cc, config_id) + # No report exists yet, generate + if not latest_date: + return True + # we only verify by date to see if we should generate the data + cutoff = end_time.date() - freq_delta + return latest_date < cutoff + + def process( self, - config: Dict[str, Any], + config_id: str, + end_time: dt.datetime, cc: Optional[clickhouse_connect.driver.client.Client] = None, args: Optional[argparse.Namespace] = None, - ) -> Dict[str, Any]: + ): # ensure each thread has its own clickhouse client. clickhouse client # is not thread-safe. if cc is None: @@ -146,50 +174,41 @@ def process( else: tlocal.cc = get_clickhouse_client_environment() cc = tlocal.cc - - # fetches config to get time series from api - - queued_jobs = self._fetch_snapshot_from_db(cc, start_time, end_time, repo) - - if len(queued_jobs) == 0: - logger.info( - f" [QueueTimeProcessor][Snapshot {to_timestap_str(end_time)}] " - + f"No jobs in queue in time range: [{start_time},{end_time}]" - ) - - if len(queued_jobs) == 0: - logger.info( - f" [QueueTimeProcessor][Snapshot {to_timestap_str(end_time)}] " - + "No queued jobs, skipping generating histogram records.." - ) - - records = QueuedJobHistogramGenerator().generate_histogram_records( - queued_jobs, - datetime.now(timezone.utc), - "half-hour-mark-queue-time-histogram", - end_time, + config = get_config(config_id) + + # check if we should generate report for end_time + # currently we only verify if end_time > latest report date + policy.freq in db + report_freq = config.policy.frequency + should_generate = self.should_generate_report(cc, end_time,config_id,report_freq) + if not should_generate: + logger.info("[%s] Skip generate report for date: %s with frequency %s",config_id, end_time.date(), report_freq.get_text()) + return; + data_range = config.policy.range + total_timedelta = data_range.baseline_timedelta + logger.info("[%s] fetching benchmark data from source",config_id) + + if config.source.type!="benchmark_time_series_api": + logger.error(f"{config_id}: currently we only suppport benchmark_time_series_api to fetch source data") + return; + + # Comparison: [end_time - 1d, end_time) + comp_s = end_time - data_range.comparison_timedelta() + comp_e = end_time + comparison_data = self._fetch_from_benchmark_ts_api( + config_id=config_id, + start_time=baseline_s, + end_time=baseline_e, + source=config.source, ) - if len(records) == 0: - logger.info( - f" [QueueTimeProcessor][Snapshot {to_timestap_str(end_time)}] " - + "No histogram records, skipping writing.." - ) - - if self.is_dry_run: - logger.info( - f" [Dry Run Mode][Snapshot {to_timestap_str(end_time)}] " - + "Writing results to terminal/local file ..." 
- ) - self._output_record(queued_jobs, end_time, type="queued_jobs") - self._output_record(records, end_time, type="records") - logger.info( - f" [Dry Run Mode][Snapshot {to_timestap_str(end_time)}] " - + "Done. Write results to terminal/local file ." - ) - else: - self._write_to_db_table(cc, records) + data = self._fetch_from_benchmark_ts_api(config_id, end_time, start_time, config.source) + latest_ts = data.time_range.end + # no data in the time range + if not latest_ts: + logger.info("[%s] No data found for report %s",config_id, end_time.date()) + return + regression_policy = config.policy.metrics return { "start_time": to_timestap_str(start_time), @@ -198,6 +217,54 @@ def process( "records_count": len(records), } + def get_basline(self, config: BenchmarkConfig,end_time: dt.datetime): + data_range = config.policy.range + baseline_s = end_time - data_range.total_timedelta() + baseline_e = end_time - data_range.comparison_timedelta() + + # fetch baseline from api + raw_data = self._fetch_from_benchmark_ts_api( + config_id=config.id, + start_time=baseline_s, + end_time=baseline_e, + source=config.source, + ) + + def to_baseline(data:BenchmarkTimeSeriesApiData): + data. + + + + + + + + + + def _detect_regression(self,end_time: dt.datetime, data: BenchmarkTimeSeriesApiData, policy: Policy): + metrics_dict = policy.metrics + baseline_range = policy.range.baseline_timedelta() + comparison = policy.range.comparison_timedelta() + + + + + return + def _fetch_from_benchmark_ts_api(self,config_id:str, end_time: dt.datetime,start_time:dt.datetime, source: BenchmarkApiSource): + str_end_time = end_time.isoformat() + str_start_time = start_time.isoformat() + query = source.render(ctx={ + "startTime": str_start_time, + "endTime": str_end_time, + }) + url = source.api_query_url + try: + resp:BenchmarkTimeSeriesApiResponse = BenchmarkTimeSeriesApiResponse.from_request(url, query) + + return resp.data + except Exception as e: + raise RuntimeError(f"[{config_id}]Fetch failed:", e) + class WorkerPoolHandler: @@ -250,19 +317,14 @@ def main( args: Optional[argparse.Namespace] = None, github_access_token: str = "", is_dry_run: bool = False, - local_output: bool = False, - output_snapshot_file_name: str = "job_queue_times_snapshot", - output_snapshot_file_path: str = "", ): """ Main method to run in both local environment and lambda handler. 1. generate intervals[start_time,end_time] using latest timestamp from source table and target table 2. call WorkerPoolHandler to geneterate and write histogram data for each interval in parallel """ - # gets config retrievers, this is used to generate runner labels for histgram if not github_access_token: raise ValueError("Missing environment variable GITHUB_ACCESS_TOKEN") - config_retrievers = get_config_retrievers(github_access_token) # get time intervals. 
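To make the fetch-and-evaluate flow above concrete, a small sketch follows; it assumes it is run from the lambda directory so that `common` is importable, and the dates are illustrative:

    import datetime as dt

    from common.config import BENCHMARK_REGRESSION_CONFIG

    cfg = BENCHMARK_REGRESSION_CONFIG["compiler_regression"]
    end = dt.datetime(2025, 9, 2, tzinfo=dt.timezone.utc)
    start = end - cfg.policy.range.comparison_timedelta()

    # render() merges default_ctx with the caller's ctx, verifies (strict=True) that every
    # Jinja variable in the template is supplied, and json-loads the rendered payload; the
    # compiler template in common/config.py expects the keys startTime and stopTime.
    query = cfg.source.render(
        {"startTime": start.isoformat(), "stopTime": end.isoformat()}
    )

    # Each metric is then judged against its baseline. With condition="greater_than" and
    # threshold=0.9, a passrate of 0.80 against a baseline of 0.95 is a violation,
    # since 0.80 < 0.95 * 0.9.
    policy = cfg.policy.metrics["passrate"]
    assert policy.is_violation(value=0.80, baseline=0.95)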
logger.info(" [Main] generating time intervals ....") @@ -279,10 +341,7 @@ def main( handler = WorkerPoolHandler( config_retrievers, BenchmarkSummaryProcessor( - is_dry_run=is_dry_run, - local_output=local_output, - output_snapshot_file_name=output_snapshot_file_name, - output_snapshot_file_path=output_snapshot_file_path, + is_dry_run=is_dry_run ), ) handler.start(time_intervals, args) diff --git a/aws/lambda/benchmark_regression_summary_report/lib/benchmark_time_series_api_model.py b/aws/lambda/benchmark_regression_summary_report/lib/benchmark_time_series_api_model.py deleted file mode 100644 index 93b8131a3a..0000000000 --- a/aws/lambda/benchmark_regression_summary_report/lib/benchmark_time_series_api_model.py +++ /dev/null @@ -1,27 +0,0 @@ - -from dataclasses import dataclass, field -from os import error -from typing import Any, Dict, List, Optional - - -@dataclass -class TimeRange: - start: str - end: str - -@dataclass -class TimeSeriesItem: - group_info: Dict[str, Any] # flexible, could make a stricter dataclass if schema is known - num_of_dp: int - data: List[Dict[str, Any]] = field(default_factory=list) - -@dataclass -class ApiData: - time_series: List[TimeSeriesItem] - time_range: TimeRange - - -@dataclass -class ApiResponse: - data: Optional[ApiData] = None # present if success - error: Optional[str] = None # present if failure diff --git a/aws/lambda/benchmark_regression_summary_report/lib/config_model.py b/aws/lambda/benchmark_regression_summary_report/lib/config_model.py deleted file mode 100644 index 774a136ed6..0000000000 --- a/aws/lambda/benchmark_regression_summary_report/lib/config_model.py +++ /dev/null @@ -1,128 +0,0 @@ -from __future__ import annotations -from dataclasses import dataclass, field -from typing import Any, Dict, Literal, Optional, Set -from datetime import datetime, timedelta -from jinja2 import Environment, Template, meta -import json - - -# -------- Frequency -------- -@dataclass -class Frequency: - value: int - unit: Literal["days", "weeks"] - - def to_timedelta(self) -> timedelta: - """Convert frequency into a datetime.timedelta.""" - if self.unit == "days": - return timedelta(days=self.value) - elif self.unit == "weeks": - return timedelta(weeks=self.value) - else: - raise ValueError(f"Unsupported unit: {self.unit}") - - -# -------- Source -------- -_JINJA_ENV = Environment(autoescape=False) - -@dataclass -class BenchmarkApiSource: - api_query_url: str - api_endpoint_params_template: str - default_ctx: Dict[str, Any] = field(default_factory=dict) - - def required_template_vars(self) -> set[str]: - ast = _JINJA_ENV.parse(self.api_endpoint_params_template) - return set(meta.find_undeclared_variables(ast)) - - def render(self, ctx: Dict[str, Any], strict: bool = True) -> dict: - """Render with caller-supplied context (no special casing for start/end).""" - merged = {**self.default_ctx, **ctx} - - if strict: - required = self.required_template_vars() - missing = required - merged.keys() - if missing: - raise ValueError(f"Missing required vars: {missing}") - rendered = Template(self.api_endpoint_params_template).render(**merged) - return json.loads(rendered) - - -# -------- Policy: range windows -------- -@dataclass -class DayRangeWindow: - value: int - # raw indicates fetch from the source data - source: Literal["raw"] = "raw" - - - - - - - -@dataclass -class RangeConfig: - baseline: RangeWindow - comparison: RangeWindow - - -# -------- Policy: metrics -------- -@dataclass -class RegressionPolicy: - name: str - # Meaning: - # - "greater_than": higher is 
better; violation if value < baseline * threshold - # - "less_than": lower is better; violation if value > baseline * threshold - # - "equal_to": value should be ~= baseline * threshold within rel_tol - condition: Literal["greater_than", "less_than", "equal_to"] - threshold: float - rel_tol: float = 1e-3 # used only for "equal_to" - - def is_violation(self, value: float, baseline: float) -> bool: - target = baseline * self.threshold - - if self.condition == "greater_than": - # value should be >= target - return value < target - - if self.condition == "less_than": - # value should be <= target - return value > target - - # equal_to: |value - target| should be within rel_tol * max(1, |target|) - denom = max(1.0, abs(target)) - return abs(value - target) > self.rel_tol * denom - -@dataclass -class Policy: - frequency: Frequency - range: RangeConfig - metrics: Dict[str, RegressionPolicy] - - -# -------- Top-level benchmark regression config -------- -@dataclass -class BenchmarkConfig: - """ - BenchmarkConfig defines the benchmark regression config for a given benchmark. - source: defines the source of the benchmark data we want to query_params - policy: defines the policy for the benchmark regressions - name: the name of the benchmark - id: the id of the benchmark, this must be unique for each benchmark, and cannot be changed once set - """ - name: str - id: str - source: Source - policy: Policy - - -@dataclass -class BenchmarkRegressionConfigBook: - configs: Dict[str, BenchmarkConfig] = field(default_factory=dict) - - def __getitem__(self, key: str) -> BenchmarkConfig: - config = self.configs.get(key, None) - if not config: - raise KeyError(f"Config {key} not found") - return config From f7ed34b16d42978e1e8742672a3739ba7dfd5e93 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 2 Sep 2025 22:08:44 -0700 Subject: [PATCH 19/27] addid --- .../common/benchmark_time_series_api_model.py | 16 +- .../common/config.py | 76 +++--- .../common/config_model.py | 9 +- .../common/regression_utils.py | 205 +++++++++++++++ .../lambda_function.py | 248 +++++++++++------- torchci/lib/clickhouse.ts | 2 +- 6 files changed, 418 insertions(+), 138 deletions(-) create mode 100644 aws/lambda/benchmark_regression_summary_report/common/regression_utils.py diff --git a/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py b/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py index 3a68648dd3..2fa8960013 100644 --- a/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py +++ b/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py @@ -1,30 +1,35 @@ - from dataclasses import dataclass, field from typing import Optional, List, Dict, Any import requests + @dataclass class TimeRange: start: str end: str + @dataclass class BenchmarkTimeSeriesItem: group_info: Dict[str, Any] num_of_dp: int data: List[Dict[str, Any]] = field(default_factory=list) + @dataclass class BenchmarkTimeSeriesApiData: time_series: List[BenchmarkTimeSeriesItem] time_range: TimeRange + @dataclass class BenchmarkTimeSeriesApiResponse: data: BenchmarkTimeSeriesApiData @classmethod - def from_request(cls, url: str, query: dict, timeout: int = 60) -> "BenchmarkTimeSeriesApiResponse": + def from_request( + cls, url: str, query: dict, timeout: int = 60 + ) -> "BenchmarkTimeSeriesApiResponse": """ Send a POST request and parse into BenchmarkTimeSeriesApiResponse. 
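A minimal usage sketch of the wrapper being reformatted here; the URL mirrors the api_query_url in common/config.py and the trimmed-down payload stands in for the dict normally produced by BenchmarkApiSource.render(), so both are assumptions for the example rather than values fixed by this patch:

    from common.benchmark_time_series_api_model import BenchmarkTimeSeriesApiResponse

    query = {
        "name": "compiler_precompute",
        "query_params": {
            "startTime": "2025-09-01T00:00:00.000Z",
            "stopTime": "2025-09-02T00:00:00.000Z",
        },
    }
    resp = BenchmarkTimeSeriesApiResponse.from_request(
        "http://localhost:3000/api/benchmark/get_time_series", query
    )
    print(resp.data.time_range.start, resp.data.time_range.end)
    for item in resp.data.time_series:
        # group_info carries keys such as metric, compiler, and suite for each series
        print(item.group_info.get("metric"), item.num_of_dp)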
@@ -44,11 +49,12 @@ def from_request(cls, url: str, query: dict, timeout: int = 60) -> "BenchmarkTim if "error" in payload: raise RuntimeError(f"API error: {payload['error']}") - try: tr = TimeRange(**payload["data"]["time_range"]) - ts = [BenchmarkTimeSeriesItem(**item) for item in payload["data"]["time_series"]] + ts = [ + BenchmarkTimeSeriesItem(**item) + for item in payload["data"]["time_series"] + ] except Exception as e: raise RuntimeError(f"Malformed API payload: {e}") - return cls(data=BenchmarkTimeSeriesApiData(time_series=ts, time_range=tr)) diff --git a/aws/lambda/benchmark_regression_summary_report/common/config.py b/aws/lambda/benchmark_regression_summary_report/common/config.py index b2677b3ce9..18831f6070 100644 --- a/aws/lambda/benchmark_regression_summary_report/common/config.py +++ b/aws/lambda/benchmark_regression_summary_report/common/config.py @@ -1,17 +1,25 @@ +from common.config_model import ( + BenchmarkApiSource, + BenchmarkConfig, + BenchmarkRegressionConfigBook, + DayRangeWindow, + Frequency, + RegressionPolicy, + Policy, + RangeConfig, +) - -from common.config_model import BenchmarkApiSource, BenchmarkConfig, BenchmarkRegressionConfigBook, DayRangeWindow, Frequency, RegressionPolicy, Policy, RangeConfig # compiler benchmark regression config # todo(elainewy): eventually each team should configure their own benchmark regression config, currenlty place here for lambda COMPILER_BENCHMARK_CONFIG = BenchmarkConfig( - name="Compiler Benchmark Regression", - id = "compiler_regression", - source=BenchmarkApiSource( - api_query_url="http://localhost:3000/api/benchmark/get_time_series", - type="benchmark_time_series_api", - # currently we only detect the regression for h100 with dtype bfloat16, and mode inference - # we can extend this to other devices, dtypes and mode in the future - api_endpoint_params_template=""" + name="Compiler Benchmark Regression", + id="compiler_regression", + source=BenchmarkApiSource( + api_query_url="http://localhost:3000/api/benchmark/get_time_series", + type="benchmark_time_series_api", + # currently we only detect the regression for h100 with dtype bfloat16, and mode inference + # we can extend this to other devices, dtypes and mode in the future + api_endpoint_params_template=""" { "name": "compiler_precompute", "query_params": { @@ -29,31 +37,35 @@ "branches": ["main"] } } - """ + """, + ), + # set baseline from past 7 days using avg, and compare with the last 1 day + policy=Policy( + frequency=Frequency(value=1, unit="days"), + range=RangeConfig( + baseline=DayRangeWindow(value=7), + comparison=DayRangeWindow(value=1), + ), + metrics={ + "passrate": RegressionPolicy( + name="passrate", condition="greater_than", threshold=0.9 ), - # set baseline from past 7 days using avg, and compare with the last 1 day - policy=Policy( - frequency=Frequency(value=1, unit="days"), - range=RangeConfig( - baseline=DayRangeWindow(value=7), - comparison=DayRangeWindow(value=1), - ), - metrics={ - "passrate": RegressionPolicy(name="passrate",condition="greater_than", threshold=0.9), - "geomean": RegressionPolicy(name="geomean",condition="greater_than", threshold=0.95), - "dynamo_peak_mem": RegressionPolicy( - name="dynamo_peak_mem",condition="greater_than", threshold=0.9 - ), - }, - notification_config={ - "type":"github", - "repo":"pytorch/test-infra", - "issue": "7081" - } + "geomean": RegressionPolicy( + name="geomean", condition="greater_than", threshold=0.95 ), - ) + "dynamo_peak_mem": RegressionPolicy( + name="dynamo_peak_mem", condition="greater_than", 
threshold=0.9 + ), + }, + notification_config={ + "type": "github", + "repo": "pytorch/test-infra", + "issue": "7081", + }, + ), +) BENCHMARK_REGRESSION_CONFIG = BenchmarkRegressionConfigBook( configs={ - "compiler_regression":COMPILER_BENCHMARK_CONFIG, + "compiler_regression": COMPILER_BENCHMARK_CONFIG, } ) diff --git a/aws/lambda/benchmark_regression_summary_report/common/config_model.py b/aws/lambda/benchmark_regression_summary_report/common/config_model.py index ffc09c8980..dd892ccc45 100644 --- a/aws/lambda/benchmark_regression_summary_report/common/config_model.py +++ b/aws/lambda/benchmark_regression_summary_report/common/config_model.py @@ -96,11 +96,12 @@ def baseline_timedelta(self) -> timedelta: # -------- Policy: metrics -------- @dataclass class RegressionPolicy: + """ + - "greater_than": higher is better; violation if value < baseline * threshold + - "less_than": lower is better; violation if value > baseline * threshold + - "equal_to": value should be ~= baseline * threshold within rel_tol + """ name: str - # Meaning: - # - "greater_than": higher is better; violation if value < baseline * threshold - # - "less_than": lower is better; violation if value > baseline * threshold - # - "equal_to": value should be ~= baseline * threshold within rel_tol condition: Literal["greater_than", "less_than", "equal_to"] threshold: float rel_tol: float = 1e-3 # used only for "equal_to" diff --git a/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py new file mode 100644 index 0000000000..008986e4c5 --- /dev/null +++ b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py @@ -0,0 +1,205 @@ +import logging +import math +from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict +import statistics +from dateutil.parser import isoparse +from common.config_model import RegressionPolicy +from common.benchmark_time_series_api_model import ( + BenchmarkTimeSeriesApiData, + BenchmarkTimeSeriesItem, +) + +RegressionClassifyLabel = Literal[ + "regression", "suspicious", "no_regression", "insufficient_data" +] + + +class BaselineItem(TypedDict): + group_info: Dict[str, Any] + value: float + + +class LatestItem(TypedDict): + group_info: Dict[str, Any] + values: List[float] + + +def to_latest_data_map( + data: BenchmarkTimeSeriesApiData, field="value" +) -> Dict[tuple, LatestItem]: + result = {} + for ts_group in data.time_series: + group_keys = tuple(sorted(ts_group.group_info.items())) + values = [ + float(d[field]) + for d in sorted( + ts_group.data, + key=lambda d: isoparse(d["granularity_bucket"]), # convert to datetime + ) + if field in d + ] + result[group_keys] = { + "group_info": ts_group.group_info, + "values": values, + } + return result + + +def to_baseline_map( + baseline: BenchmarkTimeSeriesApiData, + mode: str = "mean", + field: str = "value", +) -> Dict[tuple, BaselineItem]: + """ + return + { + group_key[tuple]: { + "group_info": {...}, + "baseline": float + } + } + """ + result = {} + for ts_group in baseline.time_series: + group_keys = tuple(sorted(ts_group.group_info.items())) + values = [float(d[field]) for d in ts_group.data if field in d] + if not values: + continue + + if mode == "mean": + val = statistics.fmean(values) + elif mode == "p90": + val = percentile(values, 0.9) + else: + raise ValueError("mode must be 'mean' or 'p90'") + + result[group_keys] = { + "group_info": ts_group.group_info, + "baseline": val, + } + return result + + +def 
classify_flags(flags: list[bool], min_points: int = 3) -> RegressionClassifyLabel: + """ + Classify a sequence of boolean flags to detect regression. + + - regression: last run has >= 2 consecutive True values + - suspicious: there is a run of >= 3 consecutive True values, but not at the end + - no_regression: all other cases + - insufficient_data: not enough data points (< min_points) + + Special case: + - If min_points == 1, then just look at the last flag: + True -> regression + False -> no_regression + """ + n = len(flags) + if n == 0: + return "insufficient_data" + + if min_points == 1: + return "regression" if flags[-1] else "no_regression" + + if n < min_points: + return "insufficient_data" + + # trailing run length + t = 0 + for v in reversed(flags): + if v: + t += 1 + else: + break + if t >= 2: + return "regression" + + # longest run anywhere + longest = cur = 0 + for v in flags: + cur = cur + 1 if v else 0 + longest = max(longest, cur) + + if longest >= 3: + return "suspicious" + + return "no_regression" + + +def percentile(values, q: float): + if not values: + return None + v = sorted(values) + k = (len(v) - 1) * q + f = math.floor(k) + c = math.ceil(k) + if f == c: + return v[int(k)] + return v[f] + (v[c] - v[f]) * (k - f) + + +def _resolve_policy( + metric_policies: Dict[str, RegressionPolicy], + metric: str, +) -> Optional[RegressionPolicy]: + if not metric: + return None + m = metric.lower() + return metric_policies.get(m) + + +def detect_regressions_with_policies( + baseline_map: Dict[tuple, BaselineItem], + latest_map: Dict[tuple, LatestItem], + *, + metric_policies: Dict[str, RegressionPolicy], + min_points: int = 2, +) -> Tuple[List[Dict[str, Any]], bool]: + """ + For each group: + - choose policy by group_info['metric'] + - compute flags via policy.is_violation(value, baseline) + - classify with classify_flags + Returns a list of {group_info, baseline, values, flags, label, policy} + """ + results: List[Dict[str, Any]] = [] + + is_any_regression = False + + for key in sorted(latest_map.keys()): + cur_item = latest_map.get(key) + gi = cur_item["group_info"] if cur_item else {} + latest_vals = cur_item["values"] if cur_item else [] + policy = _resolve_policy(metric_policies, gi.get("metric", "")) + if not policy: + logging.warning( + f"no policy for metric %s with group_info=%s", gi.get("metric", ""), gi + ) + continue + + base_item = baseline_map.get(key) + baseline_value = base_item.get("value") if base_item else None + if not base_item or not baseline_value: + logging.warning( + f"no baseline for metric %s with group_info=%s", + gi.get("metric", ""), + gi, + ) + continue + + # Per-point violations (True = regression) + flags = [policy.is_violation(v, baseline_value) for v in latest_vals] + label = classify_flags(flags, min_points=min_points) + results.append( + { + "group_info": gi, + "baseline": baseline_value, + "values": latest_vals, + "flags": flags, + "label": label, + "policy": policy, + } + ) + if label == "regression": + is_any_regression = True + return results, is_any_regression diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py index 2962e77d0a..0c112156a8 100644 --- a/aws/lambda/benchmark_regression_summary_report/lambda_function.py +++ b/aws/lambda/benchmark_regression_summary_report/lambda_function.py @@ -7,8 +7,24 @@ from collections import defaultdict from concurrent.futures import as_completed, ThreadPoolExecutor import datetime as dt -from 
common.benchmark_time_series_api_model import BenchmarkTimeSeriesApiData, BenchmarkTimeSeriesApiResponse, TimeRange -from common.config_model import BenchmarkApiSource, BenchmarkConfig, Frequency, Policy, RangeConfig +from common.regression_utils import ( + detect_regressions_with_policies, + to_baseline_map, + to_latest_data_map, + to_time_series_item_map, +) +from common.benchmark_time_series_api_model import ( + BenchmarkTimeSeriesApiData, + BenchmarkTimeSeriesApiResponse, + TimeRange, +) +from common.config_model import ( + BenchmarkApiSource, + BenchmarkConfig, + Frequency, + Policy, + RangeConfig, +) from common.config import BENCHMARK_REGRESSION_CONFIG from jinja2 import Template import requests @@ -33,7 +49,8 @@ "CLICKHOUSE_USERNAME": os.getenv("CLICKHOUSE_USERNAME", ""), } -BENMARK_REGRESSION_REPORT_DB="fortesting.benchmark_regression_report" +BENMARK_REGRESSION_REPORT_DB = "fortesting.benchmark_regression_report" + def truncate_to_hour(ts: dt.datetime) -> dt.datetime: return ts.replace(minute=0, second=0, microsecond=0) @@ -61,21 +78,6 @@ def get_clickhouse_client_environment() -> clickhouse_connect.driver.client.Clie ) -def is_unix_timestamp(value: str) -> bool: - """Check if the string is a valid Unix timestamp.""" - if value.isdigit(): # Ensure it's numeric - try: - timestamp = int(value) - # Check if it's within a reasonable range (1970 to 2100) - datetime.fromtimestamp(timestamp) - return True - except (ValueError, OSError): - return False - return False - -def to_hour_str(ts: dt.datetime) -> str: - return truncate_to_hour(ts).isoformat().replace("+00:00", "Z") - def write_to_file(data: Any, filename="", path=""): """ Writes data to a specified file. If no path is provided, writes to the current directory. @@ -101,9 +103,11 @@ def write_to_file(data: Any, filename="", path=""): file.write(data) logger.info(f"File written to: {os.path.abspath(file_path)}") + BENCHMARK_REGRESSION_SUMMARY_REPORT_TABLE = "benchmark_regression_summary_report" -def get_config(config_id: str)-> BenchmarkConfig: + +def get_config(config_id: str) -> BenchmarkConfig: try: config: BenchmarkConfig = BENCHMARK_REGRESSION_CONFIG[config_id] except KeyError: @@ -112,9 +116,9 @@ def get_config(config_id: str)-> BenchmarkConfig: raise e return config + class BenchmarkSummaryProcessor: - """ - """ + """ """ def __init__( self, @@ -127,31 +131,48 @@ def should_generate_report( cc: clickhouse_connect.driver.client.Client, end_time: dt.datetime, config_id: str, - f: Frequency + f: Frequency, ) -> bool: """ - decide wether should generate the report based on the frequency in policy + decide wether should generate the report based on the frequency in policy """ - def get_latest_regression_report( + + def _get_latest_record_ts( cc: clickhouse_connect.driver.Client, config_id: str, - ): - result = cc.query( - "SELECT max(report_date) FROM benchmark_regression_report WHERE report_id = {config_id:String}", + ) -> Optional[dt.datetime]: + res = cc.query( + """ + SELECT max(last_record_ts) + FROM benchmark_regression_report + WHERE report_id = {config_id:String} + """, parameters={"config_id": config_id}, ) - if not result.result_rows or result.result_rows[0][0] is None: + if not res.result_rows or res.result_rows[0][0] is None: return None - return result.result_rows[0][0] + latest: dt.datetime = res.result_rows[0][ + 0 + ] # typically tz-aware UTC from clickhouse_connect + # If not tz-aware, force UTC: + if latest.tzinfo is None: + latest = latest.replace(tzinfo=dt.timezone.utc) + return latest + freq_delta = 
f.to_timedelta() - latest_date = get_latest_regression_report(cc, config_id) + latest_record_ts = _get_latest_record_ts(cc, config_id) + # No report exists yet, generate - if not latest_date: + if not latest_record_ts: return True - # we only verify by date to see if we should generate the data - cutoff = end_time.date() - freq_delta - return latest_date < cutoff + end_utc = ( + end_time if end_time.tzinfo else end_time.replace(tzinfo=dt.timezone.utc) + ) + end_utc = end_utc.astimezone(dt.timezone.utc) + + cutoff = end_time - freq_delta + return latest_record_ts < cutoff def process( self, @@ -176,39 +197,34 @@ def process( cc = tlocal.cc config = get_config(config_id) - # check if we should generate report for end_time - # currently we only verify if end_time > latest report date + policy.freq in db + # check if the current time is > policy's time_delta + previous record_ts from summary_table report_freq = config.policy.frequency - should_generate = self.should_generate_report(cc, end_time,config_id,report_freq) - if not should_generate: - logger.info("[%s] Skip generate report for date: %s with frequency %s",config_id, end_time.date(), report_freq.get_text()) - return; - data_range = config.policy.range - total_timedelta = data_range.baseline_timedelta - logger.info("[%s] fetching benchmark data from source",config_id) - - if config.source.type!="benchmark_time_series_api": - logger.error(f"{config_id}: currently we only suppport benchmark_time_series_api to fetch source data") - return; - - # Comparison: [end_time - 1d, end_time) - comp_s = end_time - data_range.comparison_timedelta() - comp_e = end_time - comparison_data = self._fetch_from_benchmark_ts_api( - config_id=config_id, - start_time=baseline_s, - end_time=baseline_e, - source=config.source, + should_generate = self.should_generate_report( + cc, end_time, config_id, report_freq ) + if not should_generate: + logger.info( + "[%s] Skip generate report for date: %s with frequency %s", + config_id, + end_time.isoformat(), + report_freq.get_text(), + ) + return - data = self._fetch_from_benchmark_ts_api(config_id, end_time, start_time, config.source) - latest_ts = data.time_range.end - # no data in the time range - if not latest_ts: - logger.info("[%s] No data found for report %s",config_id, end_time.date()) + latest = self.get_latest(config, end_time) + if not latest: return - regression_policy = config.policy.metrics + latest_map = to_latest_data_map(latest) + baseline = self.get_basline(config, end_time) + if not baseline: + return + baseline_map = to_baseline_map(baseline) + detect_regressions_with_policies( + baseline_map=baseline_map, + latest_map=latest_map, + metric_policies=config.policy.metrics, + ) return { "start_time": to_timestap_str(start_time), @@ -217,11 +233,42 @@ def process( "records_count": len(records), } - def get_basline(self, config: BenchmarkConfig,end_time: dt.datetime): + def get_latest(self, config: BenchmarkConfig, end_time: dt.datetime): data_range = config.policy.range - baseline_s = end_time - data_range.total_timedelta() - baseline_e = end_time - data_range.comparison_timedelta() + latest_s = end_time - data_range.comparison_timedelta() + latest_e = end_time + latest_data = self._fetch_from_benchmark_ts_api( + config_id=config.id, + start_time=latest_s, + end_time=latest_e, + source=config.source, + ) + if not latest_data.time_range or latest_data.time_range.end: + logger.info( + "[%s] Skip generate report for date:" + "%s with frequency %s, no data found during [%s,%s]", + config.id, + 
latest_s.isoformat(), + latest_e.isoformat(), + ) + return None + + if not self.should_use_data(latest_data.time_range.end, end_time): + logger.info( + "[%s] Skip generate report for date: trying to get_basline" + " with frequency %s, but no data found during for [%s,%s]", + config.id, + config.policy.frequency.get_text(), + latest_s.isoformat(), + latest_e.isoformat(), + ) + return None + return latest_data + def get_basline(self, config: BenchmarkConfig, end_time: dt.datetime): + data_range = config.policy.range + baseline_s = end_time - data_range.total_timedelta() + baseline_e = end_time - data_range.comparison_timedelta() # fetch baseline from api raw_data = self._fetch_from_benchmark_ts_api( config_id=config.id, @@ -229,44 +276,54 @@ def get_basline(self, config: BenchmarkConfig,end_time: dt.datetime): end_time=baseline_e, source=config.source, ) + if not self.should_use_data(raw_data.time_range.end, end_time): + logger.info( + "[%s][get_basline] Skip generate report, no data found during [%s,%s]", + config.id, + baseline_s.isoformat(), + baseline_e.isoformat(), + ) + return None + return raw_data - def to_baseline(data:BenchmarkTimeSeriesApiData): - data. - - - - - - - - - - def _detect_regression(self,end_time: dt.datetime, data: BenchmarkTimeSeriesApiData, policy: Policy): - metrics_dict = policy.metrics - baseline_range = policy.range.baseline_timedelta() - comparison = policy.range.comparison_timedelta() - - - + def should_use_data( + self, + latest_ts_str: str, + end_time: dt.datetime, + min_delta: dt.timedelta = dt.timedelta(days=2), + ) -> bool: + if not latest_ts_str: + return False + latest_dt = isoparse(latest_ts_str) + cutoff = end_time - min_delta + return latest_dt >= cutoff - return - def _fetch_from_benchmark_ts_api(self,config_id:str, end_time: dt.datetime,start_time:dt.datetime, source: BenchmarkApiSource): + def _fetch_from_benchmark_ts_api( + self, + config_id: str, + end_time: dt.datetime, + start_time: dt.datetime, + source: BenchmarkApiSource, + ): str_end_time = end_time.isoformat() str_start_time = start_time.isoformat() - query = source.render(ctx={ - "startTime": str_start_time, - "endTime": str_end_time, - }) + query = source.render( + ctx={ + "startTime": str_start_time, + "endTime": str_end_time, + } + ) url = source.api_query_url try: - resp:BenchmarkTimeSeriesApiResponse = BenchmarkTimeSeriesApiResponse.from_request(url, query) + resp: BenchmarkTimeSeriesApiResponse = ( + BenchmarkTimeSeriesApiResponse.from_request(url, query) + ) return resp.data except Exception as e: raise RuntimeError(f"[{config_id}]Fetch failed:", e) - class WorkerPoolHandler: """ WorkerPoolHandler runs workers in parallel to generate benchmark regression report @@ -288,7 +345,8 @@ def start( args: Optional[argparse.Namespace] = None, ) -> None: logger.info( - "[WorkerPoolHandler] start to process benchmark summary data with config %s", config["name"] + "[WorkerPoolHandler] start to process benchmark summary data with config %s", + config["name"], ) with ThreadPoolExecutor(max_workers=self.max_workers) as executor: futures = [] @@ -313,6 +371,7 @@ def start( logger.warning(f"Error processing future: {e}") errors.append({"error": str(e)}) + def main( args: Optional[argparse.Namespace] = None, github_access_token: str = "", @@ -336,13 +395,10 @@ def main( cc = get_clickhouse_client_environment() time_intervals = TimeIntervalGenerator().generate(cc) - # get jobs in queue from clickhouse for list of time intervals, in parallel handler = WorkerPoolHandler( config_retrievers, - 
BenchmarkSummaryProcessor( - is_dry_run=is_dry_run - ), + BenchmarkSummaryProcessor(is_dry_run=is_dry_run), ) handler.start(time_intervals, args) logger.info(" [Main] Done. work completed.") diff --git a/torchci/lib/clickhouse.ts b/torchci/lib/clickhouse.ts index b48673ad8c..f8c720caea 100644 --- a/torchci/lib/clickhouse.ts +++ b/torchci/lib/clickhouse.ts @@ -18,13 +18,13 @@ export function getClickhouseClient() { request_timeout: 180_000, // 3 mins }); } -// export function getClickhouseClientWritable() { return createClient({ host: process.env.CLICKHOUSE_HUD_USER_URL ?? "http://localhost:8123", username: process.env.CLICKHOUSE_HUD_USER_WRITE_USERNAME ?? "default", password: process.env.CLICKHOUSE_HUD_USER_WRITE_PASSWORD ?? "", + request_timeout: 180_000, // 3 minutes }); } From abb2904e2715ae2e81ddd9bf6af475400c69aef7 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 2 Sep 2025 23:04:34 -0700 Subject: [PATCH 20/27] addid --- .../common/config.py | 2 +- .../common/config_model.py | 2 +- .../common/regression_utils.py | 382 ++++++++++-------- .../lambda_function.py | 74 +--- 4 files changed, 227 insertions(+), 233 deletions(-) diff --git a/aws/lambda/benchmark_regression_summary_report/common/config.py b/aws/lambda/benchmark_regression_summary_report/common/config.py index 18831f6070..c0420a8d88 100644 --- a/aws/lambda/benchmark_regression_summary_report/common/config.py +++ b/aws/lambda/benchmark_regression_summary_report/common/config.py @@ -9,7 +9,7 @@ RangeConfig, ) -# compiler benchmark regression config +# Compiler benchmark regression config # todo(elainewy): eventually each team should configure their own benchmark regression config, currenlty place here for lambda COMPILER_BENCHMARK_CONFIG = BenchmarkConfig( name="Compiler Benchmark Regression", diff --git a/aws/lambda/benchmark_regression_summary_report/common/config_model.py b/aws/lambda/benchmark_regression_summary_report/common/config_model.py index dd892ccc45..663bf4689d 100644 --- a/aws/lambda/benchmark_regression_summary_report/common/config_model.py +++ b/aws/lambda/benchmark_regression_summary_report/common/config_model.py @@ -166,6 +166,7 @@ def create_github_comment(self, body: str, github_token: str) -> Dict[str, Any]: resp = requests.post(url, headers=headers, json={"body": body}) resp.raise_for_status() return resp.json() + @dataclass class Policy: frequency: Frequency @@ -180,7 +181,6 @@ def get_github_notification_config(self) -> Optional[GitHubNotificationConfig]: return notification_from_dict(self.notification_config) # type: ignore - # -------- Top-level benchmark regression config -------- @dataclass class BenchmarkConfig: diff --git a/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py index 008986e4c5..11959a3c97 100644 --- a/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py +++ b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py @@ -3,10 +3,9 @@ from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict import statistics from dateutil.parser import isoparse -from common.config_model import RegressionPolicy +from common.config_model import BenchmarkConfig, RegressionPolicy from common.benchmark_time_series_api_model import ( BenchmarkTimeSeriesApiData, - BenchmarkTimeSeriesItem, ) RegressionClassifyLabel = Literal[ @@ -21,112 +20,19 @@ class BaselineItem(TypedDict): class LatestItem(TypedDict): group_info: Dict[str, Any] - values: List[float] + 
values: List[Dict[str, Any]] -def to_latest_data_map( - data: BenchmarkTimeSeriesApiData, field="value" -) -> Dict[tuple, LatestItem]: - result = {} - for ts_group in data.time_series: - group_keys = tuple(sorted(ts_group.group_info.items())) - values = [ - float(d[field]) - for d in sorted( - ts_group.data, - key=lambda d: isoparse(d["granularity_bucket"]), # convert to datetime - ) - if field in d - ] - result[group_keys] = { - "group_info": ts_group.group_info, - "values": values, - } - return result - - -def to_baseline_map( - baseline: BenchmarkTimeSeriesApiData, - mode: str = "mean", - field: str = "value", -) -> Dict[tuple, BaselineItem]: - """ - return - { - group_key[tuple]: { - "group_info": {...}, - "baseline": float - } - } - """ - result = {} - for ts_group in baseline.time_series: - group_keys = tuple(sorted(ts_group.group_info.items())) - values = [float(d[field]) for d in ts_group.data if field in d] - if not values: - continue - - if mode == "mean": - val = statistics.fmean(values) - elif mode == "p90": - val = percentile(values, 0.9) - else: - raise ValueError("mode must be 'mean' or 'p90'") - - result[group_keys] = { - "group_info": ts_group.group_info, - "baseline": val, - } - return result - - -def classify_flags(flags: list[bool], min_points: int = 3) -> RegressionClassifyLabel: - """ - Classify a sequence of boolean flags to detect regression. - - - regression: last run has >= 2 consecutive True values - - suspicious: there is a run of >= 3 consecutive True values, but not at the end - - no_regression: all other cases - - insufficient_data: not enough data points (< min_points) - - Special case: - - If min_points == 1, then just look at the last flag: - True -> regression - False -> no_regression - """ - n = len(flags) - if n == 0: - return "insufficient_data" - - if min_points == 1: - return "regression" if flags[-1] else "no_regression" - - if n < min_points: - return "insufficient_data" - - # trailing run length - t = 0 - for v in reversed(flags): - if v: - t += 1 - else: - break - if t >= 2: - return "regression" - - # longest run anywhere - longest = cur = 0 - for v in flags: - cur = cur + 1 if v else 0 - longest = max(longest, cur) - - if longest >= 3: - return "suspicious" - - return "no_regression" - - -def percentile(values, q: float): +class PerGroupResult(TypedDict, total=True): + group_info: Dict[str, Any] + baseline: Optional[float] + points: List[Any] + flags: List[bool] + label: RegressionClassifyLabel + policy: Optional["RegressionPolicy"] + + +def percentile(values: list[float], q: float): if not values: return None v = sorted(values) @@ -138,68 +44,206 @@ def percentile(values, q: float): return v[f] + (v[c] - v[f]) * (k - f) -def _resolve_policy( - metric_policies: Dict[str, RegressionPolicy], - metric: str, -) -> Optional[RegressionPolicy]: - if not metric: - return None - m = metric.lower() - return metric_policies.get(m) - - -def detect_regressions_with_policies( - baseline_map: Dict[tuple, BaselineItem], - latest_map: Dict[tuple, LatestItem], - *, - metric_policies: Dict[str, RegressionPolicy], - min_points: int = 2, -) -> Tuple[List[Dict[str, Any]], bool]: - """ - For each group: - - choose policy by group_info['metric'] - - compute flags via policy.is_violation(value, baseline) - - classify with classify_flags - Returns a list of {group_info, baseline, values, flags, label, policy} - """ - results: List[Dict[str, Any]] = [] - - is_any_regression = False - - for key in sorted(latest_map.keys()): - cur_item = latest_map.get(key) - gi = 
cur_item["group_info"] if cur_item else {} - latest_vals = cur_item["values"] if cur_item else [] - policy = _resolve_policy(metric_policies, gi.get("metric", "")) - if not policy: - logging.warning( - f"no policy for metric %s with group_info=%s", gi.get("metric", ""), gi - ) - continue - - base_item = baseline_map.get(key) - baseline_value = base_item.get("value") if base_item else None - if not base_item or not baseline_value: - logging.warning( - f"no baseline for metric %s with group_info=%s", - gi.get("metric", ""), - gi, +class BenchmarkRegressionReportGenerator: + def __init__( + self, + config: BenchmarkConfig, + latest_ts: BenchmarkTimeSeriesApiData, + baseline_ts: BenchmarkTimeSeriesApiData, + ) -> None: + self.metric_policies = config.policy.metrics + self.latest_ts = self._to_latest_data_map(latest_ts) + self.baseline_ts = self._to_baseline_map(baseline_ts) + + def generate(self) -> Tuple[List[PerGroupResult], bool]: + return self.detect_regressions_with_policies( + self.baseline_ts, + self.latest_ts, + metric_policies=self.metric_policies, + ) + + def detect_regressions_with_policies( + self, + baseline_map: Dict[tuple, BaselineItem], + dp_map: Dict[tuple, LatestItem], + *, + metric_policies: Dict[str, RegressionPolicy], + min_points: int = 2, + ) -> Tuple[List[PerGroupResult], bool]: + """ + For each group: + - choose policy by group_info['metric'] + - compute flags via policy.is_violation(value, baseline) + - classify with classify_flags + Returns a list of {group_info, baseline, values, flags, label, policy} + """ + results: List[PerGroupResult] = [] + + is_any_regression = False + + for key in sorted(dp_map.keys()): + cur_item = dp_map.get(key) + gi = cur_item["group_info"] if cur_item else {} + points: List[Any] = cur_item["values"] if cur_item else [] + + base_item = baseline_map.get(key) + baseline_value = base_item.get("value") if base_item else None + + # + policy = self._resolve_policy(metric_policies, gi.get("metric", "")) + if not policy: + results.append( + PerGroupResult( + group_info=gi, + baseline=baseline_value, + points=[], + flags=[], + label="insufficient_data", + policy=None, + ) + ) + continue + + if baseline_value is None or len(points) == 0: + results.append( + PerGroupResult( + group_info=gi, + baseline=baseline_value, + points=[], + flags=[], + label="insufficient_data", + policy=policy, + ) + ) + continue + + # Per-point violations (True = regression) + flags: List[bool] = [ + policy.is_violation(p["value"], baseline_value) for p in points + ] + label = self.classify_flags(flags, min_points=min_points) + + enriched_points = [{**p, "flag": f} for p, f in zip(points, flags)] + results.append( + PerGroupResult( + group_info=gi, + baseline=baseline_value, + points=enriched_points, + flags=[], + label=label, + policy=policy, + ) ) - continue - - # Per-point violations (True = regression) - flags = [policy.is_violation(v, baseline_value) for v in latest_vals] - label = classify_flags(flags, min_points=min_points) - results.append( - { - "group_info": gi, - "baseline": baseline_value, - "values": latest_vals, - "flags": flags, - "label": label, - "policy": policy, + if label == "regression": + is_any_regression = True + return results, is_any_regression + + def _to_latest_data_map( + self, data: "BenchmarkTimeSeriesApiData", field: str = "value" + ) -> Dict[tuple, LatestItem]: + result: Dict[tuple, LatestItem] = {} + for ts_group in data.time_series: + group_keys = tuple(sorted(ts_group.group_info.items())) + points: List[Dict[str, Any]] = [] + for d in 
sorted( + ts_group.data, key=lambda d: isoparse(d["granularity_bucket"]) + ): + if field not in d: + continue + + points.append( + { + "value": float(d[field]), + "commit": d.get("head_sha"), + "branch": d.get("head_branch"), + "timestamp": isoparse(d["granularity_bucket"]), + } + ) + result[group_keys] = { + "group_info": ts_group.group_info, + "values": points, } - ) - if label == "regression": - is_any_regression = True - return results, is_any_regression + return result + + def _to_baseline_map( + self, + baseline: BenchmarkTimeSeriesApiData, + mode: str = "mean", + field: str = "value", + ) -> Dict[tuple, BaselineItem]: + result = {} + for ts_group in baseline.time_series: + group_keys = tuple(sorted(ts_group.group_info.items())) + values = [float(d[field]) for d in ts_group.data if field in d] + if not values: + continue + + if mode == "mean": + val = statistics.fmean(values) + elif mode == "p90": + val = percentile(values, 0.9) + else: + raise ValueError("mode must be 'mean' or 'p90'") + + result[group_keys] = { + "group_info": ts_group.group_info, + "baseline": val, + } + return result + + def classify_flags( + self, flags: list[bool], min_points: int = 3 + ) -> RegressionClassifyLabel: + """ + Classify a sequence of boolean flags to detect regression. + + - regression: last run has >= 2 consecutive True values + - suspicious: there is a run of >= 3 consecutive True values, but not at the end + - no_regression: all other cases + - insufficient_data: not enough data points (< min_points) + + Special case: + - If min_points == 1, then just look at the last flag: + True -> regression + False -> no_regression + """ + n = len(flags) + if n == 0: + return "insufficient_data" + + if min_points == 1: + return "regression" if flags[-1] else "no_regression" + + if n < min_points: + return "insufficient_data" + + # trailing run length + t = 0 + for v in reversed(flags): + if v: + t += 1 + else: + break + if t >= 2: + return "regression" + + # longest run anywhere + longest = cur = 0 + for v in flags: + cur = cur + 1 if v else 0 + longest = max(longest, cur) + + if longest >= 3: + return "suspicious" + + return "no_regression" + + def _resolve_policy( + self, + metric_policies: Dict[str, RegressionPolicy], + metric: str, + ) -> Optional[RegressionPolicy]: + if not metric: + return None + m = metric.lower() + return metric_policies.get(m) diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py index 0c112156a8..ab421a6bc7 100644 --- a/aws/lambda/benchmark_regression_summary_report/lambda_function.py +++ b/aws/lambda/benchmark_regression_summary_report/lambda_function.py @@ -4,37 +4,23 @@ import logging import os import threading -from collections import defaultdict from concurrent.futures import as_completed, ThreadPoolExecutor import datetime as dt -from common.regression_utils import ( - detect_regressions_with_policies, - to_baseline_map, - to_latest_data_map, - to_time_series_item_map, -) +from typing import Optional +from common.regression_utils import BenchmarkRegressionReportGenerator +import clickhouse_connect from common.benchmark_time_series_api_model import ( - BenchmarkTimeSeriesApiData, BenchmarkTimeSeriesApiResponse, - TimeRange, ) from common.config_model import ( BenchmarkApiSource, BenchmarkConfig, Frequency, - Policy, - RangeConfig, ) from common.config import BENCHMARK_REGRESSION_CONFIG -from jinja2 import Template -import requests from dateutil.parser import isoparse -from typing 
import Any, Dict, Iterable, List, Optional, Set -import clickhouse_connect -import yaml -from github import Auth, Github - +from pprint import pprint logging.basicConfig( level=logging.INFO, @@ -77,36 +63,8 @@ def get_clickhouse_client_environment() -> clickhouse_connect.driver.client.Clie password=ENVS["CLICKHOUSE_PASSWORD"], ) - -def write_to_file(data: Any, filename="", path=""): - """ - Writes data to a specified file. If no path is provided, writes to the current directory. - - :param data: The content to write to the file. - :param filename: The name of the file (default: 'output.txt'). - :param path: The directory where the file should be saved (default: current directory). - """ - - if not filename: - filename = "output_snapshot.json" - if not path: - path = "." - - # Ensure the path exists - os.makedirs(path, exist_ok=True) - - # Construct full file path - file_path = os.path.join(path, filename) - - # Write data to file - with open(file_path, "w", encoding="utf-8") as file: - file.write(data) - logger.info(f"File written to: {os.path.abspath(file_path)}") - - BENCHMARK_REGRESSION_SUMMARY_REPORT_TABLE = "benchmark_regression_summary_report" - def get_config(config_id: str) -> BenchmarkConfig: try: config: BenchmarkConfig = BENCHMARK_REGRESSION_CONFIG[config_id] @@ -116,7 +74,6 @@ def get_config(config_id: str) -> BenchmarkConfig: raise e return config - class BenchmarkSummaryProcessor: """ """ @@ -170,7 +127,6 @@ def _get_latest_record_ts( end_time if end_time.tzinfo else end_time.replace(tzinfo=dt.timezone.utc) ) end_utc = end_utc.astimezone(dt.timezone.utc) - cutoff = end_time - freq_delta return latest_record_ts < cutoff @@ -214,24 +170,18 @@ def process( latest = self.get_latest(config, end_time) if not latest: return - - latest_map = to_latest_data_map(latest) baseline = self.get_basline(config, end_time) if not baseline: return - baseline_map = to_baseline_map(baseline) - detect_regressions_with_policies( - baseline_map=baseline_map, - latest_map=latest_map, - metric_policies=config.policy.metrics, - ) - return { - "start_time": to_timestap_str(start_time), - "end_time": to_timestap_str(end_time), - "jobs_count": len(queued_jobs), - "records_count": len(records), - } + generator = BenchmarkRegressionReportGenerator( + config=config, latest_ts=latest, baseline_ts=baseline + ) + result, regression_detected = generator.generate() + if self.is_dry_run: + print("regression_detected: ", regression_detected) + print(json.dumps(result, indent=2, default=str)) + return def get_latest(self, config: BenchmarkConfig, end_time: dt.datetime): data_range = config.policy.range From a0a48c1078a8e43b58ef30f95257b9283522b211 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 2 Sep 2025 23:28:25 -0700 Subject: [PATCH 21/27] addid --- .../common/config.py | 8 + .../lambda_function.py | 171 ++++++++---------- 2 files changed, 85 insertions(+), 94 deletions(-) diff --git a/aws/lambda/benchmark_regression_summary_report/common/config.py b/aws/lambda/benchmark_regression_summary_report/common/config.py index c0420a8d88..6ddf84696a 100644 --- a/aws/lambda/benchmark_regression_summary_report/common/config.py +++ b/aws/lambda/benchmark_regression_summary_report/common/config.py @@ -64,8 +64,16 @@ }, ), ) + BENCHMARK_REGRESSION_CONFIG = BenchmarkRegressionConfigBook( configs={ "compiler_regression": COMPILER_BENCHMARK_CONFIG, } ) + + +def get_benchmark_regression_config(config_id: str) -> BenchmarkConfig: + try: + return BENCHMARK_REGRESSION_CONFIG[config_id] + except KeyError: + raise 
ValueError(f"Invalid config id: {config_id}") diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py index ab421a6bc7..9ba17de7ed 100644 --- a/aws/lambda/benchmark_regression_summary_report/lambda_function.py +++ b/aws/lambda/benchmark_regression_summary_report/lambda_function.py @@ -17,11 +17,9 @@ BenchmarkConfig, Frequency, ) -from common.config import BENCHMARK_REGRESSION_CONFIG +from common.config import get_benchmark_regression_config from dateutil.parser import isoparse -from pprint import pprint - logging.basicConfig( level=logging.INFO, ) @@ -35,7 +33,9 @@ "CLICKHOUSE_USERNAME": os.getenv("CLICKHOUSE_USERNAME", ""), } -BENMARK_REGRESSION_REPORT_DB = "fortesting.benchmark_regression_report" +BENCHMARK_REGRESSION_REPORT_TABLE = "fortesting.benchmark_regression_report" + +BENCHMARK_REGRESSION_TRACKING_CONFIG_IDS = ["compiler_regression"] def truncate_to_hour(ts: dt.datetime) -> dt.datetime: @@ -63,16 +63,11 @@ def get_clickhouse_client_environment() -> clickhouse_connect.driver.client.Clie password=ENVS["CLICKHOUSE_PASSWORD"], ) -BENCHMARK_REGRESSION_SUMMARY_REPORT_TABLE = "benchmark_regression_summary_report" -def get_config(config_id: str) -> BenchmarkConfig: - try: - config: BenchmarkConfig = BENCHMARK_REGRESSION_CONFIG[config_id] - except KeyError: - raise ValueError(f"Invalid config id: {config_id}") - except Exception as e: - raise e - return config +BENCHMARK_REGRESSION_SUMMARY_REPORT_TABLE = ( + "fortesting.benchmark_regression_summary_report" +) + class BenchmarkSummaryProcessor: """ """ @@ -83,53 +78,6 @@ def __init__( ) -> None: self.is_dry_run = is_dry_run - def should_generate_report( - self, - cc: clickhouse_connect.driver.client.Client, - end_time: dt.datetime, - config_id: str, - f: Frequency, - ) -> bool: - """ - decide wether should generate the report based on the frequency in policy - """ - - def _get_latest_record_ts( - cc: clickhouse_connect.driver.Client, - config_id: str, - ) -> Optional[dt.datetime]: - res = cc.query( - """ - SELECT max(last_record_ts) - FROM benchmark_regression_report - WHERE report_id = {config_id:String} - """, - parameters={"config_id": config_id}, - ) - if not res.result_rows or res.result_rows[0][0] is None: - return None - latest: dt.datetime = res.result_rows[0][ - 0 - ] # typically tz-aware UTC from clickhouse_connect - # If not tz-aware, force UTC: - if latest.tzinfo is None: - latest = latest.replace(tzinfo=dt.timezone.utc) - return latest - - freq_delta = f.to_timedelta() - latest_record_ts = _get_latest_record_ts(cc, config_id) - - # No report exists yet, generate - if not latest_record_ts: - return True - - end_utc = ( - end_time if end_time.tzinfo else end_time.replace(tzinfo=dt.timezone.utc) - ) - end_utc = end_utc.astimezone(dt.timezone.utc) - cutoff = end_time - freq_delta - return latest_record_ts < cutoff - def process( self, config_id: str, @@ -151,11 +99,21 @@ def process( else: tlocal.cc = get_clickhouse_client_environment() cc = tlocal.cc - config = get_config(config_id) + + try: + config = get_benchmark_regression_config(config_id) + except ValueError as e: + logger.error(f"Skip process, Invalid config: {e}") + return + except Exception as e: + print( + f"Something else went wrong when call get_benchmark_regression_config: {e}" + ) + return # check if the current time is > policy's time_delta + previous record_ts from summary_table report_freq = config.policy.frequency - should_generate = self.should_generate_report( + 
should_generate = self._should_generate_report( cc, end_time, config_id, report_freq ) if not should_generate: @@ -273,6 +231,50 @@ def _fetch_from_benchmark_ts_api( except Exception as e: raise RuntimeError(f"[{config_id}]Fetch failed:", e) + def _should_generate_report( + self, + cc: clickhouse_connect.driver.client.Client, + end_time: dt.datetime, + config_id: str, + f: Frequency, + ) -> bool: + def _get_latest_record_ts( + cc: clickhouse_connect.driver.Client, + config_id: str, + ) -> Optional[dt.datetime]: + table = BENCHMARK_REGRESSION_REPORT_TABLE + res = cc.query( + f""" + SELECT max(last_record_ts) + FROM {table} + WHERE report_id = {{config_id:String}} + """, + parameters={"config_id": config_id}, + ) + if not res.result_rows or res.result_rows[0][0] is None: + return None + latest: dt.datetime = res.result_rows[0][ + 0 + ] # typically tz-aware UTC from clickhouse_connect + # If not tz-aware, force UTC: + if latest.tzinfo is None: + latest = latest.replace(tzinfo=dt.timezone.utc) + return latest + + freq_delta = f.to_timedelta() + latest_record_ts = _get_latest_record_ts(cc, config_id) + + # No report exists yet, generate + if not latest_record_ts: + return True + + end_utc = ( + end_time if end_time.tzinfo else end_time.replace(tzinfo=dt.timezone.utc) + ) + end_utc = end_utc.astimezone(dt.timezone.utc) + cutoff = end_time - freq_delta + return latest_record_ts < cutoff + class WorkerPoolHandler: """ @@ -284,26 +286,32 @@ class WorkerPoolHandler: def __init__( self, benchmark_summary_processor: BenchmarkSummaryProcessor, - max_workers: int = 4, + max_workers: int = 6, ): self.benchmark_summary_processor = benchmark_summary_processor self.max_workers = max_workers def start( self, - config: Dict[str, Any], + config_ids: list[str], args: Optional[argparse.Namespace] = None, ) -> None: logger.info( - "[WorkerPoolHandler] start to process benchmark summary data with config %s", - config["name"], + "[WorkerPoolHandler] start to process benchmark " + "summary data with config_ids %s", + config_ids, + ) + end_time = dt.datetime.now(dt.timezone.utc).replace( + minute=0, second=0, microsecond=0 ) + logger.info("current time with hour granularity(utc) %s", end_time) with ThreadPoolExecutor(max_workers=self.max_workers) as executor: futures = [] - for interval in time_intervals: + for config_id in config_ids: future = executor.submit( self.benchmark_summary_processor.process, - config, + config_id, + end_time, cc=None, args=args, ) @@ -343,14 +351,12 @@ def main( ) else: cc = get_clickhouse_client_environment() - time_intervals = TimeIntervalGenerator().generate(cc) # get jobs in queue from clickhouse for list of time intervals, in parallel handler = WorkerPoolHandler( - config_retrievers, BenchmarkSummaryProcessor(is_dry_run=is_dry_run), ) - handler.start(time_intervals, args) + handler.start(BENCHMARK_REGRESSION_TRACKING_CONFIG_IDS, args) logger.info(" [Main] Done. work completed.") @@ -395,12 +401,6 @@ def parse_args() -> argparse.Namespace: default=ENVS["GITHUB_ACCESS_TOKEN"], help="the github access token to access github api", ) - parser.add_argument( - "--local-output", - action="store_true", - help="when set, generate json result in local environment. " - + "this is only used for local test environment when dry-run is enabled", - ) parser.add_argument( "--not-dry-run", action="store_true", @@ -408,20 +408,6 @@ def parse_args() -> argparse.Namespace: + "environment. 
By default, we run in dry-run mode for local " + "environment", ) - parser.add_argument( - "--output-file-name", - type=str, - default="job_queue_times_snapshot.json", - help="the name of output file for local environment. this " - + "is only used for local test environment when local-output is enabled", - ) - parser.add_argument( - "--output-file-path", - type=str, - default="", - help="the path of output file for local environment. this is " - + "only used for local test environment when local-output is enabled", - ) args, _ = parser.parse_known_args() return args @@ -442,9 +428,6 @@ def local_run() -> None: args, args.github_access_token, is_dry_run=is_dry_run, - local_output=args.local_output, - output_snapshot_file_name=args.output_file_name, - output_snapshot_file_path=args.output_file_path, ) From a692bb9d88294b70312014a6d9b803354b804dd1 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 3 Sep 2025 02:01:15 -0700 Subject: [PATCH 22/27] addid --- .../common/benchmark_time_series_api_model.py | 2 +- .../common/config_model.py | 1 - .../common/regression_utils.py | 25 ++++---- .../lambda_function.py | 61 ++++++++----------- 4 files changed, 41 insertions(+), 48 deletions(-) diff --git a/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py b/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py index 2fa8960013..7aad397aa3 100644 --- a/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py +++ b/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py @@ -28,7 +28,7 @@ class BenchmarkTimeSeriesApiResponse: @classmethod def from_request( - cls, url: str, query: dict, timeout: int = 60 + cls, url: str, query: dict, timeout: int = 180 ) -> "BenchmarkTimeSeriesApiResponse": """ Send a POST request and parse into BenchmarkTimeSeriesApiResponse. 
diff --git a/aws/lambda/benchmark_regression_summary_report/common/config_model.py b/aws/lambda/benchmark_regression_summary_report/common/config_model.py index 663bf4689d..53e59f80f9 100644 --- a/aws/lambda/benchmark_regression_summary_report/common/config_model.py +++ b/aws/lambda/benchmark_regression_summary_report/common/config_model.py @@ -177,7 +177,6 @@ class Policy: def get_github_notification_config(self) -> Optional[GitHubNotificationConfig]: if not self.notification_config: return None - if self.notification_config and self.notification_config.get("type") == "github": return notification_from_dict(self.notification_config) # type: ignore diff --git a/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py index 11959a3c97..bf66bdd92b 100644 --- a/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py +++ b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py @@ -7,6 +7,9 @@ from common.benchmark_time_series_api_model import ( BenchmarkTimeSeriesApiData, ) +import pprint + +logger = logging.getLogger() RegressionClassifyLabel = Literal[ "regression", "suspicious", "no_regression", "insufficient_data" @@ -27,7 +30,6 @@ class PerGroupResult(TypedDict, total=True): group_info: Dict[str, Any] baseline: Optional[float] points: List[Any] - flags: List[bool] label: RegressionClassifyLabel policy: Optional["RegressionPolicy"] @@ -53,7 +55,7 @@ def __init__( ) -> None: self.metric_policies = config.policy.metrics self.latest_ts = self._to_latest_data_map(latest_ts) - self.baseline_ts = self._to_baseline_map(baseline_ts) + self.baseline_ts = self._to_baseline_map(baseline_ts, mode="max") def generate(self) -> Tuple[List[PerGroupResult], bool]: return self.detect_regressions_with_policies( @@ -82,35 +84,34 @@ def detect_regressions_with_policies( is_any_regression = False for key in sorted(dp_map.keys()): + logger.info("key: %s", key) cur_item = dp_map.get(key) gi = cur_item["group_info"] if cur_item else {} points: List[Any] = cur_item["values"] if cur_item else [] base_item = baseline_map.get(key) + logger.info("base_item for keys(%s):\n%s ",key, pprint.pformat(base_item)) baseline_value = base_item.get("value") if base_item else None - - # policy = self._resolve_policy(metric_policies, gi.get("metric", "")) if not policy: + logger.warning("No policy for %s", gi) results.append( PerGroupResult( group_info=gi, baseline=baseline_value, points=[], - flags=[], label="insufficient_data", policy=None, ) ) continue - if baseline_value is None or len(points) == 0: + logger.warning("baseline_value is %s, len(points) == %s", baseline_value,len(points)) results.append( PerGroupResult( group_info=gi, baseline=baseline_value, points=[], - flags=[], label="insufficient_data", policy=policy, ) @@ -129,7 +130,6 @@ def detect_regressions_with_policies( group_info=gi, baseline=baseline_value, points=enriched_points, - flags=[], label=label, policy=policy, ) @@ -150,12 +150,11 @@ def _to_latest_data_map( ): if field not in d: continue - points.append( { "value": float(d[field]), - "commit": d.get("head_sha"), - "branch": d.get("head_branch"), + "commit": d.get("commit"), + "branch": d.get("branch"), "timestamp": isoparse(d["granularity_bucket"]), } ) @@ -182,12 +181,14 @@ def _to_baseline_map( val = statistics.fmean(values) elif mode == "p90": val = percentile(values, 0.9) + elif mode == "max": + val = max(values) else: raise ValueError("mode must be 'mean' or 'p90'") 
result[group_keys] = { "group_info": ts_group.group_info, - "baseline": val, + "value": val, } return result diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py index 9ba17de7ed..29c2596e0e 100644 --- a/aws/lambda/benchmark_regression_summary_report/lambda_function.py +++ b/aws/lambda/benchmark_regression_summary_report/lambda_function.py @@ -1,12 +1,13 @@ #!/usr/bin/env python import argparse +from concurrent.futures import ThreadPoolExecutor, as_completed import json import logging import os import threading -from concurrent.futures import as_completed, ThreadPoolExecutor +import requests import datetime as dt -from typing import Optional +from typing import Any, Optional from common.regression_utils import BenchmarkRegressionReportGenerator import clickhouse_connect from common.benchmark_time_series_api_model import ( @@ -46,7 +47,8 @@ def get_clickhouse_client( host: str, user: str, password: str ) -> clickhouse_connect.driver.client.Client: # for local testing only, disable SSL verification - # return clickhouse_connect.get_client(host=host, user=user, password=password,secure=True, verify=False) + logger.info("trying to connect with clickhouse") + return clickhouse_connect.get_client(host=host, user=user, password=password,secure=True, verify=False) return clickhouse_connect.get_client( host=host, user=user, password=password, secure=True @@ -63,7 +65,6 @@ def get_clickhouse_client_environment() -> clickhouse_connect.driver.client.Clie password=ENVS["CLICKHOUSE_PASSWORD"], ) - BENCHMARK_REGRESSION_SUMMARY_REPORT_TABLE = ( "fortesting.benchmark_regression_summary_report" ) @@ -87,6 +88,7 @@ def process( ): # ensure each thread has its own clickhouse client. clickhouse client # is not thread-safe. 
+ logger.info("here") if cc is None: tlocal = threading.local() if not hasattr(tlocal, "cc") or tlocal.cc is None: @@ -99,9 +101,11 @@ def process( else: tlocal.cc = get_clickhouse_client_environment() cc = tlocal.cc + logger.info("i'm here") try: config = get_benchmark_regression_config(config_id) + logger.info("found config for config_id %s",config_id) except ValueError as e: logger.error(f"Skip process, Invalid config: {e}") return @@ -118,15 +122,19 @@ def process( ) if not should_generate: logger.info( - "[%s] Skip generate report for date: %s with frequency %s", + "[%s] Skip generate report for date:%s with frequency %s, no data found during [%s,%s]", config_id, end_time.isoformat(), report_freq.get_text(), ) return + else: + logger.info( "[%s] Plan to generate report for time: %s with frequency %s ...", + config_id,end_time,report_freq.get_text()) latest = self.get_latest(config, end_time) if not latest: + logger.info("no latest data found") return baseline = self.get_basline(config, end_time) if not baseline: @@ -151,25 +159,9 @@ def get_latest(self, config: BenchmarkConfig, end_time: dt.datetime): end_time=latest_e, source=config.source, ) - if not latest_data.time_range or latest_data.time_range.end: - logger.info( - "[%s] Skip generate report for date:" - "%s with frequency %s, no data found during [%s,%s]", - config.id, - latest_s.isoformat(), - latest_e.isoformat(), - ) + if not latest_data.time_range or not latest_data.time_range.end: return None - if not self.should_use_data(latest_data.time_range.end, end_time): - logger.info( - "[%s] Skip generate report for date: trying to get_basline" - " with frequency %s, but no data found during for [%s,%s]", - config.id, - config.policy.frequency.get_text(), - latest_s.isoformat(), - latest_e.isoformat(), - ) return None return latest_data @@ -213,23 +205,27 @@ def _fetch_from_benchmark_ts_api( start_time: dt.datetime, source: BenchmarkApiSource, ): - str_end_time = end_time.isoformat() - str_start_time = start_time.isoformat() + str_end_time = end_time.strftime("%Y-%m-%dT%H:%M:%S") + str_start_time = start_time.strftime("%Y-%m-%dT%H:%M:%S") query = source.render( ctx={ "startTime": str_start_time, - "endTime": str_end_time, + "stopTime": str_end_time, } ) url = source.api_query_url + + logger.info("trying to call %s",url) try: resp: BenchmarkTimeSeriesApiResponse = ( BenchmarkTimeSeriesApiResponse.from_request(url, query) ) - return resp.data + except requests.exceptions.HTTPError as e: + logger.error("Server error message: %s", e.response.json().get("error")) + raise except Exception as e: - raise RuntimeError(f"[{config_id}]Fetch failed:", e) + raise RuntimeError(f"[{config_id}]Fetch failed: {e}") def _should_generate_report( self, @@ -344,13 +340,7 @@ def main( raise ValueError("Missing environment variable GITHUB_ACCESS_TOKEN") # get time intervals. - logger.info(" [Main] generating time intervals ....") - if args: - cc = get_clickhouse_client( - args.clickhouse_endpoint, args.clickhouse_username, args.clickhouse_password - ) - else: - cc = get_clickhouse_client_environment() + logger.info("[Main] start work ....") # get jobs in queue from clickhouse for list of time intervals, in parallel handler = WorkerPoolHandler( @@ -419,6 +409,9 @@ def local_run() -> None: args = parse_args() + + logger.info("args: %s",args) + # update environment variables for input parameters # always run in dry-run mode in local environment, unless it's disabled. 
From f2cd0e23adcb68d12a98bdcc7b661af886168e48 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 3 Sep 2025 02:43:49 -0700
Subject: [PATCH 23/27] addid

---
 .../common/config.py                          |  8 +-
 .../common/config_model.py                    | 33 +++++--
 .../common/regression_utils.py                | 94 +++++++++++--------
 3 files changed, 85 insertions(+), 50 deletions(-)

diff --git a/aws/lambda/benchmark_regression_summary_report/common/config.py b/aws/lambda/benchmark_regression_summary_report/common/config.py
index 6ddf84696a..2c5280a63b 100644
--- a/aws/lambda/benchmark_regression_summary_report/common/config.py
+++ b/aws/lambda/benchmark_regression_summary_report/common/config.py
@@ -48,13 +48,13 @@
         ),
         metrics={
             "passrate": RegressionPolicy(
-                name="passrate", condition="greater_than", threshold=0.9
+                name="passrate", condition="greater_than", threshold=0.9, baseline_aggregation="max"
             ),
             "geomean": RegressionPolicy(
-                name="geomean", condition="greater_than", threshold=0.95
+                name="geomean", condition="greater_than", threshold=0.95,baseline_aggregation="max"
             ),
-            "dynamo_peak_mem": RegressionPolicy(
-                name="dynamo_peak_mem", condition="greater_than", threshold=0.9
+            "compression_ratio": RegressionPolicy(
+                name="compression_ratio", condition="greater_than", threshold=0.9, baseline_aggregation="max"
             ),
         },
         notification_config={
diff --git a/aws/lambda/benchmark_regression_summary_report/common/config_model.py b/aws/lambda/benchmark_regression_summary_report/common/config_model.py
index 53e59f80f9..66e9ed04ed 100644
--- a/aws/lambda/benchmark_regression_summary_report/common/config_model.py
+++ b/aws/lambda/benchmark_regression_summary_report/common/config_model.py
@@ -97,30 +97,45 @@ def baseline_timedelta(self) -> timedelta:
 @dataclass
 class RegressionPolicy:
     """
-    - "greater_than": higher is better; violation if value < baseline * threshold
-    - "less_than": lower is better; violation if value > baseline * threshold
-    - "equal_to": value should be ~= baseline * threshold within rel_tol
+    Defines the policy for a given metric.
+    - "greater_than": higher is better; violation if new value < baseline * threshold
+    - "less_than": lower is better; violation if new value > baseline * threshold
+    - "equal_to": new value should be ~= baseline * threshold within rel_tol
+    - "greater_equal": higher is better; violation if new value <= baseline * threshold
+    - "less_equal": lower is better; violation if new value >= baseline * threshold
+
+    """

     name: str
-    condition: Literal["greater_than", "less_than", "equal_to"]
+    condition: Literal["greater_than", "less_than", "equal_to","greater_equal","less_equal"]
     threshold: float
+    baseline_aggregation: Literal["avg", "max", "min", "p50", "p90", "p95","latest","earliest"] = "max"
     rel_tol: float = 1e-3  # used only for "equal_to"

     def is_violation(self, value: float, baseline: float) -> bool:
         target = baseline * self.threshold
         if self.condition == "greater_than":
-            # value should be >= target
+            # value must be strictly greater than target
+            return value <= target
+
+        if self.condition == "greater_equal":
+            # value must be greater or equal to target
             return value < target

         if self.condition == "less_than":
-            # value should be <= target
+            # value must be strictly less than target
+            return value >= target
+
+        if self.condition == "less_equal":
+            # value must be less or equal to target
             return value > target

-        # equal_to: |value - target| should be within rel_tol * max(1, |target|)
-        denom = max(1.0, abs(target))
-        return abs(value - target) > self.rel_tol * denom
+        if self.condition == "equal_to":
+            # |value - target| should be within rel_tol * max(1, |target|)
+            denom = max(1.0, abs(target))
+            return abs(value - target) > self.rel_tol * denom
+        raise ValueError(f"Unknown condition: {self.condition}")


 class BaseNotificationConfig:
     # every subclass must override this
     type_tag: ClassVar[str]
diff --git a/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py
index bf66bdd92b..978fd8b71e 100644
--- a/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py
+++ b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py
@@ -6,6 +6,7 @@
 from common.config_model import BenchmarkConfig, RegressionPolicy
 from common.benchmark_time_series_api_model import (
     BenchmarkTimeSeriesApiData,
+    BenchmarkTimeSeriesItem,
 )
 import pprint

@@ -21,7 +22,7 @@ class BaselineItem(TypedDict):
     value: float


-class LatestItem(TypedDict):
+class BenchmarkValueItem(TypedDict):
     group_info: Dict[str, Any]
     values: List[Dict[str, Any]]

@@ -35,8 +36,6 @@ class PerGroupResult(TypedDict, total=True):


 def percentile(values: list[float], q: float):
-    if not values:
-        return None
     v = sorted(values)
     k = (len(v) - 1) * q
     f = math.floor(k)
@@ -54,20 +53,20 @@ def __init__(
         baseline_ts: BenchmarkTimeSeriesApiData,
     ) -> None:
         self.metric_policies = config.policy.metrics
-        self.latest_ts = self._to_latest_data_map(latest_ts)
-        self.baseline_ts = self._to_baseline_map(baseline_ts, mode="max")
+        self.latest_ts = self._to_data_map(latest_ts)
+        self.baseline_raw = self._to_data_map(baseline_ts)

     def generate(self) -> Tuple[List[PerGroupResult], bool]:
         return self.detect_regressions_with_policies(
-            self.baseline_ts,
+            self.baseline_raw,
             self.latest_ts,
             metric_policies=self.metric_policies,
         )

     def detect_regressions_with_policies(
         self,
-        baseline_map: Dict[tuple, BaselineItem],
-        dp_map: Dict[tuple, LatestItem],
+        baseline_map: Dict[tuple, BenchmarkValueItem],
+        dp_map: Dict[tuple, BenchmarkValueItem],
         *,
         metric_policies: Dict[str, RegressionPolicy],
         min_points: int = 2,
@@ -90,27 +89,41 @@ def detect_regressions_with_policies(
             points: List[Any] = cur_item["values"] if cur_item else []

             base_item = baseline_map.get(key)
+            if not base_item:
+                logger.warning("Skip. No baseline item found for %s", gi)
+                results.append(
+                    PerGroupResult(
+                        group_info=gi,
+                        baseline=None,
+                        points=[],
+                        label="insufficient_data",
+                        policy=None,
+                    )
+                )
+                continue
             logger.info("base_item for keys(%s):\n%s ",key, pprint.pformat(base_item))
-            baseline_value = base_item.get("value") if base_item else None
             policy = self._resolve_policy(metric_policies, gi.get("metric", ""))
             if not policy:
                 logger.warning("No policy for %s", gi)
                 results.append(
                     PerGroupResult(
                         group_info=gi,
-                        baseline=baseline_value,
+                        baseline=None,
                         points=[],
                         label="insufficient_data",
                         policy=None,
                     )
                 )
                 continue
+
+            baseline_aggre_mode = policy.baseline_aggregation
+            baseline_value = self._get_baseline(base_item,baseline_aggre_mode)
             if baseline_value is None or len(points) == 0:
                 logger.warning("baseline_value is %s, len(points) == %s", baseline_value,len(points))
                 results.append(
                     PerGroupResult(
                         group_info=gi,
-                        baseline=baseline_value,
+                        baseline=None,
                         points=[],
                         label="insufficient_data",
                         policy=policy,
                     )
@@ -120,7 +133,7 @@ def detect_regressions_with_policies(

             # Per-point violations (True = regression)
             flags: List[bool] = [
-                policy.is_violation(p["value"], baseline_value) for p in points
+                policy.is_violation(p["value"], baseline_value["value"]) for p in points
             ]

             label = self.classify_flags(flags, min_points=min_points)
@@ -138,10 +151,10 @@ def detect_regressions_with_policies(
                 is_any_regression = True
         return results, is_any_regression

-    def _to_latest_data_map(
+    def _to_data_map(
         self, data: "BenchmarkTimeSeriesApiData", field: str = "value"
-    ) -> Dict[tuple, LatestItem]:
-        result: Dict[tuple, LatestItem] = {}
+    ) -> Dict[tuple, BenchmarkValueItem]:
+        result: Dict[tuple, BenchmarkValueItem] = {}
         for ts_group in data.time_series:
             group_keys = tuple(sorted(ts_group.group_info.items()))
             points: List[Dict[str, Any]] = []
@@ -164,32 +177,39 @@ def _to_latest_data_map(
             }
         return result

-    def _to_baseline_map(
+    def _get_baseline(
         self,
-        baseline: BenchmarkTimeSeriesApiData,
+        data: BenchmarkValueItem,
         mode: str = "mean",
         field: str = "value",
-    ) -> Dict[tuple, BaselineItem]:
-        result = {}
-        for ts_group in baseline.time_series:
-            group_keys = tuple(sorted(ts_group.group_info.items()))
-            values = [float(d[field]) for d in ts_group.data if field in d]
-            if not values:
-                continue
-
-            if mode == "mean":
-                val = statistics.fmean(values)
-            elif mode == "p90":
-                val = percentile(values, 0.9)
-            elif mode == "max":
-                val = max(values)
-            else:
-                raise ValueError("mode must be 'mean' or 'p90'")
+    ) -> Optional[BaselineItem]:
+        values = [float(d[field]) for d in data["values"] if field in d]
+        if not values:
+            return None

-            result[group_keys] = {
-                "group_info": ts_group.group_info,
-                "value": val,
-            }
+        if mode == "mean":
+            val = statistics.fmean(values)
+        elif mode == "p90":
+            val = percentile(values, 0.9)
+        elif mode == "max":
+            val = max(values)
+        elif mode == "min":
+            val = min(values)
+        elif mode == "latest":
+            val = values[-1]
+        elif mode == "earliest":
+            val = values[0]
+        elif mode == "p50":
+            val = percentile(values, 0.5)
+        elif mode == "p95":
+            val = percentile(values, 0.95)
+        else:
+            logger.warning("Unknown mode: %s", mode)
+            return None
+        result:BaselineItem = {
+            "group_info": data["group_info"],
+            "value": val,
+        }
         return result

     def classify_flags(
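To make the new condition semantics concrete before the follow-up patches below, here is a minimal standalone sketch that mirrors the RegressionPolicy.is_violation logic added above. The free function name and the sample numbers are invented for illustration; nothing here imports the lambda's modules.

    def is_violation(condition: str, value: float, baseline: float,
                     threshold: float, rel_tol: float = 1e-3) -> bool:
        # True means the new value regresses relative to baseline * threshold.
        target = baseline * threshold
        if condition == "greater_than":   # higher is better; must be strictly above target
            return value <= target
        if condition == "greater_equal":  # higher is better; hitting target exactly is fine
            return value < target
        if condition == "less_than":      # lower is better; must be strictly below target
            return value >= target
        if condition == "less_equal":     # lower is better; hitting target exactly is fine
            return value > target
        if condition == "equal_to":       # must stay within rel_tol of target
            denom = max(1.0, abs(target))
            return abs(value - target) > rel_tol * denom
        raise ValueError(f"Unknown condition: {condition}")

    # Illustrative numbers: passrate policy with threshold 0.9 against a baseline of 0.95
    # (target is roughly 0.855).
    assert is_violation("greater_than", 0.80, 0.95, 0.9) is True   # 0.80 <= target -> regression
    assert is_violation("greater_than", 0.90, 0.95, 0.9) is False  # 0.90 >  target -> healthy

With this reading, the only difference between "greater_than" and the "greater_equal" variant introduced in the next patch is whether landing exactly on baseline * threshold counts as a regression.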
From f9d7f66f741d682fd286f2a0d151780ad6d066ac Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 3 Sep 2025 02:53:20 -0700
Subject: [PATCH 24/27] addid

---
 .../common/config.py                          |  6 +++---
 .../common/config_model.py                    | 10 +++++-----
 .../lambda_function.py                        |  5 +----
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/aws/lambda/benchmark_regression_summary_report/common/config.py b/aws/lambda/benchmark_regression_summary_report/common/config.py
index 2c5280a63b..d894c5c544 100644
--- a/aws/lambda/benchmark_regression_summary_report/common/config.py
+++ b/aws/lambda/benchmark_regression_summary_report/common/config.py
@@ -48,13 +48,13 @@
         ),
         metrics={
             "passrate": RegressionPolicy(
-                name="passrate", condition="greater_than", threshold=0.9, baseline_aggregation="max"
+                name="passrate", condition="greater_equal", threshold=0.9, baseline_aggregation="max",
             ),
             "geomean": RegressionPolicy(
-                name="geomean", condition="greater_than", threshold=0.95,baseline_aggregation="max"
+                name="geomean", condition="greater_equal", threshold=0.95,baseline_aggregation="max",
             ),
             "compression_ratio": RegressionPolicy(
-                name="compression_ratio", condition="greater_than", threshold=0.9, baseline_aggregation="max"
+                name="compression_ratio", condition="greater_equal", threshold=0.9, baseline_aggregation="max",
             ),
         },
         notification_config={
diff --git a/aws/lambda/benchmark_regression_summary_report/common/config_model.py b/aws/lambda/benchmark_regression_summary_report/common/config_model.py
index 66e9ed04ed..59c2f86d9a 100644
--- a/aws/lambda/benchmark_regression_summary_report/common/config_model.py
+++ b/aws/lambda/benchmark_regression_summary_report/common/config_model.py
@@ -98,12 +98,12 @@ def baseline_timedelta(self) -> timedelta:
 class RegressionPolicy:
     """
     Defines the policy for a given metric.
-    - "greater_than": higher is better; violation if new value < baseline * threshold
-    - "less_than": lower is better; violation if new value > baseline * threshold
+    - the new value must be {x} baseline * threshold:
+    - "greater_than": higher is better; the new value must be strictly greater than baseline * threshold
+    - "less_than": lower is better; the new value must be strictly lower than baseline * threshold
     - "equal_to": new value should be ~= baseline * threshold within rel_tol
-    - "greater_equal": higher is better; violation if new value <= baseline * threshold
-    - "less_equal": lower is better; violation if new value >= baseline * threshold
-
+    - "greater_equal": higher is better; the new value must be greater than or equal to baseline * threshold
+    - "less_equal": lower is better; the new value must be less than or equal to baseline * threshold
     """
     name: str
     condition: Literal["greater_than", "less_than", "equal_to","greater_equal","less_equal"]
     threshold: float
diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py
index 29c2596e0e..adb98d2d9c 100644
--- a/aws/lambda/benchmark_regression_summary_report/lambda_function.py
+++ b/aws/lambda/benchmark_regression_summary_report/lambda_function.py
@@ -88,7 +88,6 @@ def process(
 ):
     # ensure each thread has its own clickhouse client. clickhouse client
     # is not thread-safe.
-    logger.info("here")
     if cc is None:
         tlocal = threading.local()
         if not hasattr(tlocal, "cc") or tlocal.cc is None:
@@ -101,8 +100,6 @@ def process(
             else:
                 tlocal.cc = get_clickhouse_client_environment()
             cc = tlocal.cc
-    logger.info("i'm here")
-
     try:
         config = get_benchmark_regression_config(config_id)
         logger.info("found config for config_id %s",config_id)
@@ -215,7 +212,7 @@ def _fetch_from_benchmark_ts_api(
     )

     url = source.api_query_url
-    logger.info("trying to call %s",url)
+    logger.info("[%s]trying to call %s, with query %s",config_id, url,query)
     try:
         resp: BenchmarkTimeSeriesApiResponse = (
             BenchmarkTimeSeriesApiResponse.from_request(url, query)

From 4bbb803f442cf9317e7500efec15f03664ec5088 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 3 Sep 2025 03:11:19 -0700
Subject: [PATCH 25/27] addid

---
 .../common/regression_utils.py                              | 4 +---
 .../benchmark_regression_summary_report/lambda_function.py  | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py
index 978fd8b71e..10091192b9 100644
--- a/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py
+++ b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py
@@ -83,7 +83,6 @@ def detect_regressions_with_policies(
         is_any_regression = False

         for key in sorted(dp_map.keys()):
-            logger.info("key: %s", key)
             cur_item = dp_map.get(key)
             gi = cur_item["group_info"] if cur_item else {}
             points: List[Any] = cur_item["values"] if cur_item else []
@@ -101,7 +100,6 @@ def detect_regressions_with_policies(
                     )
                 )
                 continue
-            logger.info("base_item for keys(%s):\n%s ",key, pprint.pformat(base_item))
             policy = self._resolve_policy(metric_policies, gi.get("metric", ""))
             if not policy:
                 logger.warning("No policy for %s", gi)
@@ -141,7 +139,7 @@ def detect_regressions_with_policies(
             results.append(
                 PerGroupResult(
                     group_info=gi,
-                    baseline=baseline_value,
+                    baseline= baseline_value["value"],
                     points=enriched_points,
                     label=label,
                     policy=policy,
diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py
index adb98d2d9c..4c8a0f5863 100644
--- a/aws/lambda/benchmark_regression_summary_report/lambda_function.py
+++ b/aws/lambda/benchmark_regression_summary_report/lambda_function.py
@@ -212,7 +212,7 @@ def _fetch_from_benchmark_ts_api(
     )

     url = source.api_query_url
-    logger.info("[%s]trying to call %s, with query %s",config_id, url,query)
+    logger.info("[%s]trying to call %s, with query\n %s",config_id, url,query)
     try:
         resp: BenchmarkTimeSeriesApiResponse = (
             BenchmarkTimeSeriesApiResponse.from_request(url, query)

From c193c8eb71b14b49e17b72913b87a4982df71658 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 3 Sep 2025 13:11:18 -0700
Subject: [PATCH 26/27] addid

---
 .../params.json                               | 17 ------
 .../query.sql                                 | 61 -------------------
 2 files changed, 78 deletions(-)
 delete mode 100644 torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json
 delete mode 100644 torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql

diff --git a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json
deleted file mode 100644
index 95f00e1501..0000000000
--- a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-  "params": {
"branches": "Array(String)", - "commits": "Array(String)", - "compilers": "Array(String)", - "device": "String", - "arch": "String", - "dtype": "String", - "granularity": "String", - "mode": "String", - "startTime": "DateTime64(3)", - "stopTime": "DateTime64(3)", - "suites": "Array(String)", - "workflowId": "Int64" - }, - "tests": [] -} diff --git a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql deleted file mode 100644 index bf233f6ff8..0000000000 --- a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql +++ /dev/null @@ -1,61 +0,0 @@ -WITH benchmarks AS ( - SELECT - workflow_id, - job_id, - suite, - model_name, - metric_name, - value, - metric_extra_info AS extra_info, - DATE_TRUNC({granularity:String}, fromUnixTimestamp(timestamp)) AS granularity_bucket, - benchmark_dtype, - benchmark_mode, - device, - arch, - replaceOne(head_branch, 'refs/heads/', '') AS head_branch, - - benchmark_extra_info['output'] AS output, - - REGEXP_REPLACE( - benchmark_extra_info['output'], - CONCAT('_', suite, '_', {dtype:String}, '_', {mode:String}, '_', {device:String}, '_.*'), - '' - ) AS temp - - FROM benchmark.oss_ci_benchmark_torchinductor - WHERE - timestamp >= toUnixTimestamp({startTime:DateTime64(3)}) AND - timestamp < toUnixTimestamp({stopTime:DateTime64(3)}) AND - (has({commits:Array(String)}, head_sha) OR empty({commits:Array(String)})) AND - (has({suites:Array(String)}, suite) OR empty({suites:Array(String)})) AND - (workflow_id = {workflowId:Int64} OR {workflowId:Int64} = 0) -) - -SELECT - workflow_id, - job_id, - REGEXP_REPLACE(temp, '.*/', '') AS backend, - suite, - model_name AS model, - metric_name AS metric, - value, - output, - granularity_bucket, - extra_info, -FROM benchmarks -WHERE - (has({branches:Array(String)}, head_branch) OR empty({branches:Array(String)})) - AND ( - ( - ({arch:String} = '' OR {arch:String} = 'a100') AND - output LIKE CONCAT('%\_', {dtype:String}, '\_', {mode:String}, '\_', {device:String}, '\_%') - ) OR ( - {arch:String} != '' AND - output LIKE CONCAT('%\_', {dtype:String}, '\_', {mode:String}, '\_', {device:String}, '\_', {arch:String}, '\_%') - ) OR ( - benchmark_dtype = {dtype:String} AND - benchmark_mode = {mode:String} AND - device = {device:String} AND - arch = {arch:String} - ) - ); From 7d882c530180de90bab6828199c28f91a6f62fb2 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 3 Sep 2025 13:12:09 -0700 Subject: [PATCH 27/27] addid --- torchci/lib/benchmark/compilerUtils.ts | 6 - .../pages/api/benchmark/get_time_series.ts | 190 ------------------ 2 files changed, 196 deletions(-) diff --git a/torchci/lib/benchmark/compilerUtils.ts b/torchci/lib/benchmark/compilerUtils.ts index 6d3e0902d9..00212177b3 100644 --- a/torchci/lib/benchmark/compilerUtils.ts +++ b/torchci/lib/benchmark/compilerUtils.ts @@ -100,10 +100,7 @@ export function computePassrate( const [bucket, workflowId, suite, compiler] = key.split("+"); passrate.push({ metric: "passrate", -<<<<<<< HEAD value: p, -======= ->>>>>>> 556c0ef04 (addid) granularity_bucket: bucket, workflow_id: workflowId, suite: suite, @@ -169,10 +166,7 @@ export function computeGeomean( const [bucket, workflowId, suite, compiler] = key.split("+"); returnedGeomean.push({ metric: "geomean", -<<<<<<< HEAD value: Number(gm), -======= ->>>>>>> 556c0ef04 (addid) granularity_bucket: bucket, workflow_id: workflowId, suite: suite, diff --git a/torchci/pages/api/benchmark/get_time_series.ts 
index a51a4f77cd..ce069f5590 100644
--- a/torchci/pages/api/benchmark/get_time_series.ts
+++ b/torchci/pages/api/benchmark/get_time_series.ts
@@ -1,28 +1,6 @@
-<<<<<<< HEAD
 import { getCompilerBenchmarkData } from "lib/benchmark/api_helper/compilers/precompute";
 import { readApiGetParams } from "lib/benchmark/api_helper/utils";
 import type { NextApiRequest, NextApiResponse } from "next";
-=======
-import {
-  computeGeomean,
-  computePassrate,
-  computePeakMemoryUsage,
-  convertToCompilerPerformanceData,
-  getPassingModels,
-} from "lib/benchmark/compilerUtils";
-import { queryClickhouseSaved } from "lib/clickhouse";
-import type { NextApiRequest, NextApiResponse } from "next";
-import { getNestedField } from "./group_data";
-
-type GroupInfo = Record;
-type Subgroup = { group_Info: GroupInfo; data: T[] };
-type GroupedItem = {
-  group_Info: GroupInfo;
-  rows: Record>;
-};
-type Params = Record;
-const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance_v2";
->>>>>>> 556c0ef04 (addid)

 /**
  * API Route: /api/benchmark/get_time_series
@@ -83,171 +61,3 @@ async function getBenmarkTimeSeriesData(
       throw new Error(`Unsupported request_name: ${request_name}`);
   }
 }
-<<<<<<< HEAD
-=======
-
-// Utility to extract params from either GET or POST
-// it accepts both ?parameters= and POST with JSON body
-function readParams(req: NextApiRequest): Params {
-  // 1) If POST with parsed JSON body
-  if (req.method === "POST" && req.body && typeof req.body === "object") {
-    return req.body as Params;
-  }
-
-  // 2) If POST with raw string body
-  if (
-    req.method === "POST" &&
-    typeof req.body === "string" &&
-    req.body.trim()
-  ) {
-    try {
-      return JSON.parse(req.body) as Params;
-    } catch {}
-  }
-
-  // 3) If GET with ?parameters=
-  const raw = req.query.parameters as string | undefined;
-  if (raw) {
-    try {
-      return JSON.parse(raw) as Params;
-    } catch {}
-  }
-
-  // 4) Fallback: use query params directly
-  const q: Params = {};
-  Object.entries(req.query).forEach(([k, v]) => {
-    if (k !== "parameters") q[k] = Array.isArray(v) ? v[0] : v;
-  });
-  return q;
-}
-
-/**
- * Group data by `keys`, and inside each group further subgroup by `subGroupKeys`.
- */
-function groupBy(
-  data: T[],
-  keys: string[],
-  subGroupKeys: string[] = []
-): GroupedItem[] {
-  const groups = new Map>>();
-  const mainInfo = new Map();
-
-  for (const row of data as any[]) {
-    // build main group key
-    const mainKeyParts = keys.map((k) => String(getNestedField(row, k)));
-    const mainKey = mainKeyParts.join("|");
-    if (!mainInfo.has(mainKey)) {
-      const info: GroupInfo = {};
-      keys.forEach((k, i) => (info[k] = mainKeyParts[i]));
-      mainInfo.set(mainKey, info);
-    }
-
-    // build subgroup key
-    const subKeyParts =
-      subGroupKeys.length > 0
-        ? subGroupKeys.map((k) => String(getNestedField(row, k)))
-        : ["__ALL__"]; // default single subgroup if none provided
-    const subKey = subKeyParts.join("|");
-    const subInfo: GroupInfo = {};
-
-    subGroupKeys.forEach((k, i) => (subInfo[k] = subKeyParts[i]));
-
-    if (!groups.has(mainKey)) groups.set(mainKey, new Map());
-    const subMap = groups.get(mainKey)!;
-
-    if (!subMap.has(subKey)) {
-      subMap.set(subKey, { group_Info: subInfo, data: [] });
-    }
-    subMap.get(subKey)!.data.push(row as T);
-  }
-
-  // build result array
-  const result: GroupedItem[] = [];
-  for (const [mainKey, subMap] of groups.entries()) {
-    const rowsObj = Object.fromEntries(subMap.entries());
-    result.push({
-      group_Info: mainInfo.get(mainKey)!,
-      rows: rowsObj,
-    });
-  }
-  return result;
-}
-
-async function getCompilerBenchmarkData(inputparams: any) {
-  const start = Date.now();
-  const rows = await queryClickhouseSaved(BENCNMARK_TABLE_NAME, inputparams);
-  const end = Date.now();
-  const result = toPrecomputeCompiler(rows, inputparams, "time_series");
-  console.log("time to get data", end - start);
-  return result;
-}
-
-function toPrecomputeCompiler(
-  rawData: any[],
-  inputparams: any,
-  type: string = "time_series"
-) {
-  const data = convertToCompilerPerformanceData(rawData);
-  const models = getPassingModels(data);
-
-  const passrate = computePassrate(data, models);
-  const geomean = computeGeomean(data, models);
-  const peakMemory = computePeakMemoryUsage(data, models);
-
-  const all_data = [passrate, geomean, peakMemory].flat();
-
-  all_data.map((row) => {
-    row["dtype"] = inputparams["dtype"];
-    row["arch"] = inputparams["arch"];
-    row["device"] = inputparams["device"];
-    row["mode"] = inputparams["mode"];
-  });
-
-  let res: any[] = [];
-  switch (type) {
-    case "time_series":
-      // grouping data by comipler, device, arch, dtype, suite, metric, mode
-      // then sorted it with granularity_bucket in ascending order
-      const tsd = groupBy(
-        all_data,
-        ["dtype", "arch", "device", "suite", "compiler", "metric", "mode"],
-        ["workflow_id"]
-      );
-      res = tsd.map((group) => {
-        const group_info = group.group_Info;
-        const group_data = group.rows;
-
-        // no need for the group_info for subgroup, directly get the data
-        const ts_list = Object.values(group_data)
-          .filter((item) => item.data.length > 0)
-          .map((item) => item.data[0])
-          .sort(
-            (a, b) =>
-              new Date(a.granularity_bucket).getTime() -
-              new Date(b.granularity_bucket).getTime()
-          );
-        return {
-          group_info,
-          num_of_dp: ts_list.length,
-          result: ts_list,
-        };
-      });
-      return res;
-    case "table":
-      res = groupBy(
-        all_data,
-        [
-          "dtype",
-          "arch",
-          "device",
-          "mode",
-          "workflow_id",
-          "granularity_bucket",
-        ],
-        ["metric", "compiler"]
-      );
-  }
-
-  return res;
-}
->>>>>>> 556c0ef04 (addid)
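As a closing illustration of how the baseline_aggregation modes from PATCH 23 collapse a window of baseline points into the single value the policy compares against, here is a small self-contained sketch. The percentile helper uses linear interpolation like the patched helper; the function names and the sample series below are assumptions made for this example, not code from the repository.

    import math
    import statistics


    def percentile(values: list[float], q: float) -> float:
        # Linear interpolation between the two closest ranks.
        v = sorted(values)
        k = (len(v) - 1) * q
        f = math.floor(k)
        c = math.ceil(k)
        if f == c:
            return v[int(k)]
        return v[f] + (v[c] - v[f]) * (k - f)


    def aggregate_baseline(values: list[float], mode: str = "max") -> float:
        # Collapse a window of baseline points into one number, per the chosen mode.
        if mode == "mean":
            return statistics.fmean(values)
        if mode == "max":
            return max(values)
        if mode == "min":
            return min(values)
        if mode == "latest":
            return values[-1]
        if mode == "earliest":
            return values[0]
        if mode in ("p50", "p90", "p95"):
            return percentile(values, int(mode[1:]) / 100)
        raise ValueError(f"Unknown mode: {mode}")


    baseline_window = [0.93, 0.95, 0.94]  # hypothetical passrate values in the baseline range
    print(aggregate_baseline(baseline_window))         # 0.95 with the default "max"
    print(aggregate_baseline(baseline_window, "p50"))  # 0.94

The aggregated number is what feeds the baseline argument of the policy check, so a "max" aggregation makes the comparison strictest for higher-is-better metrics such as passrate and geomean.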