From 62bca1b37b624e37c7152e318190a27d0b5bf1d9 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Sun, 31 Aug 2025 11:15:09 -0700 Subject: [PATCH 01/27] addid --- .../params.json | 17 +++ .../query.sql | 110 ++++++++++++++++++ .../components/metrics/panels/TablePanel.tsx | 1 + .../benchmark/compiler_benmark_time_series.ts | 40 +++++++ torchci/pages/api/benchmark/group_data.ts | 1 + 5 files changed, 169 insertions(+) create mode 100644 torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json create mode 100644 torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql create mode 100644 torchci/pages/api/benchmark/compiler_benmark_time_series.ts diff --git a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json new file mode 100644 index 0000000000..95f00e1501 --- /dev/null +++ b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json @@ -0,0 +1,17 @@ +{ + "params": { + "branches": "Array(String)", + "commits": "Array(String)", + "compilers": "Array(String)", + "device": "String", + "arch": "String", + "dtype": "String", + "granularity": "String", + "mode": "String", + "startTime": "DateTime64(3)", + "stopTime": "DateTime64(3)", + "suites": "Array(String)", + "workflowId": "Int64" + }, + "tests": [] +} diff --git a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql new file mode 100644 index 0000000000..7ef1efb232 --- /dev/null +++ b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql @@ -0,0 +1,110 @@ +-- This query is used to get the PT2 benchmark results from different experiments +-- to powers the TorchInductor benchmark dashboard +WITH benchmarks AS ( + SELECT + workflow_id, + job_id, + suite, + model_name, + metric_name, + value, + metric_extra_info AS extra_info, + DATE_TRUNC( + {granularity: String }, + fromUnixTimestamp(timestamp) + ) AS granularity_bucket, + -- Filters + benchmark_dtype, + benchmark_mode, + device, + arch, + replaceOne(head_branch, 'refs/heads/', '') AS head_branch, + benchmark_extra_info['output'] AS output, + REGEXP_REPLACE( + output, + CONCAT( + '_', + suite, + '_', + { dtype: String }, + '_', + {mode: String }, + '_', + {device: String }, + '_.*' + ), + '' + ) AS temp + FROM + benchmark.oss_ci_benchmark_torchinductor + WHERE + timestamp >= toUnixTimestamp({startTime: DateTime64(3) }) + AND timestamp < toUnixTimestamp({stopTime: DateTime64(3) }) + AND ( + has({commits: Array(String) }, head_sha) + OR empty({commits: Array(String) }) + ) + AND ( + has({suites: Array(String) }, suite) + OR empty({suites: Array(String) }) + ) + AND ( + workflow_id = {workflowId: Int64} + OR {workflowId: Int64} = 0 + ) +) + +SELECT + workflow_id, + job_id, + REGEXP_REPLACE(temp, '.*/', '') AS backend, + suite, + model_name AS model, + metric_name AS metric, + value, + extra_info, + output, + granularity_bucket +FROM + benchmarks +WHERE + ( + has({branches: Array(String) }, head_branch) + OR empty({branches: Array(String) }) + ) + -- TODO (huydhn): Clean up the output field and how it's used in the query + -- in 6 months + AND ( + ( + ({arch: String } = '' OR {arch: String } = 'a100') + AND output LIKE CONCAT( + '%\_', + {dtype: String }, + '\_', + {mode: String }, + '\_', + {device: String }, + '\_%' + ) + ) + OR ( + {arch: String } != '' + AND output LIKE CONCAT( + '%\_', + {dtype: String }, + '\_', + {mode: String }, + '\_', + 
{device: String }, + '\_', + {arch: String }, + '\_%' + ) + ) + OR ( + benchmark_dtype = {dtype: String } + AND benchmark_mode = {mode: String } + AND device = {device: String } + AND arch = {arch: String } + ) + ) diff --git a/torchci/components/metrics/panels/TablePanel.tsx b/torchci/components/metrics/panels/TablePanel.tsx index d53d0ab8d6..023223d238 100644 --- a/torchci/components/metrics/panels/TablePanel.tsx +++ b/torchci/components/metrics/panels/TablePanel.tsx @@ -7,6 +7,7 @@ import useSWR from "swr"; const fetcher = (url: string) => fetch(url).then((res) => res.json()); + export default function TablePanel({ // Human-readable title for this panel. title, diff --git a/torchci/pages/api/benchmark/compiler_benmark_time_series.ts b/torchci/pages/api/benchmark/compiler_benmark_time_series.ts new file mode 100644 index 0000000000..4e8254766d --- /dev/null +++ b/torchci/pages/api/benchmark/compiler_benmark_time_series.ts @@ -0,0 +1,40 @@ + +import { queryClickhouseSaved } from "lib/clickhouse"; +import type { NextApiRequest, NextApiResponse } from "next"; + +const DEFAULT_TABLE_GROUP = [ + "device", + "backend", + "model", + "dtype", + "backend", + "arch", +]; +const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance_v2"; + +type QueryParams = { + startTime: string; // ISO timestamp + stopTime: string; // ISO timestamp + [k: string]: any; // other parameters +}; + + +export default async function handler( + req: NextApiRequest, + res: NextApiResponse +) { + + console.log("compiler_benmark_time_series."); + const inputparams = JSON.parse(req.query.parameters as string) + console.log("inputs", inputparams); + + const start = Date.now(); + const rows = await queryClickhouseSaved(BENCNMARK_TABLE_NAME, inputparams); + const end = Date.now(); + console.log(`Process took ${end - start}ms`); + + console.log("merged rows:", rows.length); + res.status(200).json({ data: rows }); +} + + diff --git a/torchci/pages/api/benchmark/group_data.ts b/torchci/pages/api/benchmark/group_data.ts index 713ed480d9..d8703d8a8b 100644 --- a/torchci/pages/api/benchmark/group_data.ts +++ b/torchci/pages/api/benchmark/group_data.ts @@ -32,6 +32,7 @@ export default async function handler( details: formatZodError(request.error), }); } + const qp = request.data; const groupTableByFields = qp.group_table_by_fields || deepClone(DEFAULT_TABLE_GROUP); From 6a5f5006a29069d2a6cbec50df2e2dfa43a5ef48 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Sun, 31 Aug 2025 18:12:51 -0700 Subject: [PATCH 02/27] addid --- .../query.sql | 155 +++++--------- .../components/metrics/panels/TablePanel.tsx | 2 +- torchci/lib/benchmark/compilerUtils.ts | 48 +++++ .../benchmark/compiler_benmark_time_series.ts | 40 ---- .../pages/api/benchmark/get_time_series.ts | 202 ++++++++++++++++++ torchci/pages/api/benchmark/group_data.ts | 2 +- 6 files changed, 305 insertions(+), 144 deletions(-) delete mode 100644 torchci/pages/api/benchmark/compiler_benmark_time_series.ts diff --git a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql index 7ef1efb232..bf233f6ff8 100644 --- a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql +++ b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql @@ -1,110 +1,61 @@ --- This query is used to get the PT2 benchmark results from different experiments --- to powers the TorchInductor benchmark dashboard WITH benchmarks AS ( - SELECT - workflow_id, - job_id, - suite, - model_name, - 
metric_name, - value, - metric_extra_info AS extra_info, - DATE_TRUNC( - {granularity: String }, - fromUnixTimestamp(timestamp) - ) AS granularity_bucket, - -- Filters - benchmark_dtype, - benchmark_mode, - device, - arch, - replaceOne(head_branch, 'refs/heads/', '') AS head_branch, - benchmark_extra_info['output'] AS output, - REGEXP_REPLACE( - output, - CONCAT( - '_', - suite, - '_', - { dtype: String }, - '_', - {mode: String }, - '_', - {device: String }, - '_.*' - ), - '' - ) AS temp - FROM - benchmark.oss_ci_benchmark_torchinductor - WHERE - timestamp >= toUnixTimestamp({startTime: DateTime64(3) }) - AND timestamp < toUnixTimestamp({stopTime: DateTime64(3) }) - AND ( - has({commits: Array(String) }, head_sha) - OR empty({commits: Array(String) }) - ) - AND ( - has({suites: Array(String) }, suite) - OR empty({suites: Array(String) }) - ) - AND ( - workflow_id = {workflowId: Int64} - OR {workflowId: Int64} = 0 - ) -) - -SELECT + SELECT workflow_id, job_id, - REGEXP_REPLACE(temp, '.*/', '') AS backend, suite, - model_name AS model, - metric_name AS metric, + model_name, + metric_name, value, - extra_info, - output, - granularity_bucket -FROM - benchmarks + metric_extra_info AS extra_info, + DATE_TRUNC({granularity:String}, fromUnixTimestamp(timestamp)) AS granularity_bucket, + benchmark_dtype, + benchmark_mode, + device, + arch, + replaceOne(head_branch, 'refs/heads/', '') AS head_branch, + + benchmark_extra_info['output'] AS output, + + REGEXP_REPLACE( + benchmark_extra_info['output'], + CONCAT('_', suite, '_', {dtype:String}, '_', {mode:String}, '_', {device:String}, '_.*'), + '' + ) AS temp + + FROM benchmark.oss_ci_benchmark_torchinductor + WHERE + timestamp >= toUnixTimestamp({startTime:DateTime64(3)}) AND + timestamp < toUnixTimestamp({stopTime:DateTime64(3)}) AND + (has({commits:Array(String)}, head_sha) OR empty({commits:Array(String)})) AND + (has({suites:Array(String)}, suite) OR empty({suites:Array(String)})) AND + (workflow_id = {workflowId:Int64} OR {workflowId:Int64} = 0) +) + +SELECT + workflow_id, + job_id, + REGEXP_REPLACE(temp, '.*/', '') AS backend, + suite, + model_name AS model, + metric_name AS metric, + value, + output, + granularity_bucket, + extra_info, +FROM benchmarks WHERE + (has({branches:Array(String)}, head_branch) OR empty({branches:Array(String)})) + AND ( ( - has({branches: Array(String) }, head_branch) - OR empty({branches: Array(String) }) - ) - -- TODO (huydhn): Clean up the output field and how it's used in the query - -- in 6 months - AND ( - ( - ({arch: String } = '' OR {arch: String } = 'a100') - AND output LIKE CONCAT( - '%\_', - {dtype: String }, - '\_', - {mode: String }, - '\_', - {device: String }, - '\_%' - ) - ) - OR ( - {arch: String } != '' - AND output LIKE CONCAT( - '%\_', - {dtype: String }, - '\_', - {mode: String }, - '\_', - {device: String }, - '\_', - {arch: String }, - '\_%' - ) - ) - OR ( - benchmark_dtype = {dtype: String } - AND benchmark_mode = {mode: String } - AND device = {device: String } - AND arch = {arch: String } - ) + ({arch:String} = '' OR {arch:String} = 'a100') AND + output LIKE CONCAT('%\_', {dtype:String}, '\_', {mode:String}, '\_', {device:String}, '\_%') + ) OR ( + {arch:String} != '' AND + output LIKE CONCAT('%\_', {dtype:String}, '\_', {mode:String}, '\_', {device:String}, '\_', {arch:String}, '\_%') + ) OR ( + benchmark_dtype = {dtype:String} AND + benchmark_mode = {mode:String} AND + device = {device:String} AND + arch = {arch:String} ) + ); diff --git a/torchci/components/metrics/panels/TablePanel.tsx 
b/torchci/components/metrics/panels/TablePanel.tsx index 023223d238..c53b29f4e3 100644 --- a/torchci/components/metrics/panels/TablePanel.tsx +++ b/torchci/components/metrics/panels/TablePanel.tsx @@ -1,13 +1,13 @@ import HelpIcon from "@mui/icons-material/Help"; import { Box, Skeleton, Typography } from "@mui/material"; import IconButton from "@mui/material/IconButton"; +import { Box } from "@mui/system"; import { DataGrid, GridColDef } from "@mui/x-data-grid"; import { CSSProperties } from "react"; import useSWR from "swr"; const fetcher = (url: string) => fetch(url).then((res) => res.json()); - export default function TablePanel({ // Human-readable title for this panel. title, diff --git a/torchci/lib/benchmark/compilerUtils.ts b/torchci/lib/benchmark/compilerUtils.ts index 06e4542269..00212177b3 100644 --- a/torchci/lib/benchmark/compilerUtils.ts +++ b/torchci/lib/benchmark/compilerUtils.ts @@ -456,3 +456,51 @@ export function convertToCompilerPerformanceData(data: BenchmarkData[]) { return Object.values(convertData); } + +export function computePassrateSimple(data: any[]) { + if (!Array.isArray(data) || data.length === 0) return []; + + const blocked = new Set(BLOCKLIST_COMPILERS); + const passingAcc = new Set(PASSING_ACCURACY); + const toDisplay = (c: string) => COMPILER_NAMES_TO_DISPLAY_NAMES[c] ?? c; + + const totalCount = new Map(); + const passCount = new Map(); + + for (const r of data) { + const compilerDisp = toDisplay(r.compiler); + if (blocked.has(compilerDisp)) continue; + + const key = `${r.granularity_bucket}+${r.workflow_id}+${r.suite}+${compilerDisp}`; + + // 计总 + totalCount.set(key, (totalCount.get(key) ?? 0) + 1); + + const acc = r.accuracy ?? ""; + const speed = r.speedup ?? 0; + const pass = + (passingAcc.has(acc) && (speed !== 0 || compilerDisp === "export")) || + acc === "pass_due_to_skip"; + + if (pass) passCount.set(key, (passCount.get(key) ?? 0) + 1); + } + const out: any[] = []; + for (const [key, tc] of totalCount) { + const pc = passCount.get(key) ?? 0; + const p = tc > 0 ? 
pc / tc : 0; + + const [bucket, wfStr, suite, compiler] = key.split("+"); + out.push({ + metirc: "passrate", + granularity_bucket: bucket, + workflow_id: Number(wfStr), + suite, + compiler, + passrate: p, + pass_count: pc, + total_count: tc, + passrate_display: `${(p * 100).toFixed(0)}%, ${pc}/${tc}`, + }); + } + return out; +} diff --git a/torchci/pages/api/benchmark/compiler_benmark_time_series.ts b/torchci/pages/api/benchmark/compiler_benmark_time_series.ts deleted file mode 100644 index 4e8254766d..0000000000 --- a/torchci/pages/api/benchmark/compiler_benmark_time_series.ts +++ /dev/null @@ -1,40 +0,0 @@ - -import { queryClickhouseSaved } from "lib/clickhouse"; -import type { NextApiRequest, NextApiResponse } from "next"; - -const DEFAULT_TABLE_GROUP = [ - "device", - "backend", - "model", - "dtype", - "backend", - "arch", -]; -const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance_v2"; - -type QueryParams = { - startTime: string; // ISO timestamp - stopTime: string; // ISO timestamp - [k: string]: any; // other parameters -}; - - -export default async function handler( - req: NextApiRequest, - res: NextApiResponse -) { - - console.log("compiler_benmark_time_series."); - const inputparams = JSON.parse(req.query.parameters as string) - console.log("inputs", inputparams); - - const start = Date.now(); - const rows = await queryClickhouseSaved(BENCNMARK_TABLE_NAME, inputparams); - const end = Date.now(); - console.log(`Process took ${end - start}ms`); - - console.log("merged rows:", rows.length); - res.status(200).json({ data: rows }); -} - - diff --git a/torchci/pages/api/benchmark/get_time_series.ts b/torchci/pages/api/benchmark/get_time_series.ts index ce069f5590..7f05d6b492 100644 --- a/torchci/pages/api/benchmark/get_time_series.ts +++ b/torchci/pages/api/benchmark/get_time_series.ts @@ -1,6 +1,28 @@ +<<<<<<< HEAD import { getCompilerBenchmarkData } from "lib/benchmark/api_helper/compilers/precompute"; import { readApiGetParams } from "lib/benchmark/api_helper/utils"; import type { NextApiRequest, NextApiResponse } from "next"; +======= +import { + computeGeomean, + computePassrate, + computePeakMemoryUsage, + convertToCompilerPerformanceData, + getPassingModels, +} from "lib/benchmark/compilerUtils"; +import { queryClickhouseSaved } from "lib/clickhouse"; +import type { NextApiRequest, NextApiResponse } from "next"; +import { getNestedField } from "./group_data"; + +type GroupInfo = Record; +type Subgroup = { group_Info: GroupInfo; data: T[] }; +type GroupedItem = { + group_Info: GroupInfo; + rows: Record>; +}; +type Params = Record; +const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance_v2"; +>>>>>>> 52d0ced7a (addid) /** * API Route: /api/benchmark/get_time_series @@ -22,12 +44,20 @@ export default async function handler( req: NextApiRequest, res: NextApiResponse ) { +<<<<<<< HEAD +======= + +>>>>>>> 52d0ced7a (addid) if (req.method !== "GET" && req.method !== "POST") { res.setHeader("Allow", "GET, POST"); return res.status(405).json({ error: "Only GET and POST allowed" }); } +<<<<<<< HEAD const params = readApiGetParams(req); +======= + const params = readParams(req); +>>>>>>> 52d0ced7a (addid) console.log("[API]get_time_series, received request:", params); // validate params @@ -39,6 +69,10 @@ export default async function handler( ) { return res.status(400).json({ error: "Missing parameters" }); } +<<<<<<< HEAD +======= + +>>>>>>> 52d0ced7a (addid) // get time series data try { const { name, query_params } = params; @@ -61,3 +95,171 @@ async function 
getBenmarkTimeSeriesData( throw new Error(`Unsupported request_name: ${request_name}`); } } +<<<<<<< HEAD +======= + +// Utility to extract params from either GET or POST +// it accepts both ?parameters= and POST with JSON body +function readParams(req: NextApiRequest): Params { + // 1) If POST with parsed JSON body + if (req.method === "POST" && req.body && typeof req.body === "object") { + return req.body as Params; + } + + // 2) If POST with raw string body + if ( + req.method === "POST" && + typeof req.body === "string" && + req.body.trim() + ) { + try { + return JSON.parse(req.body) as Params; + } catch {} + } + + // 3) If GET with ?parameters= + const raw = req.query.parameters as string | undefined; + if (raw) { + try { + return JSON.parse(raw) as Params; + } catch {} + } + + // 4) Fallback: use query params directly + const q: Params = {}; + Object.entries(req.query).forEach(([k, v]) => { + if (k !== "parameters") q[k] = Array.isArray(v) ? v[0] : v; + }); + return q; +} + +/** + * Group data by `keys`, and inside each group further subgroup by `subGroupKeys`. + */ +function groupBy( + data: T[], + keys: string[], + subGroupKeys: string[] = [] +): GroupedItem[] { + const groups = new Map>>(); + const mainInfo = new Map(); + + for (const row of data as any[]) { + // build main group key + const mainKeyParts = keys.map((k) => String(getNestedField(row, k))); + const mainKey = mainKeyParts.join("|"); + if (!mainInfo.has(mainKey)) { + const info: GroupInfo = {}; + keys.forEach((k, i) => (info[k] = mainKeyParts[i])); + mainInfo.set(mainKey, info); + } + + // build subgroup key + const subKeyParts = + subGroupKeys.length > 0 + ? subGroupKeys.map((k) => String(getNestedField(row, k))) + : ["__ALL__"]; // default single subgroup if none provided + const subKey = subKeyParts.join("|"); + const subInfo: GroupInfo = {}; + + subGroupKeys.forEach((k, i) => (subInfo[k] = subKeyParts[i])); + + if (!groups.has(mainKey)) groups.set(mainKey, new Map()); + const subMap = groups.get(mainKey)!; + + if (!subMap.has(subKey)) { + subMap.set(subKey, { group_Info: subInfo, data: [] }); + } + subMap.get(subKey)!.data.push(row as T); + } + + // build result array + const result: GroupedItem[] = []; + for (const [mainKey, subMap] of groups.entries()) { + const rowsObj = Object.fromEntries(subMap.entries()); + result.push({ + group_Info: mainInfo.get(mainKey)!, + rows: rowsObj, + }); + } + return result; +} + +async function getCompilerBenchmarkData(inputparams: any) { + const start = Date.now(); + const rows = await queryClickhouseSaved(BENCNMARK_TABLE_NAME, inputparams); + const end = Date.now(); + const result = toPrecomputeCompiler(rows, inputparams, "time_series"); + console.log("time to get data", end - start); + return result; +} + +function toPrecomputeCompiler( + rawData: any[], + inputparams: any, + type: string = "time_series" +) { + const data = convertToCompilerPerformanceData(rawData); + const models = getPassingModels(data); + + const passrate = computePassrate(data, models); + const geomean = computeGeomean(data, models); + const peakMemory = computePeakMemoryUsage(data, models); + + const all_data = [passrate, geomean, peakMemory].flat(); + + all_data.map((row) => { + row["dtype"] = inputparams["dtype"]; + row["arch"] = inputparams["arch"]; + row["device"] = inputparams["device"]; + row["mode"] = inputparams["mode"]; + }); + + let res: any[] = []; + switch (type) { + case "time_series": + // grouping data by comipler, device, arch, dtype, suite, metric, mode + // then sorted it with 
granularity_bucket in ascending order + const tsd = groupBy( + all_data, + ["dtype", "arch", "device", "suite", "compiler", "metric", "mode"], + ["workflow_id"] + ); + res = tsd.map((group) => { + const group_info = group.group_Info; + const group_data = group.rows; + + // no need for the group_info for subgroup, directly get the data + const ts_list = Object.values(group_data) + .filter((item) => item.data.length > 0) + .map((item) => item.data[0]) + .sort( + (a, b) => + new Date(a.granularity_bucket).getTime() - + new Date(b.granularity_bucket).getTime() + ); + return { + group_info, + num_of_dp: ts_list.length, + result: ts_list, + }; + }); + return res; + case "table": + res = groupBy( + all_data, + [ + "dtype", + "arch", + "device", + "mode", + "workflow_id", + "granularity_bucket", + ], + ["metric", "compiler"] + ); + } + + return res; +} +>>>>>>> 52d0ced7a (addid) diff --git a/torchci/pages/api/benchmark/group_data.ts b/torchci/pages/api/benchmark/group_data.ts index d8703d8a8b..030d79cc24 100644 --- a/torchci/pages/api/benchmark/group_data.ts +++ b/torchci/pages/api/benchmark/group_data.ts @@ -16,7 +16,7 @@ const DEFAULT_TABLE_GROUP = [ const DEFAULT_ROW_GROUP = ["workflow_id", "job_id", "metadata_info.timestamp"]; const BENCNMARK_TABLE_NAME = "oss_ci_benchmark_llms"; -function getNestedField(obj: any, path: string): any { +export function getNestedField(obj: any, path: string): any { return path.split(".").reduce((o, key) => (o && key in o ? o[key] : ""), obj); } From 9de60df4400fefffaf60050d7cfe4e49cb56db78 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Sun, 31 Aug 2025 18:15:02 -0700 Subject: [PATCH 03/27] addid --- .../params.json | 17 ------ .../query.sql | 61 ------------------- .../pages/api/benchmark/get_time_series.ts | 4 ++ 3 files changed, 4 insertions(+), 78 deletions(-) delete mode 100644 torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json delete mode 100644 torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql diff --git a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json deleted file mode 100644 index 95f00e1501..0000000000 --- a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "params": { - "branches": "Array(String)", - "commits": "Array(String)", - "compilers": "Array(String)", - "device": "String", - "arch": "String", - "dtype": "String", - "granularity": "String", - "mode": "String", - "startTime": "DateTime64(3)", - "stopTime": "DateTime64(3)", - "suites": "Array(String)", - "workflowId": "Int64" - }, - "tests": [] -} diff --git a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql deleted file mode 100644 index bf233f6ff8..0000000000 --- a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql +++ /dev/null @@ -1,61 +0,0 @@ -WITH benchmarks AS ( - SELECT - workflow_id, - job_id, - suite, - model_name, - metric_name, - value, - metric_extra_info AS extra_info, - DATE_TRUNC({granularity:String}, fromUnixTimestamp(timestamp)) AS granularity_bucket, - benchmark_dtype, - benchmark_mode, - device, - arch, - replaceOne(head_branch, 'refs/heads/', '') AS head_branch, - - benchmark_extra_info['output'] AS output, - - REGEXP_REPLACE( - benchmark_extra_info['output'], - CONCAT('_', suite, '_', {dtype:String}, '_', {mode:String}, '_', {device:String}, '_.*'), - '' - 
) AS temp - - FROM benchmark.oss_ci_benchmark_torchinductor - WHERE - timestamp >= toUnixTimestamp({startTime:DateTime64(3)}) AND - timestamp < toUnixTimestamp({stopTime:DateTime64(3)}) AND - (has({commits:Array(String)}, head_sha) OR empty({commits:Array(String)})) AND - (has({suites:Array(String)}, suite) OR empty({suites:Array(String)})) AND - (workflow_id = {workflowId:Int64} OR {workflowId:Int64} = 0) -) - -SELECT - workflow_id, - job_id, - REGEXP_REPLACE(temp, '.*/', '') AS backend, - suite, - model_name AS model, - metric_name AS metric, - value, - output, - granularity_bucket, - extra_info, -FROM benchmarks -WHERE - (has({branches:Array(String)}, head_branch) OR empty({branches:Array(String)})) - AND ( - ( - ({arch:String} = '' OR {arch:String} = 'a100') AND - output LIKE CONCAT('%\_', {dtype:String}, '\_', {mode:String}, '\_', {device:String}, '\_%') - ) OR ( - {arch:String} != '' AND - output LIKE CONCAT('%\_', {dtype:String}, '\_', {mode:String}, '\_', {device:String}, '\_', {arch:String}, '\_%') - ) OR ( - benchmark_dtype = {dtype:String} AND - benchmark_mode = {mode:String} AND - device = {device:String} AND - arch = {arch:String} - ) - ); diff --git a/torchci/pages/api/benchmark/get_time_series.ts b/torchci/pages/api/benchmark/get_time_series.ts index 7f05d6b492..5de906f554 100644 --- a/torchci/pages/api/benchmark/get_time_series.ts +++ b/torchci/pages/api/benchmark/get_time_series.ts @@ -21,8 +21,12 @@ type GroupedItem = { rows: Record>; }; type Params = Record; +<<<<<<< HEAD const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance_v2"; >>>>>>> 52d0ced7a (addid) +======= +const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance"; +>>>>>>> e259e2663 (addid) /** * API Route: /api/benchmark/get_time_series From 94fbf8da51985218a8a09d709dcc406236499459 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Sun, 31 Aug 2025 18:26:33 -0700 Subject: [PATCH 04/27] addid --- .../pages/api/benchmark/get_time_series.ts | 206 ------------------ 1 file changed, 206 deletions(-) diff --git a/torchci/pages/api/benchmark/get_time_series.ts b/torchci/pages/api/benchmark/get_time_series.ts index 5de906f554..ce069f5590 100644 --- a/torchci/pages/api/benchmark/get_time_series.ts +++ b/torchci/pages/api/benchmark/get_time_series.ts @@ -1,32 +1,6 @@ -<<<<<<< HEAD import { getCompilerBenchmarkData } from "lib/benchmark/api_helper/compilers/precompute"; import { readApiGetParams } from "lib/benchmark/api_helper/utils"; import type { NextApiRequest, NextApiResponse } from "next"; -======= -import { - computeGeomean, - computePassrate, - computePeakMemoryUsage, - convertToCompilerPerformanceData, - getPassingModels, -} from "lib/benchmark/compilerUtils"; -import { queryClickhouseSaved } from "lib/clickhouse"; -import type { NextApiRequest, NextApiResponse } from "next"; -import { getNestedField } from "./group_data"; - -type GroupInfo = Record; -type Subgroup = { group_Info: GroupInfo; data: T[] }; -type GroupedItem = { - group_Info: GroupInfo; - rows: Record>; -}; -type Params = Record; -<<<<<<< HEAD -const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance_v2"; ->>>>>>> 52d0ced7a (addid) -======= -const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance"; ->>>>>>> e259e2663 (addid) /** * API Route: /api/benchmark/get_time_series @@ -48,20 +22,12 @@ export default async function handler( req: NextApiRequest, res: NextApiResponse ) { -<<<<<<< HEAD -======= - ->>>>>>> 52d0ced7a (addid) if (req.method !== "GET" && req.method !== "POST") { res.setHeader("Allow", "GET, POST"); return 
res.status(405).json({ error: "Only GET and POST allowed" }); } -<<<<<<< HEAD const params = readApiGetParams(req); -======= - const params = readParams(req); ->>>>>>> 52d0ced7a (addid) console.log("[API]get_time_series, received request:", params); // validate params @@ -73,10 +39,6 @@ export default async function handler( ) { return res.status(400).json({ error: "Missing parameters" }); } -<<<<<<< HEAD -======= - ->>>>>>> 52d0ced7a (addid) // get time series data try { const { name, query_params } = params; @@ -99,171 +61,3 @@ async function getBenmarkTimeSeriesData( throw new Error(`Unsupported request_name: ${request_name}`); } } -<<<<<<< HEAD -======= - -// Utility to extract params from either GET or POST -// it accepts both ?parameters= and POST with JSON body -function readParams(req: NextApiRequest): Params { - // 1) If POST with parsed JSON body - if (req.method === "POST" && req.body && typeof req.body === "object") { - return req.body as Params; - } - - // 2) If POST with raw string body - if ( - req.method === "POST" && - typeof req.body === "string" && - req.body.trim() - ) { - try { - return JSON.parse(req.body) as Params; - } catch {} - } - - // 3) If GET with ?parameters= - const raw = req.query.parameters as string | undefined; - if (raw) { - try { - return JSON.parse(raw) as Params; - } catch {} - } - - // 4) Fallback: use query params directly - const q: Params = {}; - Object.entries(req.query).forEach(([k, v]) => { - if (k !== "parameters") q[k] = Array.isArray(v) ? v[0] : v; - }); - return q; -} - -/** - * Group data by `keys`, and inside each group further subgroup by `subGroupKeys`. - */ -function groupBy( - data: T[], - keys: string[], - subGroupKeys: string[] = [] -): GroupedItem[] { - const groups = new Map>>(); - const mainInfo = new Map(); - - for (const row of data as any[]) { - // build main group key - const mainKeyParts = keys.map((k) => String(getNestedField(row, k))); - const mainKey = mainKeyParts.join("|"); - if (!mainInfo.has(mainKey)) { - const info: GroupInfo = {}; - keys.forEach((k, i) => (info[k] = mainKeyParts[i])); - mainInfo.set(mainKey, info); - } - - // build subgroup key - const subKeyParts = - subGroupKeys.length > 0 - ? 
subGroupKeys.map((k) => String(getNestedField(row, k))) - : ["__ALL__"]; // default single subgroup if none provided - const subKey = subKeyParts.join("|"); - const subInfo: GroupInfo = {}; - - subGroupKeys.forEach((k, i) => (subInfo[k] = subKeyParts[i])); - - if (!groups.has(mainKey)) groups.set(mainKey, new Map()); - const subMap = groups.get(mainKey)!; - - if (!subMap.has(subKey)) { - subMap.set(subKey, { group_Info: subInfo, data: [] }); - } - subMap.get(subKey)!.data.push(row as T); - } - - // build result array - const result: GroupedItem[] = []; - for (const [mainKey, subMap] of groups.entries()) { - const rowsObj = Object.fromEntries(subMap.entries()); - result.push({ - group_Info: mainInfo.get(mainKey)!, - rows: rowsObj, - }); - } - return result; -} - -async function getCompilerBenchmarkData(inputparams: any) { - const start = Date.now(); - const rows = await queryClickhouseSaved(BENCNMARK_TABLE_NAME, inputparams); - const end = Date.now(); - const result = toPrecomputeCompiler(rows, inputparams, "time_series"); - console.log("time to get data", end - start); - return result; -} - -function toPrecomputeCompiler( - rawData: any[], - inputparams: any, - type: string = "time_series" -) { - const data = convertToCompilerPerformanceData(rawData); - const models = getPassingModels(data); - - const passrate = computePassrate(data, models); - const geomean = computeGeomean(data, models); - const peakMemory = computePeakMemoryUsage(data, models); - - const all_data = [passrate, geomean, peakMemory].flat(); - - all_data.map((row) => { - row["dtype"] = inputparams["dtype"]; - row["arch"] = inputparams["arch"]; - row["device"] = inputparams["device"]; - row["mode"] = inputparams["mode"]; - }); - - let res: any[] = []; - switch (type) { - case "time_series": - // grouping data by comipler, device, arch, dtype, suite, metric, mode - // then sorted it with granularity_bucket in ascending order - const tsd = groupBy( - all_data, - ["dtype", "arch", "device", "suite", "compiler", "metric", "mode"], - ["workflow_id"] - ); - res = tsd.map((group) => { - const group_info = group.group_Info; - const group_data = group.rows; - - // no need for the group_info for subgroup, directly get the data - const ts_list = Object.values(group_data) - .filter((item) => item.data.length > 0) - .map((item) => item.data[0]) - .sort( - (a, b) => - new Date(a.granularity_bucket).getTime() - - new Date(b.granularity_bucket).getTime() - ); - return { - group_info, - num_of_dp: ts_list.length, - result: ts_list, - }; - }); - return res; - case "table": - res = groupBy( - all_data, - [ - "dtype", - "arch", - "device", - "mode", - "workflow_id", - "granularity_bucket", - ], - ["metric", "compiler"] - ); - } - - return res; -} ->>>>>>> 52d0ced7a (addid) From 5127ca62423c3dd44510d5c1d7c4b2c39d026f7e Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 2 Sep 2025 11:00:06 -0700 Subject: [PATCH 05/27] addid --- torchci/pages/api/benchmark/group_data.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torchci/pages/api/benchmark/group_data.ts b/torchci/pages/api/benchmark/group_data.ts index 030d79cc24..713ed480d9 100644 --- a/torchci/pages/api/benchmark/group_data.ts +++ b/torchci/pages/api/benchmark/group_data.ts @@ -16,7 +16,7 @@ const DEFAULT_TABLE_GROUP = [ const DEFAULT_ROW_GROUP = ["workflow_id", "job_id", "metadata_info.timestamp"]; const BENCNMARK_TABLE_NAME = "oss_ci_benchmark_llms"; -export function getNestedField(obj: any, path: string): any { +function getNestedField(obj: any, path: string): any { 
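  // Illustrative example (row shape assumed from DEFAULT_ROW_GROUP above): for
  //   { workflow_id: 123, job_id: 456, metadata_info: { timestamp: "2025-09-02" } }
  // getNestedField(row, "metadata_info.timestamp") follows the dot-separated path
  // and returns "2025-09-02"; if any segment is missing, the reduce falls back to "".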
return path.split(".").reduce((o, key) => (o && key in o ? o[key] : ""), obj); } @@ -32,7 +32,6 @@ export default async function handler( details: formatZodError(request.error), }); } - const qp = request.data; const groupTableByFields = qp.group_table_by_fields || deepClone(DEFAULT_TABLE_GROUP); From 0b8e72cd2b1500e64cb0523244d6b98586354901 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 2 Sep 2025 13:02:53 -0700 Subject: [PATCH 06/27] addid --- torchci/lib/benchmark/api_helper/compilers/precompute.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/torchci/lib/benchmark/api_helper/compilers/precompute.ts b/torchci/lib/benchmark/api_helper/compilers/precompute.ts index 0a11ab44b3..9dbc888aa2 100644 --- a/torchci/lib/benchmark/api_helper/compilers/precompute.ts +++ b/torchci/lib/benchmark/api_helper/compilers/precompute.ts @@ -60,6 +60,7 @@ function toPrecomputeCompiler( type: string = "time_series" ) { const data = convertToCompilerPerformanceData(rawData); + const models = getPassingModels(data); const passrate = computePassrate(data, models); From 0ec6dcbefddc3bce215ce76528d3858a88a68c85 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 2 Sep 2025 17:49:05 -0700 Subject: [PATCH 07/27] addid --- torchci/lib/benchmark/compilerUtils.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/torchci/lib/benchmark/compilerUtils.ts b/torchci/lib/benchmark/compilerUtils.ts index 00212177b3..68a4c322a4 100644 --- a/torchci/lib/benchmark/compilerUtils.ts +++ b/torchci/lib/benchmark/compilerUtils.ts @@ -4,6 +4,7 @@ import { PASSING_ACCURACY, SCALE, } from "components/benchmark/compilers/common"; +import { number } from "echarts"; import { BenchmarkData, CompilerPerformanceData } from "lib/types"; export function getPassingModels(data: CompilerPerformanceData[]) { From 96233a32638e6e13c2c9b299d586d68f07bcb85e Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 2 Sep 2025 17:49:36 -0700 Subject: [PATCH 08/27] addid --- torchci/lib/benchmark/compilerUtils.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/torchci/lib/benchmark/compilerUtils.ts b/torchci/lib/benchmark/compilerUtils.ts index 68a4c322a4..00212177b3 100644 --- a/torchci/lib/benchmark/compilerUtils.ts +++ b/torchci/lib/benchmark/compilerUtils.ts @@ -4,7 +4,6 @@ import { PASSING_ACCURACY, SCALE, } from "components/benchmark/compilers/common"; -import { number } from "echarts"; import { BenchmarkData, CompilerPerformanceData } from "lib/types"; export function getPassingModels(data: CompilerPerformanceData[]) { From b1c79ce3991062d04d0b43f728b78c3dcefd509e Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 3 Sep 2025 02:01:45 -0700 Subject: [PATCH 09/27] addid --- .gitignore | 1 + torchci/components/benchmark/compilers/SummaryGraphPanel.tsx | 2 ++ torchci/lib/benchmark/api_helper/compilers/precompute.ts | 1 - 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 3be92dafed..e324f591ec 100644 --- a/.gitignore +++ b/.gitignore @@ -72,3 +72,4 @@ aws/tools/cleanup-ssm/**/*.rs.bk # Remove the python version file from pyenv .python-version + diff --git a/torchci/components/benchmark/compilers/SummaryGraphPanel.tsx b/torchci/components/benchmark/compilers/SummaryGraphPanel.tsx index 3faab6f3cd..dcdcbf4a15 100644 --- a/torchci/components/benchmark/compilers/SummaryGraphPanel.tsx +++ b/torchci/components/benchmark/compilers/SummaryGraphPanel.tsx @@ -88,6 +88,8 @@ function SuiteGraphPanel({ JSON.stringify(queryParamsWithSuite) )}`; + console.log("url params", queryParamsWithSuite); + let { data, error } = 
useSWR(url, fetcher, { refreshInterval: 60 * 60 * 1000, // refresh every hour }); diff --git a/torchci/lib/benchmark/api_helper/compilers/precompute.ts b/torchci/lib/benchmark/api_helper/compilers/precompute.ts index 9dbc888aa2..0a11ab44b3 100644 --- a/torchci/lib/benchmark/api_helper/compilers/precompute.ts +++ b/torchci/lib/benchmark/api_helper/compilers/precompute.ts @@ -60,7 +60,6 @@ function toPrecomputeCompiler( type: string = "time_series" ) { const data = convertToCompilerPerformanceData(rawData); - const models = getPassingModels(data); const passrate = computePassrate(data, models); From a21f91adfe4ab7e6e265222651b81a888c14f194 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 3 Sep 2025 10:40:55 -0700 Subject: [PATCH 10/27] addid --- .../api_helper/compilers/precompute.ts | 2 +- torchci/lib/benchmark/compilerUtils.ts | 48 ------------------- 2 files changed, 1 insertion(+), 49 deletions(-) diff --git a/torchci/lib/benchmark/api_helper/compilers/precompute.ts b/torchci/lib/benchmark/api_helper/compilers/precompute.ts index 0a11ab44b3..e9a9f89799 100644 --- a/torchci/lib/benchmark/api_helper/compilers/precompute.ts +++ b/torchci/lib/benchmark/api_helper/compilers/precompute.ts @@ -163,7 +163,7 @@ function toPrecomputeCompiler( } const response: BenchmarkTimeSeriesResponse = { - time_series: res, + time_series: rawData, time_range: { start: new Date(earliest_timestamp).toISOString(), end: new Date(latest_timestamp).toISOString(), diff --git a/torchci/lib/benchmark/compilerUtils.ts b/torchci/lib/benchmark/compilerUtils.ts index 00212177b3..06e4542269 100644 --- a/torchci/lib/benchmark/compilerUtils.ts +++ b/torchci/lib/benchmark/compilerUtils.ts @@ -456,51 +456,3 @@ export function convertToCompilerPerformanceData(data: BenchmarkData[]) { return Object.values(convertData); } - -export function computePassrateSimple(data: any[]) { - if (!Array.isArray(data) || data.length === 0) return []; - - const blocked = new Set(BLOCKLIST_COMPILERS); - const passingAcc = new Set(PASSING_ACCURACY); - const toDisplay = (c: string) => COMPILER_NAMES_TO_DISPLAY_NAMES[c] ?? c; - - const totalCount = new Map(); - const passCount = new Map(); - - for (const r of data) { - const compilerDisp = toDisplay(r.compiler); - if (blocked.has(compilerDisp)) continue; - - const key = `${r.granularity_bucket}+${r.workflow_id}+${r.suite}+${compilerDisp}`; - - // 计总 - totalCount.set(key, (totalCount.get(key) ?? 0) + 1); - - const acc = r.accuracy ?? ""; - const speed = r.speedup ?? 0; - const pass = - (passingAcc.has(acc) && (speed !== 0 || compilerDisp === "export")) || - acc === "pass_due_to_skip"; - - if (pass) passCount.set(key, (passCount.get(key) ?? 0) + 1); - } - const out: any[] = []; - for (const [key, tc] of totalCount) { - const pc = passCount.get(key) ?? 0; - const p = tc > 0 ? 
pc / tc : 0; - - const [bucket, wfStr, suite, compiler] = key.split("+"); - out.push({ - metirc: "passrate", - granularity_bucket: bucket, - workflow_id: Number(wfStr), - suite, - compiler, - passrate: p, - pass_count: pc, - total_count: tc, - passrate_display: `${(p * 100).toFixed(0)}%, ${pc}/${tc}`, - }); - } - return out; -} From d0b325eab6e1db88813106162facae267a709246 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 3 Sep 2025 10:42:29 -0700 Subject: [PATCH 11/27] addid --- torchci/lib/benchmark/api_helper/compilers/precompute.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchci/lib/benchmark/api_helper/compilers/precompute.ts b/torchci/lib/benchmark/api_helper/compilers/precompute.ts index e9a9f89799..0a11ab44b3 100644 --- a/torchci/lib/benchmark/api_helper/compilers/precompute.ts +++ b/torchci/lib/benchmark/api_helper/compilers/precompute.ts @@ -163,7 +163,7 @@ function toPrecomputeCompiler( } const response: BenchmarkTimeSeriesResponse = { - time_series: rawData, + time_series: res, time_range: { start: new Date(earliest_timestamp).toISOString(), end: new Date(latest_timestamp).toISOString(), From 7347feeaf5c1954a2f9348b94380d78d931fad60 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 3 Sep 2025 10:54:06 -0700 Subject: [PATCH 12/27] addid --- .gitignore | 1 - torchci/components/benchmark/compilers/SummaryGraphPanel.tsx | 2 -- 2 files changed, 3 deletions(-) diff --git a/.gitignore b/.gitignore index e324f591ec..3be92dafed 100644 --- a/.gitignore +++ b/.gitignore @@ -72,4 +72,3 @@ aws/tools/cleanup-ssm/**/*.rs.bk # Remove the python version file from pyenv .python-version - diff --git a/torchci/components/benchmark/compilers/SummaryGraphPanel.tsx b/torchci/components/benchmark/compilers/SummaryGraphPanel.tsx index dcdcbf4a15..3faab6f3cd 100644 --- a/torchci/components/benchmark/compilers/SummaryGraphPanel.tsx +++ b/torchci/components/benchmark/compilers/SummaryGraphPanel.tsx @@ -88,8 +88,6 @@ function SuiteGraphPanel({ JSON.stringify(queryParamsWithSuite) )}`; - console.log("url params", queryParamsWithSuite); - let { data, error } = useSWR(url, fetcher, { refreshInterval: 60 * 60 * 1000, // refresh every hour }); From 986178e6ccade372d2d02f45ddebafca70ab9914 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 3 Sep 2025 12:57:16 -0700 Subject: [PATCH 13/27] addid --- torchci/components/metrics/panels/TablePanel.tsx | 1 - 1 file changed, 1 deletion(-) diff --git a/torchci/components/metrics/panels/TablePanel.tsx b/torchci/components/metrics/panels/TablePanel.tsx index c53b29f4e3..d53d0ab8d6 100644 --- a/torchci/components/metrics/panels/TablePanel.tsx +++ b/torchci/components/metrics/panels/TablePanel.tsx @@ -1,7 +1,6 @@ import HelpIcon from "@mui/icons-material/Help"; import { Box, Skeleton, Typography } from "@mui/material"; import IconButton from "@mui/material/IconButton"; -import { Box } from "@mui/system"; import { DataGrid, GridColDef } from "@mui/x-data-grid"; import { CSSProperties } from "react"; import useSWR from "swr"; From ee4732790c86cafce88ec002a3f2d782c169ccdc Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Sun, 31 Aug 2025 11:15:09 -0700 Subject: [PATCH 14/27] addid --- .../params.json | 17 +++ .../query.sql | 110 ++++++++++++++++++ .../components/metrics/panels/TablePanel.tsx | 1 + .../benchmark/compiler_benmark_time_series.ts | 40 +++++++ torchci/pages/api/benchmark/group_data.ts | 1 + 5 files changed, 169 insertions(+) create mode 100644 torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json create mode 100644 
torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql create mode 100644 torchci/pages/api/benchmark/compiler_benmark_time_series.ts diff --git a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json new file mode 100644 index 0000000000..95f00e1501 --- /dev/null +++ b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json @@ -0,0 +1,17 @@ +{ + "params": { + "branches": "Array(String)", + "commits": "Array(String)", + "compilers": "Array(String)", + "device": "String", + "arch": "String", + "dtype": "String", + "granularity": "String", + "mode": "String", + "startTime": "DateTime64(3)", + "stopTime": "DateTime64(3)", + "suites": "Array(String)", + "workflowId": "Int64" + }, + "tests": [] +} diff --git a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql new file mode 100644 index 0000000000..7ef1efb232 --- /dev/null +++ b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql @@ -0,0 +1,110 @@ +-- This query is used to get the PT2 benchmark results from different experiments +-- to powers the TorchInductor benchmark dashboard +WITH benchmarks AS ( + SELECT + workflow_id, + job_id, + suite, + model_name, + metric_name, + value, + metric_extra_info AS extra_info, + DATE_TRUNC( + {granularity: String }, + fromUnixTimestamp(timestamp) + ) AS granularity_bucket, + -- Filters + benchmark_dtype, + benchmark_mode, + device, + arch, + replaceOne(head_branch, 'refs/heads/', '') AS head_branch, + benchmark_extra_info['output'] AS output, + REGEXP_REPLACE( + output, + CONCAT( + '_', + suite, + '_', + { dtype: String }, + '_', + {mode: String }, + '_', + {device: String }, + '_.*' + ), + '' + ) AS temp + FROM + benchmark.oss_ci_benchmark_torchinductor + WHERE + timestamp >= toUnixTimestamp({startTime: DateTime64(3) }) + AND timestamp < toUnixTimestamp({stopTime: DateTime64(3) }) + AND ( + has({commits: Array(String) }, head_sha) + OR empty({commits: Array(String) }) + ) + AND ( + has({suites: Array(String) }, suite) + OR empty({suites: Array(String) }) + ) + AND ( + workflow_id = {workflowId: Int64} + OR {workflowId: Int64} = 0 + ) +) + +SELECT + workflow_id, + job_id, + REGEXP_REPLACE(temp, '.*/', '') AS backend, + suite, + model_name AS model, + metric_name AS metric, + value, + extra_info, + output, + granularity_bucket +FROM + benchmarks +WHERE + ( + has({branches: Array(String) }, head_branch) + OR empty({branches: Array(String) }) + ) + -- TODO (huydhn): Clean up the output field and how it's used in the query + -- in 6 months + AND ( + ( + ({arch: String } = '' OR {arch: String } = 'a100') + AND output LIKE CONCAT( + '%\_', + {dtype: String }, + '\_', + {mode: String }, + '\_', + {device: String }, + '\_%' + ) + ) + OR ( + {arch: String } != '' + AND output LIKE CONCAT( + '%\_', + {dtype: String }, + '\_', + {mode: String }, + '\_', + {device: String }, + '\_', + {arch: String }, + '\_%' + ) + ) + OR ( + benchmark_dtype = {dtype: String } + AND benchmark_mode = {mode: String } + AND device = {device: String } + AND arch = {arch: String } + ) + ) diff --git a/torchci/components/metrics/panels/TablePanel.tsx b/torchci/components/metrics/panels/TablePanel.tsx index d53d0ab8d6..023223d238 100644 --- a/torchci/components/metrics/panels/TablePanel.tsx +++ b/torchci/components/metrics/panels/TablePanel.tsx @@ -7,6 +7,7 @@ import useSWR from "swr"; const fetcher 
= (url: string) => fetch(url).then((res) => res.json()); + export default function TablePanel({ // Human-readable title for this panel. title, diff --git a/torchci/pages/api/benchmark/compiler_benmark_time_series.ts b/torchci/pages/api/benchmark/compiler_benmark_time_series.ts new file mode 100644 index 0000000000..4e8254766d --- /dev/null +++ b/torchci/pages/api/benchmark/compiler_benmark_time_series.ts @@ -0,0 +1,40 @@ + +import { queryClickhouseSaved } from "lib/clickhouse"; +import type { NextApiRequest, NextApiResponse } from "next"; + +const DEFAULT_TABLE_GROUP = [ + "device", + "backend", + "model", + "dtype", + "backend", + "arch", +]; +const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance_v2"; + +type QueryParams = { + startTime: string; // ISO timestamp + stopTime: string; // ISO timestamp + [k: string]: any; // other parameters +}; + + +export default async function handler( + req: NextApiRequest, + res: NextApiResponse +) { + + console.log("compiler_benmark_time_series."); + const inputparams = JSON.parse(req.query.parameters as string) + console.log("inputs", inputparams); + + const start = Date.now(); + const rows = await queryClickhouseSaved(BENCNMARK_TABLE_NAME, inputparams); + const end = Date.now(); + console.log(`Process took ${end - start}ms`); + + console.log("merged rows:", rows.length); + res.status(200).json({ data: rows }); +} + + diff --git a/torchci/pages/api/benchmark/group_data.ts b/torchci/pages/api/benchmark/group_data.ts index 713ed480d9..d8703d8a8b 100644 --- a/torchci/pages/api/benchmark/group_data.ts +++ b/torchci/pages/api/benchmark/group_data.ts @@ -32,6 +32,7 @@ export default async function handler( details: formatZodError(request.error), }); } + const qp = request.data; const groupTableByFields = qp.group_table_by_fields || deepClone(DEFAULT_TABLE_GROUP); From 86b69990cb8a259748494b011ef5efbb3e61d674 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Sun, 31 Aug 2025 18:12:51 -0700 Subject: [PATCH 15/27] addid --- .../query.sql | 155 +++++--------- .../components/metrics/panels/TablePanel.tsx | 2 +- torchci/lib/benchmark/compilerUtils.ts | 54 +++++ .../benchmark/compiler_benmark_time_series.ts | 40 ---- .../pages/api/benchmark/get_time_series.ts | 190 ++++++++++++++++++ torchci/pages/api/benchmark/group_data.ts | 2 +- 6 files changed, 299 insertions(+), 144 deletions(-) delete mode 100644 torchci/pages/api/benchmark/compiler_benmark_time_series.ts diff --git a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql index 7ef1efb232..bf233f6ff8 100644 --- a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql +++ b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql @@ -1,110 +1,61 @@ --- This query is used to get the PT2 benchmark results from different experiments --- to powers the TorchInductor benchmark dashboard WITH benchmarks AS ( - SELECT - workflow_id, - job_id, - suite, - model_name, - metric_name, - value, - metric_extra_info AS extra_info, - DATE_TRUNC( - {granularity: String }, - fromUnixTimestamp(timestamp) - ) AS granularity_bucket, - -- Filters - benchmark_dtype, - benchmark_mode, - device, - arch, - replaceOne(head_branch, 'refs/heads/', '') AS head_branch, - benchmark_extra_info['output'] AS output, - REGEXP_REPLACE( - output, - CONCAT( - '_', - suite, - '_', - { dtype: String }, - '_', - {mode: String }, - '_', - {device: String }, - '_.*' - ), - '' - ) AS temp - FROM - 
benchmark.oss_ci_benchmark_torchinductor - WHERE - timestamp >= toUnixTimestamp({startTime: DateTime64(3) }) - AND timestamp < toUnixTimestamp({stopTime: DateTime64(3) }) - AND ( - has({commits: Array(String) }, head_sha) - OR empty({commits: Array(String) }) - ) - AND ( - has({suites: Array(String) }, suite) - OR empty({suites: Array(String) }) - ) - AND ( - workflow_id = {workflowId: Int64} - OR {workflowId: Int64} = 0 - ) -) - -SELECT + SELECT workflow_id, job_id, - REGEXP_REPLACE(temp, '.*/', '') AS backend, suite, - model_name AS model, - metric_name AS metric, + model_name, + metric_name, value, - extra_info, - output, - granularity_bucket -FROM - benchmarks + metric_extra_info AS extra_info, + DATE_TRUNC({granularity:String}, fromUnixTimestamp(timestamp)) AS granularity_bucket, + benchmark_dtype, + benchmark_mode, + device, + arch, + replaceOne(head_branch, 'refs/heads/', '') AS head_branch, + + benchmark_extra_info['output'] AS output, + + REGEXP_REPLACE( + benchmark_extra_info['output'], + CONCAT('_', suite, '_', {dtype:String}, '_', {mode:String}, '_', {device:String}, '_.*'), + '' + ) AS temp + + FROM benchmark.oss_ci_benchmark_torchinductor + WHERE + timestamp >= toUnixTimestamp({startTime:DateTime64(3)}) AND + timestamp < toUnixTimestamp({stopTime:DateTime64(3)}) AND + (has({commits:Array(String)}, head_sha) OR empty({commits:Array(String)})) AND + (has({suites:Array(String)}, suite) OR empty({suites:Array(String)})) AND + (workflow_id = {workflowId:Int64} OR {workflowId:Int64} = 0) +) + +SELECT + workflow_id, + job_id, + REGEXP_REPLACE(temp, '.*/', '') AS backend, + suite, + model_name AS model, + metric_name AS metric, + value, + output, + granularity_bucket, + extra_info, +FROM benchmarks WHERE + (has({branches:Array(String)}, head_branch) OR empty({branches:Array(String)})) + AND ( ( - has({branches: Array(String) }, head_branch) - OR empty({branches: Array(String) }) - ) - -- TODO (huydhn): Clean up the output field and how it's used in the query - -- in 6 months - AND ( - ( - ({arch: String } = '' OR {arch: String } = 'a100') - AND output LIKE CONCAT( - '%\_', - {dtype: String }, - '\_', - {mode: String }, - '\_', - {device: String }, - '\_%' - ) - ) - OR ( - {arch: String } != '' - AND output LIKE CONCAT( - '%\_', - {dtype: String }, - '\_', - {mode: String }, - '\_', - {device: String }, - '\_', - {arch: String }, - '\_%' - ) - ) - OR ( - benchmark_dtype = {dtype: String } - AND benchmark_mode = {mode: String } - AND device = {device: String } - AND arch = {arch: String } - ) + ({arch:String} = '' OR {arch:String} = 'a100') AND + output LIKE CONCAT('%\_', {dtype:String}, '\_', {mode:String}, '\_', {device:String}, '\_%') + ) OR ( + {arch:String} != '' AND + output LIKE CONCAT('%\_', {dtype:String}, '\_', {mode:String}, '\_', {device:String}, '\_', {arch:String}, '\_%') + ) OR ( + benchmark_dtype = {dtype:String} AND + benchmark_mode = {mode:String} AND + device = {device:String} AND + arch = {arch:String} ) + ); diff --git a/torchci/components/metrics/panels/TablePanel.tsx b/torchci/components/metrics/panels/TablePanel.tsx index 023223d238..c53b29f4e3 100644 --- a/torchci/components/metrics/panels/TablePanel.tsx +++ b/torchci/components/metrics/panels/TablePanel.tsx @@ -1,13 +1,13 @@ import HelpIcon from "@mui/icons-material/Help"; import { Box, Skeleton, Typography } from "@mui/material"; import IconButton from "@mui/material/IconButton"; +import { Box } from "@mui/system"; import { DataGrid, GridColDef } from "@mui/x-data-grid"; import { CSSProperties } from "react"; 
import useSWR from "swr"; const fetcher = (url: string) => fetch(url).then((res) => res.json()); - export default function TablePanel({ // Human-readable title for this panel. title, diff --git a/torchci/lib/benchmark/compilerUtils.ts b/torchci/lib/benchmark/compilerUtils.ts index 06e4542269..6d3e0902d9 100644 --- a/torchci/lib/benchmark/compilerUtils.ts +++ b/torchci/lib/benchmark/compilerUtils.ts @@ -100,7 +100,10 @@ export function computePassrate( const [bucket, workflowId, suite, compiler] = key.split("+"); passrate.push({ metric: "passrate", +<<<<<<< HEAD value: p, +======= +>>>>>>> 556c0ef04 (addid) granularity_bucket: bucket, workflow_id: workflowId, suite: suite, @@ -166,7 +169,10 @@ export function computeGeomean( const [bucket, workflowId, suite, compiler] = key.split("+"); returnedGeomean.push({ metric: "geomean", +<<<<<<< HEAD value: Number(gm), +======= +>>>>>>> 556c0ef04 (addid) granularity_bucket: bucket, workflow_id: workflowId, suite: suite, @@ -456,3 +462,51 @@ export function convertToCompilerPerformanceData(data: BenchmarkData[]) { return Object.values(convertData); } + +export function computePassrateSimple(data: any[]) { + if (!Array.isArray(data) || data.length === 0) return []; + + const blocked = new Set(BLOCKLIST_COMPILERS); + const passingAcc = new Set(PASSING_ACCURACY); + const toDisplay = (c: string) => COMPILER_NAMES_TO_DISPLAY_NAMES[c] ?? c; + + const totalCount = new Map(); + const passCount = new Map(); + + for (const r of data) { + const compilerDisp = toDisplay(r.compiler); + if (blocked.has(compilerDisp)) continue; + + const key = `${r.granularity_bucket}+${r.workflow_id}+${r.suite}+${compilerDisp}`; + + // 计总 + totalCount.set(key, (totalCount.get(key) ?? 0) + 1); + + const acc = r.accuracy ?? ""; + const speed = r.speedup ?? 0; + const pass = + (passingAcc.has(acc) && (speed !== 0 || compilerDisp === "export")) || + acc === "pass_due_to_skip"; + + if (pass) passCount.set(key, (passCount.get(key) ?? 0) + 1); + } + const out: any[] = []; + for (const [key, tc] of totalCount) { + const pc = passCount.get(key) ?? 0; + const p = tc > 0 ? 
pc / tc : 0; + + const [bucket, wfStr, suite, compiler] = key.split("+"); + out.push({ + metirc: "passrate", + granularity_bucket: bucket, + workflow_id: Number(wfStr), + suite, + compiler, + passrate: p, + pass_count: pc, + total_count: tc, + passrate_display: `${(p * 100).toFixed(0)}%, ${pc}/${tc}`, + }); + } + return out; +} diff --git a/torchci/pages/api/benchmark/compiler_benmark_time_series.ts b/torchci/pages/api/benchmark/compiler_benmark_time_series.ts deleted file mode 100644 index 4e8254766d..0000000000 --- a/torchci/pages/api/benchmark/compiler_benmark_time_series.ts +++ /dev/null @@ -1,40 +0,0 @@ - -import { queryClickhouseSaved } from "lib/clickhouse"; -import type { NextApiRequest, NextApiResponse } from "next"; - -const DEFAULT_TABLE_GROUP = [ - "device", - "backend", - "model", - "dtype", - "backend", - "arch", -]; -const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance_v2"; - -type QueryParams = { - startTime: string; // ISO timestamp - stopTime: string; // ISO timestamp - [k: string]: any; // other parameters -}; - - -export default async function handler( - req: NextApiRequest, - res: NextApiResponse -) { - - console.log("compiler_benmark_time_series."); - const inputparams = JSON.parse(req.query.parameters as string) - console.log("inputs", inputparams); - - const start = Date.now(); - const rows = await queryClickhouseSaved(BENCNMARK_TABLE_NAME, inputparams); - const end = Date.now(); - console.log(`Process took ${end - start}ms`); - - console.log("merged rows:", rows.length); - res.status(200).json({ data: rows }); -} - - diff --git a/torchci/pages/api/benchmark/get_time_series.ts b/torchci/pages/api/benchmark/get_time_series.ts index ce069f5590..a51a4f77cd 100644 --- a/torchci/pages/api/benchmark/get_time_series.ts +++ b/torchci/pages/api/benchmark/get_time_series.ts @@ -1,6 +1,28 @@ +<<<<<<< HEAD import { getCompilerBenchmarkData } from "lib/benchmark/api_helper/compilers/precompute"; import { readApiGetParams } from "lib/benchmark/api_helper/utils"; import type { NextApiRequest, NextApiResponse } from "next"; +======= +import { + computeGeomean, + computePassrate, + computePeakMemoryUsage, + convertToCompilerPerformanceData, + getPassingModels, +} from "lib/benchmark/compilerUtils"; +import { queryClickhouseSaved } from "lib/clickhouse"; +import type { NextApiRequest, NextApiResponse } from "next"; +import { getNestedField } from "./group_data"; + +type GroupInfo = Record; +type Subgroup = { group_Info: GroupInfo; data: T[] }; +type GroupedItem = { + group_Info: GroupInfo; + rows: Record>; +}; +type Params = Record; +const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance_v2"; +>>>>>>> 556c0ef04 (addid) /** * API Route: /api/benchmark/get_time_series @@ -61,3 +83,171 @@ async function getBenmarkTimeSeriesData( throw new Error(`Unsupported request_name: ${request_name}`); } } +<<<<<<< HEAD +======= + +// Utility to extract params from either GET or POST +// it accepts both ?parameters= and POST with JSON body +function readParams(req: NextApiRequest): Params { + // 1) If POST with parsed JSON body + if (req.method === "POST" && req.body && typeof req.body === "object") { + return req.body as Params; + } + + // 2) If POST with raw string body + if ( + req.method === "POST" && + typeof req.body === "string" && + req.body.trim() + ) { + try { + return JSON.parse(req.body) as Params; + } catch {} + } + + // 3) If GET with ?parameters= + const raw = req.query.parameters as string | undefined; + if (raw) { + try { + return JSON.parse(raw) as Params; + } catch 
{} + } + + // 4) Fallback: use query params directly + const q: Params = {}; + Object.entries(req.query).forEach(([k, v]) => { + if (k !== "parameters") q[k] = Array.isArray(v) ? v[0] : v; + }); + return q; +} + +/** + * Group data by `keys`, and inside each group further subgroup by `subGroupKeys`. + */ +function groupBy( + data: T[], + keys: string[], + subGroupKeys: string[] = [] +): GroupedItem[] { + const groups = new Map>>(); + const mainInfo = new Map(); + + for (const row of data as any[]) { + // build main group key + const mainKeyParts = keys.map((k) => String(getNestedField(row, k))); + const mainKey = mainKeyParts.join("|"); + if (!mainInfo.has(mainKey)) { + const info: GroupInfo = {}; + keys.forEach((k, i) => (info[k] = mainKeyParts[i])); + mainInfo.set(mainKey, info); + } + + // build subgroup key + const subKeyParts = + subGroupKeys.length > 0 + ? subGroupKeys.map((k) => String(getNestedField(row, k))) + : ["__ALL__"]; // default single subgroup if none provided + const subKey = subKeyParts.join("|"); + const subInfo: GroupInfo = {}; + + subGroupKeys.forEach((k, i) => (subInfo[k] = subKeyParts[i])); + + if (!groups.has(mainKey)) groups.set(mainKey, new Map()); + const subMap = groups.get(mainKey)!; + + if (!subMap.has(subKey)) { + subMap.set(subKey, { group_Info: subInfo, data: [] }); + } + subMap.get(subKey)!.data.push(row as T); + } + + // build result array + const result: GroupedItem[] = []; + for (const [mainKey, subMap] of groups.entries()) { + const rowsObj = Object.fromEntries(subMap.entries()); + result.push({ + group_Info: mainInfo.get(mainKey)!, + rows: rowsObj, + }); + } + return result; +} + +async function getCompilerBenchmarkData(inputparams: any) { + const start = Date.now(); + const rows = await queryClickhouseSaved(BENCNMARK_TABLE_NAME, inputparams); + const end = Date.now(); + const result = toPrecomputeCompiler(rows, inputparams, "time_series"); + console.log("time to get data", end - start); + return result; +} + +function toPrecomputeCompiler( + rawData: any[], + inputparams: any, + type: string = "time_series" +) { + const data = convertToCompilerPerformanceData(rawData); + const models = getPassingModels(data); + + const passrate = computePassrate(data, models); + const geomean = computeGeomean(data, models); + const peakMemory = computePeakMemoryUsage(data, models); + + const all_data = [passrate, geomean, peakMemory].flat(); + + all_data.map((row) => { + row["dtype"] = inputparams["dtype"]; + row["arch"] = inputparams["arch"]; + row["device"] = inputparams["device"]; + row["mode"] = inputparams["mode"]; + }); + + let res: any[] = []; + switch (type) { + case "time_series": + // grouping data by comipler, device, arch, dtype, suite, metric, mode + // then sorted it with granularity_bucket in ascending order + const tsd = groupBy( + all_data, + ["dtype", "arch", "device", "suite", "compiler", "metric", "mode"], + ["workflow_id"] + ); + res = tsd.map((group) => { + const group_info = group.group_Info; + const group_data = group.rows; + + // no need for the group_info for subgroup, directly get the data + const ts_list = Object.values(group_data) + .filter((item) => item.data.length > 0) + .map((item) => item.data[0]) + .sort( + (a, b) => + new Date(a.granularity_bucket).getTime() - + new Date(b.granularity_bucket).getTime() + ); + return { + group_info, + num_of_dp: ts_list.length, + result: ts_list, + }; + }); + return res; + case "table": + res = groupBy( + all_data, + [ + "dtype", + "arch", + "device", + "mode", + "workflow_id", + 
"granularity_bucket", + ], + ["metric", "compiler"] + ); + } + + return res; +} +>>>>>>> 556c0ef04 (addid) diff --git a/torchci/pages/api/benchmark/group_data.ts b/torchci/pages/api/benchmark/group_data.ts index d8703d8a8b..030d79cc24 100644 --- a/torchci/pages/api/benchmark/group_data.ts +++ b/torchci/pages/api/benchmark/group_data.ts @@ -16,7 +16,7 @@ const DEFAULT_TABLE_GROUP = [ const DEFAULT_ROW_GROUP = ["workflow_id", "job_id", "metadata_info.timestamp"]; const BENCNMARK_TABLE_NAME = "oss_ci_benchmark_llms"; -function getNestedField(obj: any, path: string): any { +export function getNestedField(obj: any, path: string): any { return path.split(".").reduce((o, key) => (o && key in o ? o[key] : ""), obj); } From c70d56c6aad541adef905177d3a9410c7c057bd8 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 2 Sep 2025 12:29:30 -0700 Subject: [PATCH 16/27] addid --- .../.gitignore | 3 + .../Makefile | 19 + .../lambda_function.py | 385 ++++++++++++++++++ .../requirements.txt | 5 + 4 files changed, 412 insertions(+) create mode 100644 aws/lambda/benchmark_regression_summary_report/.gitignore create mode 100644 aws/lambda/benchmark_regression_summary_report/Makefile create mode 100644 aws/lambda/benchmark_regression_summary_report/lambda_function.py create mode 100644 aws/lambda/benchmark_regression_summary_report/requirements.txt diff --git a/aws/lambda/benchmark_regression_summary_report/.gitignore b/aws/lambda/benchmark_regression_summary_report/.gitignore new file mode 100644 index 0000000000..bd92f6376a --- /dev/null +++ b/aws/lambda/benchmark_regression_summary_report/.gitignore @@ -0,0 +1,3 @@ +*.zip +deployment/ +venv/ diff --git a/aws/lambda/benchmark_regression_summary_report/Makefile b/aws/lambda/benchmark_regression_summary_report/Makefile new file mode 100644 index 0000000000..478548770a --- /dev/null +++ b/aws/lambda/benchmark_regression_summary_report/Makefile @@ -0,0 +1,19 @@ +all: run-local + +clean: + rm -rf deployment + rm -rf venv + rm -rf deployment.zip + +venv/bin/python: + virtualenv venv + venv/bin/pip install -r requirements.txt + +deployment.zip: + mkdir -p deployment + cp lambda_function.py ./deployment/. + pip3.10 install -r requirements.txt -t ./deployment/. --platform manylinux2014_x86_64 --only-binary=:all: --implementation cp --python-version 3.10 --upgrade + cd ./deployment && zip -q -r ../deployment.zip . 
+ +.PHONY: create-deployment-package +create-deployment-package: deployment.zip diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py new file mode 100644 index 0000000000..5e57d49e99 --- /dev/null +++ b/aws/lambda/benchmark_regression_summary_report/lambda_function.py @@ -0,0 +1,385 @@ +#!/usr/bin/env python +import argparse +import json +import logging +import os +import threading +from collections import defaultdict +from concurrent.futures import as_completed, ThreadPoolExecutor +from datetime import datetime, timedelta, timezone + +# Local imports +from typing import Any, Dict, Iterable, List, Optional, Set + +import clickhouse_connect +import yaml +from dateutil.parser import parse +from github import Auth, Github + + +logging.basicConfig( + level=logging.INFO, +) +logger = logging.getLogger() +logger.setLevel("INFO") + +ENVS = { + "GITHUB_ACCESS_TOKEN": os.getenv("GITHUB_ACCESS_TOKEN", ""), + "CLICKHOUSE_ENDPOINT": os.getenv("CLICKHOUSE_ENDPOINT", ""), + "CLICKHOUSE_PASSWORD": os.getenv("CLICKHOUSE_PASSWORD", ""), + "CLICKHOUSE_USERNAME": os.getenv("CLICKHOUSE_USERNAME", ""), +} + + +def get_clickhouse_client( + host: str, user: str, password: str +) -> clickhouse_connect.driver.client.Client: + # for local testing only, disable SSL verification + # return clickhouse_connect.get_client(host=host, user=user, password=password,secure=True, verify=False) + + return clickhouse_connect.get_client( + host=host, user=user, password=password, secure=True + ) + + +def get_clickhouse_client_environment() -> clickhouse_connect.driver.client.Client: + for name, env_val in ENVS.items(): + if not env_val: + raise ValueError(f"Missing environment variable {name}") + return get_clickhouse_client( + host=ENVS["CLICKHOUSE_ENDPOINT"], + user=ENVS["CLICKHOUSE_USERNAME"], + password=ENVS["CLICKHOUSE_PASSWORD"], + ) + + +def is_unix_timestamp(value: str) -> bool: + """Check if the string is a valid Unix timestamp.""" + if value.isdigit(): # Ensure it's numeric + try: + timestamp = int(value) + # Check if it's within a reasonable range (1970 to 2100) + datetime.fromtimestamp(timestamp) + return True + except (ValueError, OSError): + return False + return False + + +def to_timestap_str(time: datetime) -> str: + return str(int(time.timestamp())) + + +def write_to_file(data: Any, filename="", path=""): + """ + Writes data to a specified file. If no path is provided, writes to the current directory. + + :param data: The content to write to the file. + :param filename: The name of the file (default: 'output.txt'). + :param path: The directory where the file should be saved (default: current directory). + """ + + if not filename: + filename = "output_snapshot.json" + if not path: + path = "." 
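For illustration, a minimal usage sketch of the helper above; how it is wired into the dry-run path is not shown in this patch, and the file name and records below are made up for the example:

    import json

    records = [{"metric": "passrate", "value": 0.91}]
    # creates ./snapshots/ if needed and writes the serialized records into
    # ./snapshots/summary_report_snapshot.json
    write_to_file(json.dumps(records), filename="summary_report_snapshot.json", path="./snapshots")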
+ + # Ensure the path exists + os.makedirs(path, exist_ok=True) + + # Construct full file path + file_path = os.path.join(path, filename) + + # Write data to file + with open(file_path, "w", encoding="utf-8") as file: + file.write(data) + logger.info(f"File written to: {os.path.abspath(file_path)}") + + + +class BenchmarkSummaryProcessor: + """ + """ + + def __init__( + self, + is_dry_run: bool = False, + local_output: bool = False, + output_snapshot_file_name: str = "summary_report_snapshot", + output_snapshot_file_path: str = "", + ) -> None: + self.is_dry_run = is_dry_run + self.is_dry_run = is_dry_run + self.output_snapshot_file_name = output_snapshot_file_name + self.output_snapshot_file_path = output_snapshot_file_path + self.local_output = local_output and is_dry_run + + def process( + self, + start_time: datetime, + end_time: datetime, + cc: Optional[clickhouse_connect.driver.client.Client] = None, + args: Optional[argparse.Namespace] = None, + ) -> Dict[str, Any]: + # ensure each thread has its own clickhouse client. clickhouse client + # is not thread-safe. + if cc is None: + tlocal = threading.local() + if not hasattr(tlocal, "cc") or tlocal.cc is None: + if args: + tlocal.cc = get_clickhouse_client( + args.clickhouse_endpoint, + args.clickhouse_username, + args.clickhouse_password, + ) + else: + tlocal.cc = get_clickhouse_client_environment() + cc = tlocal.cc + + # fetches config to get time series from api + + + + queued_jobs = self._fetch_snapshot_from_db(cc, start_time, end_time, repo) + + if len(queued_jobs) == 0: + logger.info( + f" [QueueTimeProcessor][Snapshot {to_timestap_str(end_time)}] " + + f"No jobs in queue in time range: [{start_time},{end_time}]" + ) + + # add runner labels to each job based on machine type + self._add_runner_labels( + queued_jobs, + start_time, + meta_runner_config_retriever, + lf_runner_config_retriever, + old_lf_lf_runner_config_retriever, + ) + + if len(queued_jobs) == 0: + logger.info( + f" [QueueTimeProcessor][Snapshot {to_timestap_str(end_time)}] " + + "No queued jobs, skipping generating histogram records.." + ) + + records = QueuedJobHistogramGenerator().generate_histogram_records( + queued_jobs, + datetime.now(timezone.utc), + "half-hour-mark-queue-time-histogram", + end_time, + ) + + if len(records) == 0: + logger.info( + f" [QueueTimeProcessor][Snapshot {to_timestap_str(end_time)}] " + + "No histogram records, skipping writing.." + ) + + if self.is_dry_run: + logger.info( + f" [Dry Run Mode][Snapshot {to_timestap_str(end_time)}] " + + "Writing results to terminal/local file ..." + ) + self._output_record(queued_jobs, end_time, type="queued_jobs") + self._output_record(records, end_time, type="records") + logger.info( + f" [Dry Run Mode][Snapshot {to_timestap_str(end_time)}] " + + "Done. Write results to terminal/local file ." + ) + else: + self._write_to_db_table(cc, records) + + return { + "start_time": to_timestap_str(start_time), + "end_time": to_timestap_str(end_time), + "jobs_count": len(queued_jobs), + "records_count": len(records), + } + + +class WorkerPoolHandler: + """ + WorkerPoolHandler runs workers in parallel to generate benchmark regression report + and writes the results to the target destination. 
+ + """ + + def __init__( + self, + benchmark_summary_processor: BenchmarkSummaryProcessor, + max_workers: int = 4, + ): + self.benchmark_summary_processor = benchmark_summary_processor + self.max_workers = max_workers + + def start( + self, + config: Dict[str, Any], + args: Optional[argparse.Namespace] = None, + ) -> None: + logger.info( + "[WorkerPoolHandler] start to process benchmark summary data with config %s", config["name"] + ) + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + futures = [] + for interval in time_intervals: + future = executor.submit( + self.benchmark_summary_processor.process, + config, + cc=None, + args=args, + ) + futures.append(future) + results = [] + errors = [] + + # handle results from parallel processing + for future in as_completed(futures): + try: + result = future.result() + # This will raise an exception if one occurred + results.append(result) + except Exception as e: + logger.warning(f"Error processing future: {e}") + errors.append({"error": str(e)}) + +def main( + args: Optional[argparse.Namespace] = None, + github_access_token: str = "", + is_dry_run: bool = False, + local_output: bool = False, + output_snapshot_file_name: str = "job_queue_times_snapshot", + output_snapshot_file_path: str = "", +): + """ + Main method to run in both local environment and lambda handler. + 1. generate intervals[start_time,end_time] using latest timestamp from source table and target table + 2. call WorkerPoolHandler to geneterate and write histogram data for each interval in parallel + """ + # gets config retrievers, this is used to generate runner labels for histgram + if not github_access_token: + raise ValueError("Missing environment variable GITHUB_ACCESS_TOKEN") + config_retrievers = get_config_retrievers(github_access_token) + + # get time intervals. + logger.info(" [Main] generating time intervals ....") + if args: + cc = get_clickhouse_client( + args.clickhouse_endpoint, args.clickhouse_username, args.clickhouse_password + ) + else: + cc = get_clickhouse_client_environment() + time_intervals = TimeIntervalGenerator().generate(cc) + + + # get jobs in queue from clickhouse for list of time intervals, in parallel + handler = WorkerPoolHandler( + config_retrievers, + BenchmarkSummaryProcessor( + is_dry_run=is_dry_run, + local_output=local_output, + output_snapshot_file_name=output_snapshot_file_name, + output_snapshot_file_path=output_snapshot_file_path, + ), + ) + handler.start(time_intervals, args) + logger.info(" [Main] Done. work completed.") + + +def lambda_handler(event: Any, context: Any) -> None: + """ + Main method to run in aws lambda environment + """ + main( + None, + github_access_token=ENVS["GITHUB_ACCESS_TOKEN"], + ) + return + + +def parse_args() -> argparse.Namespace: + """ + Parse command line args, this is mainly used for local test environment. 
+ """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--clickhouse-endpoint", + default=ENVS["CLICKHOUSE_ENDPOINT"], + type=str, + help="the clickhouse endpoint, the clickhouse_endpoint " + + "name is https://{clickhouse_endpoint}:{port} for full url ", + ) + parser.add_argument( + "--clickhouse-username", + type=str, + default=ENVS["CLICKHOUSE_USERNAME"], + help="the clickhouse username", + ) + parser.add_argument( + "--clickhouse-password", + type=str, + default=ENVS["CLICKHOUSE_PASSWORD"], + help="the clickhouse password for the user name", + ) + parser.add_argument( + "--github-access-token", + type=str, + default=ENVS["GITHUB_ACCESS_TOKEN"], + help="the github access token to access github api", + ) + parser.add_argument( + "--local-output", + action="store_true", + help="when set, generate json result in local environment. " + + "this is only used for local test environment when dry-run is enabled", + ) + parser.add_argument( + "--not-dry-run", + action="store_true", + help="when set, writing results to destination from local " + + "environment. By default, we run in dry-run mode for local " + + "environment", + ) + parser.add_argument( + "--output-file-name", + type=str, + default="job_queue_times_snapshot.json", + help="the name of output file for local environment. this " + + "is only used for local test environment when local-output is enabled", + ) + parser.add_argument( + "--output-file-path", + type=str, + default="", + help="the path of output file for local environment. this is " + + "only used for local test environment when local-output is enabled", + ) + args, _ = parser.parse_known_args() + return args + + +def local_run() -> None: + """ + method to run in local test environment + """ + + args = parse_args() + + # update environment variables for input parameters + + # always run in dry-run mode in local environment, unless it's disabled. 
+ is_dry_run = not args.not_dry_run + + main( + args, + args.github_access_token, + is_dry_run=is_dry_run, + local_output=args.local_output, + output_snapshot_file_name=args.output_file_name, + output_snapshot_file_path=args.output_file_path, + ) + + +if __name__ == "__main__": + local_run() diff --git a/aws/lambda/benchmark_regression_summary_report/requirements.txt b/aws/lambda/benchmark_regression_summary_report/requirements.txt new file mode 100644 index 0000000000..87c33c2e7f --- /dev/null +++ b/aws/lambda/benchmark_regression_summary_report/requirements.txt @@ -0,0 +1,5 @@ +clickhouse_connect==0.8.5 +boto3==1.35.33 +PyGithub==1.59.0 +python-dateutil==2.8.2 +PyYAML==6.0.1 From 6c17c24bdc29ca650fd8eb184c22236bd77d138b Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 2 Sep 2025 14:24:21 -0700 Subject: [PATCH 17/27] addid --- .../Makefile | 3 +- .../lambda_function.py | 38 +++--- .../lib/benchmark_time_series_api_model.py | 27 ++++ .../lib/config.py | 55 ++++++++ .../lib/config_model.py | 128 ++++++++++++++++++ 5 files changed, 232 insertions(+), 19 deletions(-) create mode 100644 aws/lambda/benchmark_regression_summary_report/lib/benchmark_time_series_api_model.py create mode 100644 aws/lambda/benchmark_regression_summary_report/lib/config.py create mode 100644 aws/lambda/benchmark_regression_summary_report/lib/config_model.py diff --git a/aws/lambda/benchmark_regression_summary_report/Makefile b/aws/lambda/benchmark_regression_summary_report/Makefile index 478548770a..3db1a588ca 100644 --- a/aws/lambda/benchmark_regression_summary_report/Makefile +++ b/aws/lambda/benchmark_regression_summary_report/Makefile @@ -11,7 +11,8 @@ venv/bin/python: deployment.zip: mkdir -p deployment - cp lambda_function.py ./deployment/. + cp lambda_function.py lib ./deployment/. + pip3.10 install -r requirements.txt -t ./deployment/. --platform manylinux2014_x86_64 --only-binary=:all: --implementation cp --python-version 3.10 --upgrade cd ./deployment && zip -q -r ../deployment.zip . diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py index 5e57d49e99..8b7ae7cd5e 100644 --- a/aws/lambda/benchmark_regression_summary_report/lambda_function.py +++ b/aws/lambda/benchmark_regression_summary_report/lambda_function.py @@ -7,6 +7,8 @@ from collections import defaultdict from concurrent.futures import as_completed, ThreadPoolExecutor from datetime import datetime, timedelta, timezone +from lib.config import BENCHMARK_REGRESSION_CONFIG +from jinja2 import Template # Local imports from typing import Any, Dict, Iterable, List, Optional, Set @@ -69,7 +71,6 @@ def is_unix_timestamp(value: str) -> bool: def to_timestap_str(time: datetime) -> str: return str(int(time.timestamp())) - def write_to_file(data: Any, filename="", path=""): """ Writes data to a specified file. If no path is provided, writes to the current directory. 
@@ -95,6 +96,21 @@ def write_to_file(data: Any, filename="", path=""): file.write(data) logger.info(f"File written to: {os.path.abspath(file_path)}") +BENCHMARK_REGRESSION_SUMMARY_REPORT_TABLE = "benchmark_regression_summary_report" + + +def get_runtime_config(name: str, start: datetime, end: datetime): + + try: + config = BENCHMARK_REGRESSION_CONFIG[name] + tmpl = Template(config.source.api_endpoint_params_template) + rendered = tmpl.render( + startTime=start.isoformat(timespec="milliseconds") + "Z", + stopTime=end.isoformat(timespec="milliseconds") + "Z", + ) + cfg = dict(config) # shallow copy + cfg["api_endpoint_params"] = json.loads(rendered) + return cfg class BenchmarkSummaryProcessor: @@ -109,15 +125,10 @@ def __init__( output_snapshot_file_path: str = "", ) -> None: self.is_dry_run = is_dry_run - self.is_dry_run = is_dry_run - self.output_snapshot_file_name = output_snapshot_file_name - self.output_snapshot_file_path = output_snapshot_file_path - self.local_output = local_output and is_dry_run def process( self, - start_time: datetime, - end_time: datetime, + config: Dict[str, Any], cc: Optional[clickhouse_connect.driver.client.Client] = None, args: Optional[argparse.Namespace] = None, ) -> Dict[str, Any]: @@ -138,8 +149,6 @@ def process( # fetches config to get time series from api - - queued_jobs = self._fetch_snapshot_from_db(cc, start_time, end_time, repo) if len(queued_jobs) == 0: @@ -148,15 +157,6 @@ def process( + f"No jobs in queue in time range: [{start_time},{end_time}]" ) - # add runner labels to each job based on machine type - self._add_runner_labels( - queued_jobs, - start_time, - meta_runner_config_retriever, - lf_runner_config_retriever, - old_lf_lf_runner_config_retriever, - ) - if len(queued_jobs) == 0: logger.info( f" [QueueTimeProcessor][Snapshot {to_timestap_str(end_time)}] " @@ -190,6 +190,7 @@ def process( else: self._write_to_db_table(cc, records) + return { "start_time": to_timestap_str(start_time), "end_time": to_timestap_str(end_time), @@ -198,6 +199,7 @@ def process( } + class WorkerPoolHandler: """ WorkerPoolHandler runs workers in parallel to generate benchmark regression report diff --git a/aws/lambda/benchmark_regression_summary_report/lib/benchmark_time_series_api_model.py b/aws/lambda/benchmark_regression_summary_report/lib/benchmark_time_series_api_model.py new file mode 100644 index 0000000000..93b8131a3a --- /dev/null +++ b/aws/lambda/benchmark_regression_summary_report/lib/benchmark_time_series_api_model.py @@ -0,0 +1,27 @@ + +from dataclasses import dataclass, field +from os import error +from typing import Any, Dict, List, Optional + + +@dataclass +class TimeRange: + start: str + end: str + +@dataclass +class TimeSeriesItem: + group_info: Dict[str, Any] # flexible, could make a stricter dataclass if schema is known + num_of_dp: int + data: List[Dict[str, Any]] = field(default_factory=list) + +@dataclass +class ApiData: + time_series: List[TimeSeriesItem] + time_range: TimeRange + + +@dataclass +class ApiResponse: + data: Optional[ApiData] = None # present if success + error: Optional[str] = None # present if failure diff --git a/aws/lambda/benchmark_regression_summary_report/lib/config.py b/aws/lambda/benchmark_regression_summary_report/lib/config.py new file mode 100644 index 0000000000..e5465b0419 --- /dev/null +++ b/aws/lambda/benchmark_regression_summary_report/lib/config.py @@ -0,0 +1,55 @@ + + +from lib.config_model import BenchmarkApiSource, BenchmarkConfig, BenchmarkRegressionConfigBook, DayRangeWindow, Frequency, RegressionPolicy, 
Policy, RangeConfig + + +# compiler benchmark regression config +COMPILER_BENCHMARK_CONFIG = BenchmarkConfig( + name="Compiler Benchmark Regression", + id = "compiler_regression", + source=BenchmarkApiSource( + api_query_url="http://localhost:3000/api/benchmark/get_time_series", + # currently we only detect the regression for h100 with dtype bfloat16, and mode inference + # we can extend this to other devices, dtypes and mode in the future + api_endpoint_params_template=""" + { + "name": "compiler_precompute", + "query_params": { + "commits": [], + "compilers": [], + "arch": "h100", + "device": "cuda", + "dtype": "bfloat16", + "granularity": "hour", + "mode": "inference", + "startTime": "{{ startTime }}", + "stopTime": "{{ stopTime }}", + "suites": ["torchbench", "huggingface", "timm_models"], + "workflowId": 0, + "branches": ["main"] + } + } + """ + ), + # set baseline from past 7 days using avg, and compare with the last 1 day + policy=Policy( + frequency=Frequency(value=7, unit="days"), + range=RangeConfig( + baseline=DayRangeWindow(value=7), + comparison=DayRangeWindow(value=1), + ), + metrics={ + "passrate": RegressionPolicy(name="passrate",condition="greater_than", threshold=0.9), + "geomean": RegressionPolicy(name="geomean",condition="greater_than", threshold=0.95), + "dynamo_peak_mem": RegressionPolicy( + name="dynamo_peak_mem",condition="greater_than", threshold=0.9 + ), + }, + ), + ) + +BENCHMARK_REGRESSION_CONFIG = BenchmarkRegressionConfigBook( + configs={ + "compiler_regression":COMPILER_BENCHMARK_CONFIG, + } +) diff --git a/aws/lambda/benchmark_regression_summary_report/lib/config_model.py b/aws/lambda/benchmark_regression_summary_report/lib/config_model.py new file mode 100644 index 0000000000..774a136ed6 --- /dev/null +++ b/aws/lambda/benchmark_regression_summary_report/lib/config_model.py @@ -0,0 +1,128 @@ +from __future__ import annotations +from dataclasses import dataclass, field +from typing import Any, Dict, Literal, Optional, Set +from datetime import datetime, timedelta +from jinja2 import Environment, Template, meta +import json + + +# -------- Frequency -------- +@dataclass +class Frequency: + value: int + unit: Literal["days", "weeks"] + + def to_timedelta(self) -> timedelta: + """Convert frequency into a datetime.timedelta.""" + if self.unit == "days": + return timedelta(days=self.value) + elif self.unit == "weeks": + return timedelta(weeks=self.value) + else: + raise ValueError(f"Unsupported unit: {self.unit}") + + +# -------- Source -------- +_JINJA_ENV = Environment(autoescape=False) + +@dataclass +class BenchmarkApiSource: + api_query_url: str + api_endpoint_params_template: str + default_ctx: Dict[str, Any] = field(default_factory=dict) + + def required_template_vars(self) -> set[str]: + ast = _JINJA_ENV.parse(self.api_endpoint_params_template) + return set(meta.find_undeclared_variables(ast)) + + def render(self, ctx: Dict[str, Any], strict: bool = True) -> dict: + """Render with caller-supplied context (no special casing for start/end).""" + merged = {**self.default_ctx, **ctx} + + if strict: + required = self.required_template_vars() + missing = required - merged.keys() + if missing: + raise ValueError(f"Missing required vars: {missing}") + rendered = Template(self.api_endpoint_params_template).render(**merged) + return json.loads(rendered) + + +# -------- Policy: range windows -------- +@dataclass +class DayRangeWindow: + value: int + # raw indicates fetch from the source data + source: Literal["raw"] = "raw" + + + + + + + +@dataclass +class 
RangeConfig: + baseline: RangeWindow + comparison: RangeWindow + + +# -------- Policy: metrics -------- +@dataclass +class RegressionPolicy: + name: str + # Meaning: + # - "greater_than": higher is better; violation if value < baseline * threshold + # - "less_than": lower is better; violation if value > baseline * threshold + # - "equal_to": value should be ~= baseline * threshold within rel_tol + condition: Literal["greater_than", "less_than", "equal_to"] + threshold: float + rel_tol: float = 1e-3 # used only for "equal_to" + + def is_violation(self, value: float, baseline: float) -> bool: + target = baseline * self.threshold + + if self.condition == "greater_than": + # value should be >= target + return value < target + + if self.condition == "less_than": + # value should be <= target + return value > target + + # equal_to: |value - target| should be within rel_tol * max(1, |target|) + denom = max(1.0, abs(target)) + return abs(value - target) > self.rel_tol * denom + +@dataclass +class Policy: + frequency: Frequency + range: RangeConfig + metrics: Dict[str, RegressionPolicy] + + +# -------- Top-level benchmark regression config -------- +@dataclass +class BenchmarkConfig: + """ + BenchmarkConfig defines the benchmark regression config for a given benchmark. + source: defines the source of the benchmark data we want to query_params + policy: defines the policy for the benchmark regressions + name: the name of the benchmark + id: the id of the benchmark, this must be unique for each benchmark, and cannot be changed once set + """ + name: str + id: str + source: Source + policy: Policy + + +@dataclass +class BenchmarkRegressionConfigBook: + configs: Dict[str, BenchmarkConfig] = field(default_factory=dict) + + def __getitem__(self, key: str) -> BenchmarkConfig: + config = self.configs.get(key, None) + if not config: + raise KeyError(f"Config {key} not found") + return config From 118a7167fb1edcb9631cf7edffaa3216b5c29f81 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 2 Sep 2025 17:41:44 -0700 Subject: [PATCH 18/27] addid --- .../common/benchmark_time_series_api_model.py | 54 +++++ .../{lib => common}/config.py | 14 +- .../common/config_model.py | 209 +++++++++++++++++ .../lambda_function.py | 211 +++++++++++------- .../lib/benchmark_time_series_api_model.py | 27 --- .../lib/config_model.py | 128 ----------- 6 files changed, 407 insertions(+), 236 deletions(-) create mode 100644 aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py rename aws/lambda/benchmark_regression_summary_report/{lib => common}/config.py (78%) create mode 100644 aws/lambda/benchmark_regression_summary_report/common/config_model.py delete mode 100644 aws/lambda/benchmark_regression_summary_report/lib/benchmark_time_series_api_model.py delete mode 100644 aws/lambda/benchmark_regression_summary_report/lib/config_model.py diff --git a/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py b/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py new file mode 100644 index 0000000000..3a68648dd3 --- /dev/null +++ b/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py @@ -0,0 +1,54 @@ + +from dataclasses import dataclass, field +from typing import Optional, List, Dict, Any +import requests + +@dataclass +class TimeRange: + start: str + end: str + +@dataclass +class BenchmarkTimeSeriesItem: + group_info: Dict[str, Any] + num_of_dp: int + data: List[Dict[str, Any]] = 
field(default_factory=list) + +@dataclass +class BenchmarkTimeSeriesApiData: + time_series: List[BenchmarkTimeSeriesItem] + time_range: TimeRange + +@dataclass +class BenchmarkTimeSeriesApiResponse: + data: BenchmarkTimeSeriesApiData + + @classmethod + def from_request(cls, url: str, query: dict, timeout: int = 60) -> "BenchmarkTimeSeriesApiResponse": + """ + Send a POST request and parse into BenchmarkTimeSeriesApiResponse. + + Args: + url: API endpoint + query: JSON payload must + timeout: max seconds to wait for connect + response (default: 30) + Returns: + ApiResponse + Raises: + requests.exceptions.RequestException if network/timeout/HTTP error + RuntimeError if the API returns an "error" field or malformed data + """ + resp = requests.post(url, json=query, timeout=timeout) + resp.raise_for_status() + payload = resp.json() + + if "error" in payload: + raise RuntimeError(f"API error: {payload['error']}") + + try: + tr = TimeRange(**payload["data"]["time_range"]) + ts = [BenchmarkTimeSeriesItem(**item) for item in payload["data"]["time_series"]] + except Exception as e: + raise RuntimeError(f"Malformed API payload: {e}") + + return cls(data=BenchmarkTimeSeriesApiData(time_series=ts, time_range=tr)) diff --git a/aws/lambda/benchmark_regression_summary_report/lib/config.py b/aws/lambda/benchmark_regression_summary_report/common/config.py similarity index 78% rename from aws/lambda/benchmark_regression_summary_report/lib/config.py rename to aws/lambda/benchmark_regression_summary_report/common/config.py index e5465b0419..b2677b3ce9 100644 --- a/aws/lambda/benchmark_regression_summary_report/lib/config.py +++ b/aws/lambda/benchmark_regression_summary_report/common/config.py @@ -1,14 +1,14 @@ -from lib.config_model import BenchmarkApiSource, BenchmarkConfig, BenchmarkRegressionConfigBook, DayRangeWindow, Frequency, RegressionPolicy, Policy, RangeConfig - - +from common.config_model import BenchmarkApiSource, BenchmarkConfig, BenchmarkRegressionConfigBook, DayRangeWindow, Frequency, RegressionPolicy, Policy, RangeConfig # compiler benchmark regression config +# todo(elainewy): eventually each team should configure their own benchmark regression config, currenlty place here for lambda COMPILER_BENCHMARK_CONFIG = BenchmarkConfig( name="Compiler Benchmark Regression", id = "compiler_regression", source=BenchmarkApiSource( api_query_url="http://localhost:3000/api/benchmark/get_time_series", + type="benchmark_time_series_api", # currently we only detect the regression for h100 with dtype bfloat16, and mode inference # we can extend this to other devices, dtypes and mode in the future api_endpoint_params_template=""" @@ -33,7 +33,7 @@ ), # set baseline from past 7 days using avg, and compare with the last 1 day policy=Policy( - frequency=Frequency(value=7, unit="days"), + frequency=Frequency(value=1, unit="days"), range=RangeConfig( baseline=DayRangeWindow(value=7), comparison=DayRangeWindow(value=1), @@ -45,9 +45,13 @@ name="dynamo_peak_mem",condition="greater_than", threshold=0.9 ), }, + notification_config={ + "type":"github", + "repo":"pytorch/test-infra", + "issue": "7081" + } ), ) - BENCHMARK_REGRESSION_CONFIG = BenchmarkRegressionConfigBook( configs={ "compiler_regression":COMPILER_BENCHMARK_CONFIG, diff --git a/aws/lambda/benchmark_regression_summary_report/common/config_model.py b/aws/lambda/benchmark_regression_summary_report/common/config_model.py new file mode 100644 index 0000000000..ffc09c8980 --- /dev/null +++ b/aws/lambda/benchmark_regression_summary_report/common/config_model.py @@ 
-0,0 +1,209 @@ +from __future__ import annotations +from dataclasses import dataclass, field, fields +from typing import Any, ClassVar, Dict, Literal, Optional, Set, Type, Union +from datetime import datetime, timedelta +from jinja2 import Environment, Template, meta +import requests +import json + + +# -------- Frequency -------- +@dataclass(frozen=True) +class Frequency: + """ + The frequency of how often the report should be generated. + The minimum frequency we support is 1 day. + Attributes: + value: Number of units (e.g., 7 for 7 days). + unit: Unit of time, either "days" or "weeks". + + Methods: + to_timedelta: Convert frequency into a datetime.timedelta. + get_text: return the frequency in text format + """ + value: int + unit: Literal["days", "weeks"] + def to_timedelta(self) -> timedelta: + """Convert frequency N days or M weeks into a datetime.timedelta.""" + if self.unit == "days": + return timedelta(days=self.value) + elif self.unit == "weeks": + return timedelta(weeks=self.value) + else: + raise ValueError(f"Unsupported unit: {self.unit}") + + def get_text(self): + return f"{self.value} {self.unit}" + + +# -------- Source -------- +_JINJA_ENV = Environment(autoescape=False) + +@dataclass +class BenchmarkApiSource: + """ + Defines the source of the benchmark data we want to query + api_query_url: the url of the api to query + api_endpoint_params_template: the jinjia2 template of the api endpoint's query params + default_ctx: the default context to use when rendering the api_endpoint_params_template + """ + api_query_url: str + api_endpoint_params_template: str + type: Literal["benchmark_time_series_api", "other"] = "benchmark_time_series_api" + default_ctx: Dict[str, Any] = field(default_factory=dict) + + def required_template_vars(self) -> set[str]: + ast = _JINJA_ENV.parse(self.api_endpoint_params_template) + return set(meta.find_undeclared_variables(ast)) + + def render(self, ctx: Dict[str, Any], strict: bool = True) -> dict: + """Render with caller-supplied context (no special casing for start/end).""" + merged = {**self.default_ctx, **ctx} + + if strict: + required = self.required_template_vars() + missing = required - merged.keys() + if missing: + raise ValueError(f"Missing required vars: {missing}") + rendered = Template(self.api_endpoint_params_template).render(**merged) + return json.loads(rendered) + + +# -------- Policy: range windows -------- +@dataclass +class DayRangeWindow: + value: int + # raw indicates fetch from the source data + source: Literal["raw"] = "raw" + +@dataclass +class RangeConfig: + """ + Defines the range of baseline and comparison windows for a given policy. 
+ - baseline: the baseline window that build the baseline value + - comparison: the comparison window that we fetch data from to compare against the baseline value + """ + baseline: DayRangeWindow + comparison: DayRangeWindow + + def total_timedelta(self) -> timedelta: + return timedelta(days=self.baseline.value + self.comparison.value) + def comparison_timedelta(self) -> timedelta: + return timedelta(days=self.comparison.value) + def baseline_timedelta(self) -> timedelta: + return timedelta(days=self.baseline.value) + +# -------- Policy: metrics -------- +@dataclass +class RegressionPolicy: + name: str + # Meaning: + # - "greater_than": higher is better; violation if value < baseline * threshold + # - "less_than": lower is better; violation if value > baseline * threshold + # - "equal_to": value should be ~= baseline * threshold within rel_tol + condition: Literal["greater_than", "less_than", "equal_to"] + threshold: float + rel_tol: float = 1e-3 # used only for "equal_to" + + def is_violation(self, value: float, baseline: float) -> bool: + target = baseline * self.threshold + + if self.condition == "greater_than": + # value should be >= target + return value < target + + if self.condition == "less_than": + # value should be <= target + return value > target + + # equal_to: |value - target| should be within rel_tol * max(1, |target|) + denom = max(1.0, abs(target)) + return abs(value - target) > self.rel_tol * denom + +class BaseNotificationConfig: + # every subclass must override this + type_tag: ClassVar[str] + + @classmethod + def from_dict(cls: Type[T], d: Dict[str, Any]) -> T: + # pick only known fields for this dataclass + kwargs = {f.name: d.get(f.name) for f in fields(cls)} + return cls(**kwargs) # type: ignore + + @classmethod + def matches(cls, d: Dict[str, Any]) -> bool: + return d.get("type") == cls.type_tag + + +@dataclass +class GitHubNotificationConfig(BaseNotificationConfig): + type: str = "github" + repo: str = "" + issue_number: str = "" + type_tag: ClassVar[str] = "github" + + def create_github_comment(self, body: str, github_token: str) -> Dict[str, Any]: + """ + Create a new comment on a GitHub issue. + Args: + notification_config: dict with keys: + - type: must be "github" + - repo: "owner/repo" + - issue: issue number (string or int) + body: text of the comment + token: GitHub personal access token or GitHub Actions token + + Returns: + The GitHub API response as a dict (JSON). + """ + url = f"https://api.github.com/repos/{self.repo}/issues/{self.issue_number}/comments" + headers = { + "Authorization": f"token {github_token}", + "Accept": "application/vnd.github+json", + "User-Agent": "bench-reporter/1.0", + } + resp = requests.post(url, headers=headers, json={"body": body}) + resp.raise_for_status() + return resp.json() +@dataclass +class Policy: + frequency: Frequency + range: RangeConfig + metrics: Dict[str, RegressionPolicy] + notification_config: Optional[Dict[str, Any]] = None + + def get_github_notification_config(self) -> Optional[GitHubNotificationConfig]: + if not self.notification_config: + return None + if self.notification_config and self.notification_config.get("type") == "github": + return notification_from_dict(self.notification_config) # type: ignore + + + +# -------- Top-level benchmark regression config -------- +@dataclass +class BenchmarkConfig: + """ + Represents a single benchmark regression configuration. + + - BenchmarkConfig defines the benchmark regression config for a given benchmark. 
+ - source: defines the source of the benchmark data we want to query + - policy: defines the policy for the benchmark regressions + - name: the name of the benchmark + - id: the id of the benchmark, this must be unique for each benchmark, and cannot be changed once set + """ + name: str + id: str + source: BenchmarkApiSource + policy: Policy + + +@dataclass +class BenchmarkRegressionConfigBook: + configs: Dict[str, BenchmarkConfig] = field(default_factory=dict) + + def __getitem__(self, key: str) -> BenchmarkConfig: + config = self.configs.get(key, None) + if not config: + raise KeyError(f"Config {key} not found") + return config diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py index 8b7ae7cd5e..2962e77d0a 100644 --- a/aws/lambda/benchmark_regression_summary_report/lambda_function.py +++ b/aws/lambda/benchmark_regression_summary_report/lambda_function.py @@ -6,16 +6,17 @@ import threading from collections import defaultdict from concurrent.futures import as_completed, ThreadPoolExecutor -from datetime import datetime, timedelta, timezone -from lib.config import BENCHMARK_REGRESSION_CONFIG +import datetime as dt +from common.benchmark_time_series_api_model import BenchmarkTimeSeriesApiData, BenchmarkTimeSeriesApiResponse, TimeRange +from common.config_model import BenchmarkApiSource, BenchmarkConfig, Frequency, Policy, RangeConfig +from common.config import BENCHMARK_REGRESSION_CONFIG from jinja2 import Template +import requests +from dateutil.parser import isoparse -# Local imports from typing import Any, Dict, Iterable, List, Optional, Set - import clickhouse_connect import yaml -from dateutil.parser import parse from github import Auth, Github @@ -32,6 +33,11 @@ "CLICKHOUSE_USERNAME": os.getenv("CLICKHOUSE_USERNAME", ""), } +BENMARK_REGRESSION_REPORT_DB="fortesting.benchmark_regression_report" + +def truncate_to_hour(ts: dt.datetime) -> dt.datetime: + return ts.replace(minute=0, second=0, microsecond=0) + def get_clickhouse_client( host: str, user: str, password: str @@ -67,9 +73,8 @@ def is_unix_timestamp(value: str) -> bool: return False return False - -def to_timestap_str(time: datetime) -> str: - return str(int(time.timestamp())) +def to_hour_str(ts: dt.datetime) -> str: + return truncate_to_hour(ts).isoformat().replace("+00:00", "Z") def write_to_file(data: Any, filename="", path=""): """ @@ -98,20 +103,14 @@ def write_to_file(data: Any, filename="", path=""): BENCHMARK_REGRESSION_SUMMARY_REPORT_TABLE = "benchmark_regression_summary_report" - -def get_runtime_config(name: str, start: datetime, end: datetime): - +def get_config(config_id: str)-> BenchmarkConfig: try: - config = BENCHMARK_REGRESSION_CONFIG[name] - tmpl = Template(config.source.api_endpoint_params_template) - rendered = tmpl.render( - startTime=start.isoformat(timespec="milliseconds") + "Z", - stopTime=end.isoformat(timespec="milliseconds") + "Z", - ) - cfg = dict(config) # shallow copy - cfg["api_endpoint_params"] = json.loads(rendered) - return cfg - + config: BenchmarkConfig = BENCHMARK_REGRESSION_CONFIG[config_id] + except KeyError: + raise ValueError(f"Invalid config id: {config_id}") + except Exception as e: + raise e + return config class BenchmarkSummaryProcessor: """ @@ -120,18 +119,47 @@ class BenchmarkSummaryProcessor: def __init__( self, is_dry_run: bool = False, - local_output: bool = False, - output_snapshot_file_name: str = "summary_report_snapshot", - output_snapshot_file_path: str = "", ) -> None: 
self.is_dry_run = is_dry_run + def should_generate_report( + self, + cc: clickhouse_connect.driver.client.Client, + end_time: dt.datetime, + config_id: str, + f: Frequency + ) -> bool: + """ + decide wether should generate the report based on the frequency in policy + """ + def get_latest_regression_report( + cc: clickhouse_connect.driver.Client, + config_id: str, + ): + result = cc.query( + "SELECT max(report_date) FROM benchmark_regression_report WHERE report_id = {config_id:String}", + parameters={"config_id": config_id}, + ) + if not result.result_rows or result.result_rows[0][0] is None: + return None + return result.result_rows[0][0] + freq_delta = f.to_timedelta() + latest_date = get_latest_regression_report(cc, config_id) + # No report exists yet, generate + if not latest_date: + return True + # we only verify by date to see if we should generate the data + cutoff = end_time.date() - freq_delta + return latest_date < cutoff + + def process( self, - config: Dict[str, Any], + config_id: str, + end_time: dt.datetime, cc: Optional[clickhouse_connect.driver.client.Client] = None, args: Optional[argparse.Namespace] = None, - ) -> Dict[str, Any]: + ): # ensure each thread has its own clickhouse client. clickhouse client # is not thread-safe. if cc is None: @@ -146,50 +174,41 @@ def process( else: tlocal.cc = get_clickhouse_client_environment() cc = tlocal.cc - - # fetches config to get time series from api - - queued_jobs = self._fetch_snapshot_from_db(cc, start_time, end_time, repo) - - if len(queued_jobs) == 0: - logger.info( - f" [QueueTimeProcessor][Snapshot {to_timestap_str(end_time)}] " - + f"No jobs in queue in time range: [{start_time},{end_time}]" - ) - - if len(queued_jobs) == 0: - logger.info( - f" [QueueTimeProcessor][Snapshot {to_timestap_str(end_time)}] " - + "No queued jobs, skipping generating histogram records.." - ) - - records = QueuedJobHistogramGenerator().generate_histogram_records( - queued_jobs, - datetime.now(timezone.utc), - "half-hour-mark-queue-time-histogram", - end_time, + config = get_config(config_id) + + # check if we should generate report for end_time + # currently we only verify if end_time > latest report date + policy.freq in db + report_freq = config.policy.frequency + should_generate = self.should_generate_report(cc, end_time,config_id,report_freq) + if not should_generate: + logger.info("[%s] Skip generate report for date: %s with frequency %s",config_id, end_time.date(), report_freq.get_text()) + return; + data_range = config.policy.range + total_timedelta = data_range.baseline_timedelta + logger.info("[%s] fetching benchmark data from source",config_id) + + if config.source.type!="benchmark_time_series_api": + logger.error(f"{config_id}: currently we only suppport benchmark_time_series_api to fetch source data") + return; + + # Comparison: [end_time - 1d, end_time) + comp_s = end_time - data_range.comparison_timedelta() + comp_e = end_time + comparison_data = self._fetch_from_benchmark_ts_api( + config_id=config_id, + start_time=baseline_s, + end_time=baseline_e, + source=config.source, ) - if len(records) == 0: - logger.info( - f" [QueueTimeProcessor][Snapshot {to_timestap_str(end_time)}] " - + "No histogram records, skipping writing.." - ) - - if self.is_dry_run: - logger.info( - f" [Dry Run Mode][Snapshot {to_timestap_str(end_time)}] " - + "Writing results to terminal/local file ..." 
- ) - self._output_record(queued_jobs, end_time, type="queued_jobs") - self._output_record(records, end_time, type="records") - logger.info( - f" [Dry Run Mode][Snapshot {to_timestap_str(end_time)}] " - + "Done. Write results to terminal/local file ." - ) - else: - self._write_to_db_table(cc, records) + data = self._fetch_from_benchmark_ts_api(config_id, end_time, start_time, config.source) + latest_ts = data.time_range.end + # no data in the time range + if not latest_ts: + logger.info("[%s] No data found for report %s",config_id, end_time.date()) + return + regression_policy = config.policy.metrics return { "start_time": to_timestap_str(start_time), @@ -198,6 +217,54 @@ def process( "records_count": len(records), } + def get_basline(self, config: BenchmarkConfig,end_time: dt.datetime): + data_range = config.policy.range + baseline_s = end_time - data_range.total_timedelta() + baseline_e = end_time - data_range.comparison_timedelta() + + # fetch baseline from api + raw_data = self._fetch_from_benchmark_ts_api( + config_id=config.id, + start_time=baseline_s, + end_time=baseline_e, + source=config.source, + ) + + def to_baseline(data:BenchmarkTimeSeriesApiData): + data. + + + + + + + + + + def _detect_regression(self,end_time: dt.datetime, data: BenchmarkTimeSeriesApiData, policy: Policy): + metrics_dict = policy.metrics + baseline_range = policy.range.baseline_timedelta() + comparison = policy.range.comparison_timedelta() + + + + + return + def _fetch_from_benchmark_ts_api(self,config_id:str, end_time: dt.datetime,start_time:dt.datetime, source: BenchmarkApiSource): + str_end_time = end_time.isoformat() + str_start_time = start_time.isoformat() + query = source.render(ctx={ + "startTime": str_start_time, + "endTime": str_end_time, + }) + url = source.api_query_url + try: + resp:BenchmarkTimeSeriesApiResponse = BenchmarkTimeSeriesApiResponse.from_request(url, query) + + return resp.data + except Exception as e: + raise RuntimeError(f"[{config_id}]Fetch failed:", e) + class WorkerPoolHandler: @@ -250,19 +317,14 @@ def main( args: Optional[argparse.Namespace] = None, github_access_token: str = "", is_dry_run: bool = False, - local_output: bool = False, - output_snapshot_file_name: str = "job_queue_times_snapshot", - output_snapshot_file_path: str = "", ): """ Main method to run in both local environment and lambda handler. 1. generate intervals[start_time,end_time] using latest timestamp from source table and target table 2. call WorkerPoolHandler to geneterate and write histogram data for each interval in parallel """ - # gets config retrievers, this is used to generate runner labels for histgram if not github_access_token: raise ValueError("Missing environment variable GITHUB_ACCESS_TOKEN") - config_retrievers = get_config_retrievers(github_access_token) # get time intervals. 
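To make the fetch-and-evaluate flow above concrete, a small sketch follows; it assumes it is run from the lambda directory so that `common` is importable, and the dates are illustrative:

    import datetime as dt

    from common.config import BENCHMARK_REGRESSION_CONFIG

    cfg = BENCHMARK_REGRESSION_CONFIG["compiler_regression"]
    end = dt.datetime(2025, 9, 2, tzinfo=dt.timezone.utc)
    start = end - cfg.policy.range.comparison_timedelta()

    # render() merges default_ctx with the caller's ctx, verifies (strict=True) that every
    # Jinja variable in the template is supplied, and json-loads the rendered payload; the
    # compiler template in common/config.py expects the keys startTime and stopTime.
    query = cfg.source.render(
        {"startTime": start.isoformat(), "stopTime": end.isoformat()}
    )

    # Each metric is then judged against its baseline. With condition="greater_than" and
    # threshold=0.9, a passrate of 0.80 against a baseline of 0.95 is a violation,
    # since 0.80 < 0.95 * 0.9.
    policy = cfg.policy.metrics["passrate"]
    assert policy.is_violation(value=0.80, baseline=0.95)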
logger.info(" [Main] generating time intervals ....") @@ -279,10 +341,7 @@ def main( handler = WorkerPoolHandler( config_retrievers, BenchmarkSummaryProcessor( - is_dry_run=is_dry_run, - local_output=local_output, - output_snapshot_file_name=output_snapshot_file_name, - output_snapshot_file_path=output_snapshot_file_path, + is_dry_run=is_dry_run ), ) handler.start(time_intervals, args) diff --git a/aws/lambda/benchmark_regression_summary_report/lib/benchmark_time_series_api_model.py b/aws/lambda/benchmark_regression_summary_report/lib/benchmark_time_series_api_model.py deleted file mode 100644 index 93b8131a3a..0000000000 --- a/aws/lambda/benchmark_regression_summary_report/lib/benchmark_time_series_api_model.py +++ /dev/null @@ -1,27 +0,0 @@ - -from dataclasses import dataclass, field -from os import error -from typing import Any, Dict, List, Optional - - -@dataclass -class TimeRange: - start: str - end: str - -@dataclass -class TimeSeriesItem: - group_info: Dict[str, Any] # flexible, could make a stricter dataclass if schema is known - num_of_dp: int - data: List[Dict[str, Any]] = field(default_factory=list) - -@dataclass -class ApiData: - time_series: List[TimeSeriesItem] - time_range: TimeRange - - -@dataclass -class ApiResponse: - data: Optional[ApiData] = None # present if success - error: Optional[str] = None # present if failure diff --git a/aws/lambda/benchmark_regression_summary_report/lib/config_model.py b/aws/lambda/benchmark_regression_summary_report/lib/config_model.py deleted file mode 100644 index 774a136ed6..0000000000 --- a/aws/lambda/benchmark_regression_summary_report/lib/config_model.py +++ /dev/null @@ -1,128 +0,0 @@ -from __future__ import annotations -from dataclasses import dataclass, field -from typing import Any, Dict, Literal, Optional, Set -from datetime import datetime, timedelta -from jinja2 import Environment, Template, meta -import json - - -# -------- Frequency -------- -@dataclass -class Frequency: - value: int - unit: Literal["days", "weeks"] - - def to_timedelta(self) -> timedelta: - """Convert frequency into a datetime.timedelta.""" - if self.unit == "days": - return timedelta(days=self.value) - elif self.unit == "weeks": - return timedelta(weeks=self.value) - else: - raise ValueError(f"Unsupported unit: {self.unit}") - - -# -------- Source -------- -_JINJA_ENV = Environment(autoescape=False) - -@dataclass -class BenchmarkApiSource: - api_query_url: str - api_endpoint_params_template: str - default_ctx: Dict[str, Any] = field(default_factory=dict) - - def required_template_vars(self) -> set[str]: - ast = _JINJA_ENV.parse(self.api_endpoint_params_template) - return set(meta.find_undeclared_variables(ast)) - - def render(self, ctx: Dict[str, Any], strict: bool = True) -> dict: - """Render with caller-supplied context (no special casing for start/end).""" - merged = {**self.default_ctx, **ctx} - - if strict: - required = self.required_template_vars() - missing = required - merged.keys() - if missing: - raise ValueError(f"Missing required vars: {missing}") - rendered = Template(self.api_endpoint_params_template).render(**merged) - return json.loads(rendered) - - -# -------- Policy: range windows -------- -@dataclass -class DayRangeWindow: - value: int - # raw indicates fetch from the source data - source: Literal["raw"] = "raw" - - - - - - - -@dataclass -class RangeConfig: - baseline: RangeWindow - comparison: RangeWindow - - -# -------- Policy: metrics -------- -@dataclass -class RegressionPolicy: - name: str - # Meaning: - # - "greater_than": higher is 
better; violation if value < baseline * threshold - # - "less_than": lower is better; violation if value > baseline * threshold - # - "equal_to": value should be ~= baseline * threshold within rel_tol - condition: Literal["greater_than", "less_than", "equal_to"] - threshold: float - rel_tol: float = 1e-3 # used only for "equal_to" - - def is_violation(self, value: float, baseline: float) -> bool: - target = baseline * self.threshold - - if self.condition == "greater_than": - # value should be >= target - return value < target - - if self.condition == "less_than": - # value should be <= target - return value > target - - # equal_to: |value - target| should be within rel_tol * max(1, |target|) - denom = max(1.0, abs(target)) - return abs(value - target) > self.rel_tol * denom - -@dataclass -class Policy: - frequency: Frequency - range: RangeConfig - metrics: Dict[str, RegressionPolicy] - - -# -------- Top-level benchmark regression config -------- -@dataclass -class BenchmarkConfig: - """ - BenchmarkConfig defines the benchmark regression config for a given benchmark. - source: defines the source of the benchmark data we want to query_params - policy: defines the policy for the benchmark regressions - name: the name of the benchmark - id: the id of the benchmark, this must be unique for each benchmark, and cannot be changed once set - """ - name: str - id: str - source: Source - policy: Policy - - -@dataclass -class BenchmarkRegressionConfigBook: - configs: Dict[str, BenchmarkConfig] = field(default_factory=dict) - - def __getitem__(self, key: str) -> BenchmarkConfig: - config = self.configs.get(key, None) - if not config: - raise KeyError(f"Config {key} not found") - return config From f7ed34b16d42978e1e8742672a3739ba7dfd5e93 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 2 Sep 2025 22:08:44 -0700 Subject: [PATCH 19/27] addid --- .../common/benchmark_time_series_api_model.py | 16 +- .../common/config.py | 76 +++--- .../common/config_model.py | 9 +- .../common/regression_utils.py | 205 +++++++++++++++ .../lambda_function.py | 248 +++++++++++------- torchci/lib/clickhouse.ts | 2 +- 6 files changed, 418 insertions(+), 138 deletions(-) create mode 100644 aws/lambda/benchmark_regression_summary_report/common/regression_utils.py diff --git a/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py b/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py index 3a68648dd3..2fa8960013 100644 --- a/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py +++ b/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py @@ -1,30 +1,35 @@ - from dataclasses import dataclass, field from typing import Optional, List, Dict, Any import requests + @dataclass class TimeRange: start: str end: str + @dataclass class BenchmarkTimeSeriesItem: group_info: Dict[str, Any] num_of_dp: int data: List[Dict[str, Any]] = field(default_factory=list) + @dataclass class BenchmarkTimeSeriesApiData: time_series: List[BenchmarkTimeSeriesItem] time_range: TimeRange + @dataclass class BenchmarkTimeSeriesApiResponse: data: BenchmarkTimeSeriesApiData @classmethod - def from_request(cls, url: str, query: dict, timeout: int = 60) -> "BenchmarkTimeSeriesApiResponse": + def from_request( + cls, url: str, query: dict, timeout: int = 60 + ) -> "BenchmarkTimeSeriesApiResponse": """ Send a POST request and parse into BenchmarkTimeSeriesApiResponse. 
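A minimal usage sketch of the wrapper being reformatted here; the URL mirrors the api_query_url in common/config.py and the trimmed-down payload stands in for the dict normally produced by BenchmarkApiSource.render(), so both are assumptions for the example rather than values fixed by this patch:

    from common.benchmark_time_series_api_model import BenchmarkTimeSeriesApiResponse

    query = {
        "name": "compiler_precompute",
        "query_params": {
            "startTime": "2025-09-01T00:00:00.000Z",
            "stopTime": "2025-09-02T00:00:00.000Z",
        },
    }
    resp = BenchmarkTimeSeriesApiResponse.from_request(
        "http://localhost:3000/api/benchmark/get_time_series", query
    )
    print(resp.data.time_range.start, resp.data.time_range.end)
    for item in resp.data.time_series:
        # group_info carries keys such as metric, compiler, and suite for each series
        print(item.group_info.get("metric"), item.num_of_dp)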
@@ -44,11 +49,12 @@ def from_request(cls, url: str, query: dict, timeout: int = 60) -> "BenchmarkTim if "error" in payload: raise RuntimeError(f"API error: {payload['error']}") - try: tr = TimeRange(**payload["data"]["time_range"]) - ts = [BenchmarkTimeSeriesItem(**item) for item in payload["data"]["time_series"]] + ts = [ + BenchmarkTimeSeriesItem(**item) + for item in payload["data"]["time_series"] + ] except Exception as e: raise RuntimeError(f"Malformed API payload: {e}") - return cls(data=BenchmarkTimeSeriesApiData(time_series=ts, time_range=tr)) diff --git a/aws/lambda/benchmark_regression_summary_report/common/config.py b/aws/lambda/benchmark_regression_summary_report/common/config.py index b2677b3ce9..18831f6070 100644 --- a/aws/lambda/benchmark_regression_summary_report/common/config.py +++ b/aws/lambda/benchmark_regression_summary_report/common/config.py @@ -1,17 +1,25 @@ +from common.config_model import ( + BenchmarkApiSource, + BenchmarkConfig, + BenchmarkRegressionConfigBook, + DayRangeWindow, + Frequency, + RegressionPolicy, + Policy, + RangeConfig, +) - -from common.config_model import BenchmarkApiSource, BenchmarkConfig, BenchmarkRegressionConfigBook, DayRangeWindow, Frequency, RegressionPolicy, Policy, RangeConfig # compiler benchmark regression config # todo(elainewy): eventually each team should configure their own benchmark regression config, currenlty place here for lambda COMPILER_BENCHMARK_CONFIG = BenchmarkConfig( - name="Compiler Benchmark Regression", - id = "compiler_regression", - source=BenchmarkApiSource( - api_query_url="http://localhost:3000/api/benchmark/get_time_series", - type="benchmark_time_series_api", - # currently we only detect the regression for h100 with dtype bfloat16, and mode inference - # we can extend this to other devices, dtypes and mode in the future - api_endpoint_params_template=""" + name="Compiler Benchmark Regression", + id="compiler_regression", + source=BenchmarkApiSource( + api_query_url="http://localhost:3000/api/benchmark/get_time_series", + type="benchmark_time_series_api", + # currently we only detect the regression for h100 with dtype bfloat16, and mode inference + # we can extend this to other devices, dtypes and mode in the future + api_endpoint_params_template=""" { "name": "compiler_precompute", "query_params": { @@ -29,31 +37,35 @@ "branches": ["main"] } } - """ + """, + ), + # set baseline from past 7 days using avg, and compare with the last 1 day + policy=Policy( + frequency=Frequency(value=1, unit="days"), + range=RangeConfig( + baseline=DayRangeWindow(value=7), + comparison=DayRangeWindow(value=1), + ), + metrics={ + "passrate": RegressionPolicy( + name="passrate", condition="greater_than", threshold=0.9 ), - # set baseline from past 7 days using avg, and compare with the last 1 day - policy=Policy( - frequency=Frequency(value=1, unit="days"), - range=RangeConfig( - baseline=DayRangeWindow(value=7), - comparison=DayRangeWindow(value=1), - ), - metrics={ - "passrate": RegressionPolicy(name="passrate",condition="greater_than", threshold=0.9), - "geomean": RegressionPolicy(name="geomean",condition="greater_than", threshold=0.95), - "dynamo_peak_mem": RegressionPolicy( - name="dynamo_peak_mem",condition="greater_than", threshold=0.9 - ), - }, - notification_config={ - "type":"github", - "repo":"pytorch/test-infra", - "issue": "7081" - } + "geomean": RegressionPolicy( + name="geomean", condition="greater_than", threshold=0.95 ), - ) + "dynamo_peak_mem": RegressionPolicy( + name="dynamo_peak_mem", condition="greater_than", 
threshold=0.9 + ), + }, + notification_config={ + "type": "github", + "repo": "pytorch/test-infra", + "issue": "7081", + }, + ), +) BENCHMARK_REGRESSION_CONFIG = BenchmarkRegressionConfigBook( configs={ - "compiler_regression":COMPILER_BENCHMARK_CONFIG, + "compiler_regression": COMPILER_BENCHMARK_CONFIG, } ) diff --git a/aws/lambda/benchmark_regression_summary_report/common/config_model.py b/aws/lambda/benchmark_regression_summary_report/common/config_model.py index ffc09c8980..dd892ccc45 100644 --- a/aws/lambda/benchmark_regression_summary_report/common/config_model.py +++ b/aws/lambda/benchmark_regression_summary_report/common/config_model.py @@ -96,11 +96,12 @@ def baseline_timedelta(self) -> timedelta: # -------- Policy: metrics -------- @dataclass class RegressionPolicy: + """ + - "greater_than": higher is better; violation if value < baseline * threshold + - "less_than": lower is better; violation if value > baseline * threshold + - "equal_to": value should be ~= baseline * threshold within rel_tol + """ name: str - # Meaning: - # - "greater_than": higher is better; violation if value < baseline * threshold - # - "less_than": lower is better; violation if value > baseline * threshold - # - "equal_to": value should be ~= baseline * threshold within rel_tol condition: Literal["greater_than", "less_than", "equal_to"] threshold: float rel_tol: float = 1e-3 # used only for "equal_to" diff --git a/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py new file mode 100644 index 0000000000..008986e4c5 --- /dev/null +++ b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py @@ -0,0 +1,205 @@ +import logging +import math +from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict +import statistics +from dateutil.parser import isoparse +from common.config_model import RegressionPolicy +from common.benchmark_time_series_api_model import ( + BenchmarkTimeSeriesApiData, + BenchmarkTimeSeriesItem, +) + +RegressionClassifyLabel = Literal[ + "regression", "suspicious", "no_regression", "insufficient_data" +] + + +class BaselineItem(TypedDict): + group_info: Dict[str, Any] + value: float + + +class LatestItem(TypedDict): + group_info: Dict[str, Any] + values: List[float] + + +def to_latest_data_map( + data: BenchmarkTimeSeriesApiData, field="value" +) -> Dict[tuple, LatestItem]: + result = {} + for ts_group in data.time_series: + group_keys = tuple(sorted(ts_group.group_info.items())) + values = [ + float(d[field]) + for d in sorted( + ts_group.data, + key=lambda d: isoparse(d["granularity_bucket"]), # convert to datetime + ) + if field in d + ] + result[group_keys] = { + "group_info": ts_group.group_info, + "values": values, + } + return result + + +def to_baseline_map( + baseline: BenchmarkTimeSeriesApiData, + mode: str = "mean", + field: str = "value", +) -> Dict[tuple, BaselineItem]: + """ + return + { + group_key[tuple]: { + "group_info": {...}, + "baseline": float + } + } + """ + result = {} + for ts_group in baseline.time_series: + group_keys = tuple(sorted(ts_group.group_info.items())) + values = [float(d[field]) for d in ts_group.data if field in d] + if not values: + continue + + if mode == "mean": + val = statistics.fmean(values) + elif mode == "p90": + val = percentile(values, 0.9) + else: + raise ValueError("mode must be 'mean' or 'p90'") + + result[group_keys] = { + "group_info": ts_group.group_info, + "baseline": val, + } + return result + + +def 
classify_flags(flags: list[bool], min_points: int = 3) -> RegressionClassifyLabel: + """ + Classify a sequence of boolean flags to detect regression. + + - regression: last run has >= 2 consecutive True values + - suspicious: there is a run of >= 3 consecutive True values, but not at the end + - no_regression: all other cases + - insufficient_data: not enough data points (< min_points) + + Special case: + - If min_points == 1, then just look at the last flag: + True -> regression + False -> no_regression + """ + n = len(flags) + if n == 0: + return "insufficient_data" + + if min_points == 1: + return "regression" if flags[-1] else "no_regression" + + if n < min_points: + return "insufficient_data" + + # trailing run length + t = 0 + for v in reversed(flags): + if v: + t += 1 + else: + break + if t >= 2: + return "regression" + + # longest run anywhere + longest = cur = 0 + for v in flags: + cur = cur + 1 if v else 0 + longest = max(longest, cur) + + if longest >= 3: + return "suspicious" + + return "no_regression" + + +def percentile(values, q: float): + if not values: + return None + v = sorted(values) + k = (len(v) - 1) * q + f = math.floor(k) + c = math.ceil(k) + if f == c: + return v[int(k)] + return v[f] + (v[c] - v[f]) * (k - f) + + +def _resolve_policy( + metric_policies: Dict[str, RegressionPolicy], + metric: str, +) -> Optional[RegressionPolicy]: + if not metric: + return None + m = metric.lower() + return metric_policies.get(m) + + +def detect_regressions_with_policies( + baseline_map: Dict[tuple, BaselineItem], + latest_map: Dict[tuple, LatestItem], + *, + metric_policies: Dict[str, RegressionPolicy], + min_points: int = 2, +) -> Tuple[List[Dict[str, Any]], bool]: + """ + For each group: + - choose policy by group_info['metric'] + - compute flags via policy.is_violation(value, baseline) + - classify with classify_flags + Returns a list of {group_info, baseline, values, flags, label, policy} + """ + results: List[Dict[str, Any]] = [] + + is_any_regression = False + + for key in sorted(latest_map.keys()): + cur_item = latest_map.get(key) + gi = cur_item["group_info"] if cur_item else {} + latest_vals = cur_item["values"] if cur_item else [] + policy = _resolve_policy(metric_policies, gi.get("metric", "")) + if not policy: + logging.warning( + f"no policy for metric %s with group_info=%s", gi.get("metric", ""), gi + ) + continue + + base_item = baseline_map.get(key) + baseline_value = base_item.get("value") if base_item else None + if not base_item or not baseline_value: + logging.warning( + f"no baseline for metric %s with group_info=%s", + gi.get("metric", ""), + gi, + ) + continue + + # Per-point violations (True = regression) + flags = [policy.is_violation(v, baseline_value) for v in latest_vals] + label = classify_flags(flags, min_points=min_points) + results.append( + { + "group_info": gi, + "baseline": baseline_value, + "values": latest_vals, + "flags": flags, + "label": label, + "policy": policy, + } + ) + if label == "regression": + is_any_regression = True + return results, is_any_regression diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py index 2962e77d0a..0c112156a8 100644 --- a/aws/lambda/benchmark_regression_summary_report/lambda_function.py +++ b/aws/lambda/benchmark_regression_summary_report/lambda_function.py @@ -7,8 +7,24 @@ from collections import defaultdict from concurrent.futures import as_completed, ThreadPoolExecutor import datetime as dt -from 
common.benchmark_time_series_api_model import BenchmarkTimeSeriesApiData, BenchmarkTimeSeriesApiResponse, TimeRange -from common.config_model import BenchmarkApiSource, BenchmarkConfig, Frequency, Policy, RangeConfig +from common.regression_utils import ( + detect_regressions_with_policies, + to_baseline_map, + to_latest_data_map, + to_time_series_item_map, +) +from common.benchmark_time_series_api_model import ( + BenchmarkTimeSeriesApiData, + BenchmarkTimeSeriesApiResponse, + TimeRange, +) +from common.config_model import ( + BenchmarkApiSource, + BenchmarkConfig, + Frequency, + Policy, + RangeConfig, +) from common.config import BENCHMARK_REGRESSION_CONFIG from jinja2 import Template import requests @@ -33,7 +49,8 @@ "CLICKHOUSE_USERNAME": os.getenv("CLICKHOUSE_USERNAME", ""), } -BENMARK_REGRESSION_REPORT_DB="fortesting.benchmark_regression_report" +BENMARK_REGRESSION_REPORT_DB = "fortesting.benchmark_regression_report" + def truncate_to_hour(ts: dt.datetime) -> dt.datetime: return ts.replace(minute=0, second=0, microsecond=0) @@ -61,21 +78,6 @@ def get_clickhouse_client_environment() -> clickhouse_connect.driver.client.Clie ) -def is_unix_timestamp(value: str) -> bool: - """Check if the string is a valid Unix timestamp.""" - if value.isdigit(): # Ensure it's numeric - try: - timestamp = int(value) - # Check if it's within a reasonable range (1970 to 2100) - datetime.fromtimestamp(timestamp) - return True - except (ValueError, OSError): - return False - return False - -def to_hour_str(ts: dt.datetime) -> str: - return truncate_to_hour(ts).isoformat().replace("+00:00", "Z") - def write_to_file(data: Any, filename="", path=""): """ Writes data to a specified file. If no path is provided, writes to the current directory. @@ -101,9 +103,11 @@ def write_to_file(data: Any, filename="", path=""): file.write(data) logger.info(f"File written to: {os.path.abspath(file_path)}") + BENCHMARK_REGRESSION_SUMMARY_REPORT_TABLE = "benchmark_regression_summary_report" -def get_config(config_id: str)-> BenchmarkConfig: + +def get_config(config_id: str) -> BenchmarkConfig: try: config: BenchmarkConfig = BENCHMARK_REGRESSION_CONFIG[config_id] except KeyError: @@ -112,9 +116,9 @@ def get_config(config_id: str)-> BenchmarkConfig: raise e return config + class BenchmarkSummaryProcessor: - """ - """ + """ """ def __init__( self, @@ -127,31 +131,48 @@ def should_generate_report( cc: clickhouse_connect.driver.client.Client, end_time: dt.datetime, config_id: str, - f: Frequency + f: Frequency, ) -> bool: """ - decide wether should generate the report based on the frequency in policy + decide wether should generate the report based on the frequency in policy """ - def get_latest_regression_report( + + def _get_latest_record_ts( cc: clickhouse_connect.driver.Client, config_id: str, - ): - result = cc.query( - "SELECT max(report_date) FROM benchmark_regression_report WHERE report_id = {config_id:String}", + ) -> Optional[dt.datetime]: + res = cc.query( + """ + SELECT max(last_record_ts) + FROM benchmark_regression_report + WHERE report_id = {config_id:String} + """, parameters={"config_id": config_id}, ) - if not result.result_rows or result.result_rows[0][0] is None: + if not res.result_rows or res.result_rows[0][0] is None: return None - return result.result_rows[0][0] + latest: dt.datetime = res.result_rows[0][ + 0 + ] # typically tz-aware UTC from clickhouse_connect + # If not tz-aware, force UTC: + if latest.tzinfo is None: + latest = latest.replace(tzinfo=dt.timezone.utc) + return latest + freq_delta = 
f.to_timedelta() - latest_date = get_latest_regression_report(cc, config_id) + latest_record_ts = _get_latest_record_ts(cc, config_id) + # No report exists yet, generate - if not latest_date: + if not latest_record_ts: return True - # we only verify by date to see if we should generate the data - cutoff = end_time.date() - freq_delta - return latest_date < cutoff + end_utc = ( + end_time if end_time.tzinfo else end_time.replace(tzinfo=dt.timezone.utc) + ) + end_utc = end_utc.astimezone(dt.timezone.utc) + + cutoff = end_time - freq_delta + return latest_record_ts < cutoff def process( self, @@ -176,39 +197,34 @@ def process( cc = tlocal.cc config = get_config(config_id) - # check if we should generate report for end_time - # currently we only verify if end_time > latest report date + policy.freq in db + # check if the current time is > policy's time_delta + previous record_ts from summary_table report_freq = config.policy.frequency - should_generate = self.should_generate_report(cc, end_time,config_id,report_freq) - if not should_generate: - logger.info("[%s] Skip generate report for date: %s with frequency %s",config_id, end_time.date(), report_freq.get_text()) - return; - data_range = config.policy.range - total_timedelta = data_range.baseline_timedelta - logger.info("[%s] fetching benchmark data from source",config_id) - - if config.source.type!="benchmark_time_series_api": - logger.error(f"{config_id}: currently we only suppport benchmark_time_series_api to fetch source data") - return; - - # Comparison: [end_time - 1d, end_time) - comp_s = end_time - data_range.comparison_timedelta() - comp_e = end_time - comparison_data = self._fetch_from_benchmark_ts_api( - config_id=config_id, - start_time=baseline_s, - end_time=baseline_e, - source=config.source, + should_generate = self.should_generate_report( + cc, end_time, config_id, report_freq ) + if not should_generate: + logger.info( + "[%s] Skip generate report for date: %s with frequency %s", + config_id, + end_time.isoformat(), + report_freq.get_text(), + ) + return - data = self._fetch_from_benchmark_ts_api(config_id, end_time, start_time, config.source) - latest_ts = data.time_range.end - # no data in the time range - if not latest_ts: - logger.info("[%s] No data found for report %s",config_id, end_time.date()) + latest = self.get_latest(config, end_time) + if not latest: return - regression_policy = config.policy.metrics + latest_map = to_latest_data_map(latest) + baseline = self.get_basline(config, end_time) + if not baseline: + return + baseline_map = to_baseline_map(baseline) + detect_regressions_with_policies( + baseline_map=baseline_map, + latest_map=latest_map, + metric_policies=config.policy.metrics, + ) return { "start_time": to_timestap_str(start_time), @@ -217,11 +233,42 @@ def process( "records_count": len(records), } - def get_basline(self, config: BenchmarkConfig,end_time: dt.datetime): + def get_latest(self, config: BenchmarkConfig, end_time: dt.datetime): data_range = config.policy.range - baseline_s = end_time - data_range.total_timedelta() - baseline_e = end_time - data_range.comparison_timedelta() + latest_s = end_time - data_range.comparison_timedelta() + latest_e = end_time + latest_data = self._fetch_from_benchmark_ts_api( + config_id=config.id, + start_time=latest_s, + end_time=latest_e, + source=config.source, + ) + if not latest_data.time_range or latest_data.time_range.end: + logger.info( + "[%s] Skip generate report for date:" + "%s with frequency %s, no data found during [%s,%s]", + config.id, + 
latest_s.isoformat(), + latest_e.isoformat(), + ) + return None + + if not self.should_use_data(latest_data.time_range.end, end_time): + logger.info( + "[%s] Skip generate report for date: trying to get_basline" + " with frequency %s, but no data found during for [%s,%s]", + config.id, + config.policy.frequency.get_text(), + latest_s.isoformat(), + latest_e.isoformat(), + ) + return None + return latest_data + def get_basline(self, config: BenchmarkConfig, end_time: dt.datetime): + data_range = config.policy.range + baseline_s = end_time - data_range.total_timedelta() + baseline_e = end_time - data_range.comparison_timedelta() # fetch baseline from api raw_data = self._fetch_from_benchmark_ts_api( config_id=config.id, @@ -229,44 +276,54 @@ def get_basline(self, config: BenchmarkConfig,end_time: dt.datetime): end_time=baseline_e, source=config.source, ) + if not self.should_use_data(raw_data.time_range.end, end_time): + logger.info( + "[%s][get_basline] Skip generate report, no data found during [%s,%s]", + config.id, + baseline_s.isoformat(), + baseline_e.isoformat(), + ) + return None + return raw_data - def to_baseline(data:BenchmarkTimeSeriesApiData): - data. - - - - - - - - - - def _detect_regression(self,end_time: dt.datetime, data: BenchmarkTimeSeriesApiData, policy: Policy): - metrics_dict = policy.metrics - baseline_range = policy.range.baseline_timedelta() - comparison = policy.range.comparison_timedelta() - - - + def should_use_data( + self, + latest_ts_str: str, + end_time: dt.datetime, + min_delta: dt.timedelta = dt.timedelta(days=2), + ) -> bool: + if not latest_ts_str: + return False + latest_dt = isoparse(latest_ts_str) + cutoff = end_time - min_delta + return latest_dt >= cutoff - return - def _fetch_from_benchmark_ts_api(self,config_id:str, end_time: dt.datetime,start_time:dt.datetime, source: BenchmarkApiSource): + def _fetch_from_benchmark_ts_api( + self, + config_id: str, + end_time: dt.datetime, + start_time: dt.datetime, + source: BenchmarkApiSource, + ): str_end_time = end_time.isoformat() str_start_time = start_time.isoformat() - query = source.render(ctx={ - "startTime": str_start_time, - "endTime": str_end_time, - }) + query = source.render( + ctx={ + "startTime": str_start_time, + "endTime": str_end_time, + } + ) url = source.api_query_url try: - resp:BenchmarkTimeSeriesApiResponse = BenchmarkTimeSeriesApiResponse.from_request(url, query) + resp: BenchmarkTimeSeriesApiResponse = ( + BenchmarkTimeSeriesApiResponse.from_request(url, query) + ) return resp.data except Exception as e: raise RuntimeError(f"[{config_id}]Fetch failed:", e) - class WorkerPoolHandler: """ WorkerPoolHandler runs workers in parallel to generate benchmark regression report @@ -288,7 +345,8 @@ def start( args: Optional[argparse.Namespace] = None, ) -> None: logger.info( - "[WorkerPoolHandler] start to process benchmark summary data with config %s", config["name"] + "[WorkerPoolHandler] start to process benchmark summary data with config %s", + config["name"], ) with ThreadPoolExecutor(max_workers=self.max_workers) as executor: futures = [] @@ -313,6 +371,7 @@ def start( logger.warning(f"Error processing future: {e}") errors.append({"error": str(e)}) + def main( args: Optional[argparse.Namespace] = None, github_access_token: str = "", @@ -336,13 +395,10 @@ def main( cc = get_clickhouse_client_environment() time_intervals = TimeIntervalGenerator().generate(cc) - # get jobs in queue from clickhouse for list of time intervals, in parallel handler = WorkerPoolHandler( config_retrievers, - 
BenchmarkSummaryProcessor( - is_dry_run=is_dry_run - ), + BenchmarkSummaryProcessor(is_dry_run=is_dry_run), ) handler.start(time_intervals, args) logger.info(" [Main] Done. work completed.") diff --git a/torchci/lib/clickhouse.ts b/torchci/lib/clickhouse.ts index b48673ad8c..f8c720caea 100644 --- a/torchci/lib/clickhouse.ts +++ b/torchci/lib/clickhouse.ts @@ -18,13 +18,13 @@ export function getClickhouseClient() { request_timeout: 180_000, // 3 mins }); } -// export function getClickhouseClientWritable() { return createClient({ host: process.env.CLICKHOUSE_HUD_USER_URL ?? "http://localhost:8123", username: process.env.CLICKHOUSE_HUD_USER_WRITE_USERNAME ?? "default", password: process.env.CLICKHOUSE_HUD_USER_WRITE_PASSWORD ?? "", + request_timeout: 180_000, // 3 minutes }); } From abb2904e2715ae2e81ddd9bf6af475400c69aef7 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 2 Sep 2025 23:04:34 -0700 Subject: [PATCH 20/27] addid --- .../common/config.py | 2 +- .../common/config_model.py | 2 +- .../common/regression_utils.py | 382 ++++++++++-------- .../lambda_function.py | 74 +--- 4 files changed, 227 insertions(+), 233 deletions(-) diff --git a/aws/lambda/benchmark_regression_summary_report/common/config.py b/aws/lambda/benchmark_regression_summary_report/common/config.py index 18831f6070..c0420a8d88 100644 --- a/aws/lambda/benchmark_regression_summary_report/common/config.py +++ b/aws/lambda/benchmark_regression_summary_report/common/config.py @@ -9,7 +9,7 @@ RangeConfig, ) -# compiler benchmark regression config +# Compiler benchmark regression config # todo(elainewy): eventually each team should configure their own benchmark regression config, currenlty place here for lambda COMPILER_BENCHMARK_CONFIG = BenchmarkConfig( name="Compiler Benchmark Regression", diff --git a/aws/lambda/benchmark_regression_summary_report/common/config_model.py b/aws/lambda/benchmark_regression_summary_report/common/config_model.py index dd892ccc45..663bf4689d 100644 --- a/aws/lambda/benchmark_regression_summary_report/common/config_model.py +++ b/aws/lambda/benchmark_regression_summary_report/common/config_model.py @@ -166,6 +166,7 @@ def create_github_comment(self, body: str, github_token: str) -> Dict[str, Any]: resp = requests.post(url, headers=headers, json={"body": body}) resp.raise_for_status() return resp.json() + @dataclass class Policy: frequency: Frequency @@ -180,7 +181,6 @@ def get_github_notification_config(self) -> Optional[GitHubNotificationConfig]: return notification_from_dict(self.notification_config) # type: ignore - # -------- Top-level benchmark regression config -------- @dataclass class BenchmarkConfig: diff --git a/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py index 008986e4c5..11959a3c97 100644 --- a/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py +++ b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py @@ -3,10 +3,9 @@ from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict import statistics from dateutil.parser import isoparse -from common.config_model import RegressionPolicy +from common.config_model import BenchmarkConfig, RegressionPolicy from common.benchmark_time_series_api_model import ( BenchmarkTimeSeriesApiData, - BenchmarkTimeSeriesItem, ) RegressionClassifyLabel = Literal[ @@ -21,112 +20,19 @@ class BaselineItem(TypedDict): class LatestItem(TypedDict): group_info: Dict[str, Any] - values: List[float] + 
values: List[Dict[str, Any]] -def to_latest_data_map( - data: BenchmarkTimeSeriesApiData, field="value" -) -> Dict[tuple, LatestItem]: - result = {} - for ts_group in data.time_series: - group_keys = tuple(sorted(ts_group.group_info.items())) - values = [ - float(d[field]) - for d in sorted( - ts_group.data, - key=lambda d: isoparse(d["granularity_bucket"]), # convert to datetime - ) - if field in d - ] - result[group_keys] = { - "group_info": ts_group.group_info, - "values": values, - } - return result - - -def to_baseline_map( - baseline: BenchmarkTimeSeriesApiData, - mode: str = "mean", - field: str = "value", -) -> Dict[tuple, BaselineItem]: - """ - return - { - group_key[tuple]: { - "group_info": {...}, - "baseline": float - } - } - """ - result = {} - for ts_group in baseline.time_series: - group_keys = tuple(sorted(ts_group.group_info.items())) - values = [float(d[field]) for d in ts_group.data if field in d] - if not values: - continue - - if mode == "mean": - val = statistics.fmean(values) - elif mode == "p90": - val = percentile(values, 0.9) - else: - raise ValueError("mode must be 'mean' or 'p90'") - - result[group_keys] = { - "group_info": ts_group.group_info, - "baseline": val, - } - return result - - -def classify_flags(flags: list[bool], min_points: int = 3) -> RegressionClassifyLabel: - """ - Classify a sequence of boolean flags to detect regression. - - - regression: last run has >= 2 consecutive True values - - suspicious: there is a run of >= 3 consecutive True values, but not at the end - - no_regression: all other cases - - insufficient_data: not enough data points (< min_points) - - Special case: - - If min_points == 1, then just look at the last flag: - True -> regression - False -> no_regression - """ - n = len(flags) - if n == 0: - return "insufficient_data" - - if min_points == 1: - return "regression" if flags[-1] else "no_regression" - - if n < min_points: - return "insufficient_data" - - # trailing run length - t = 0 - for v in reversed(flags): - if v: - t += 1 - else: - break - if t >= 2: - return "regression" - - # longest run anywhere - longest = cur = 0 - for v in flags: - cur = cur + 1 if v else 0 - longest = max(longest, cur) - - if longest >= 3: - return "suspicious" - - return "no_regression" - - -def percentile(values, q: float): +class PerGroupResult(TypedDict, total=True): + group_info: Dict[str, Any] + baseline: Optional[float] + points: List[Any] + flags: List[bool] + label: RegressionClassifyLabel + policy: Optional["RegressionPolicy"] + + +def percentile(values: list[float], q: float): if not values: return None v = sorted(values) @@ -138,68 +44,206 @@ def percentile(values, q: float): return v[f] + (v[c] - v[f]) * (k - f) -def _resolve_policy( - metric_policies: Dict[str, RegressionPolicy], - metric: str, -) -> Optional[RegressionPolicy]: - if not metric: - return None - m = metric.lower() - return metric_policies.get(m) - - -def detect_regressions_with_policies( - baseline_map: Dict[tuple, BaselineItem], - latest_map: Dict[tuple, LatestItem], - *, - metric_policies: Dict[str, RegressionPolicy], - min_points: int = 2, -) -> Tuple[List[Dict[str, Any]], bool]: - """ - For each group: - - choose policy by group_info['metric'] - - compute flags via policy.is_violation(value, baseline) - - classify with classify_flags - Returns a list of {group_info, baseline, values, flags, label, policy} - """ - results: List[Dict[str, Any]] = [] - - is_any_regression = False - - for key in sorted(latest_map.keys()): - cur_item = latest_map.get(key) - gi = 
cur_item["group_info"] if cur_item else {} - latest_vals = cur_item["values"] if cur_item else [] - policy = _resolve_policy(metric_policies, gi.get("metric", "")) - if not policy: - logging.warning( - f"no policy for metric %s with group_info=%s", gi.get("metric", ""), gi - ) - continue - - base_item = baseline_map.get(key) - baseline_value = base_item.get("value") if base_item else None - if not base_item or not baseline_value: - logging.warning( - f"no baseline for metric %s with group_info=%s", - gi.get("metric", ""), - gi, +class BenchmarkRegressionReportGenerator: + def __init__( + self, + config: BenchmarkConfig, + latest_ts: BenchmarkTimeSeriesApiData, + baseline_ts: BenchmarkTimeSeriesApiData, + ) -> None: + self.metric_policies = config.policy.metrics + self.latest_ts = self._to_latest_data_map(latest_ts) + self.baseline_ts = self._to_baseline_map(baseline_ts) + + def generate(self) -> Tuple[List[PerGroupResult], bool]: + return self.detect_regressions_with_policies( + self.baseline_ts, + self.latest_ts, + metric_policies=self.metric_policies, + ) + + def detect_regressions_with_policies( + self, + baseline_map: Dict[tuple, BaselineItem], + dp_map: Dict[tuple, LatestItem], + *, + metric_policies: Dict[str, RegressionPolicy], + min_points: int = 2, + ) -> Tuple[List[PerGroupResult], bool]: + """ + For each group: + - choose policy by group_info['metric'] + - compute flags via policy.is_violation(value, baseline) + - classify with classify_flags + Returns a list of {group_info, baseline, values, flags, label, policy} + """ + results: List[PerGroupResult] = [] + + is_any_regression = False + + for key in sorted(dp_map.keys()): + cur_item = dp_map.get(key) + gi = cur_item["group_info"] if cur_item else {} + points: List[Any] = cur_item["values"] if cur_item else [] + + base_item = baseline_map.get(key) + baseline_value = base_item.get("value") if base_item else None + + # + policy = self._resolve_policy(metric_policies, gi.get("metric", "")) + if not policy: + results.append( + PerGroupResult( + group_info=gi, + baseline=baseline_value, + points=[], + flags=[], + label="insufficient_data", + policy=None, + ) + ) + continue + + if baseline_value is None or len(points) == 0: + results.append( + PerGroupResult( + group_info=gi, + baseline=baseline_value, + points=[], + flags=[], + label="insufficient_data", + policy=policy, + ) + ) + continue + + # Per-point violations (True = regression) + flags: List[bool] = [ + policy.is_violation(p["value"], baseline_value) for p in points + ] + label = self.classify_flags(flags, min_points=min_points) + + enriched_points = [{**p, "flag": f} for p, f in zip(points, flags)] + results.append( + PerGroupResult( + group_info=gi, + baseline=baseline_value, + points=enriched_points, + flags=[], + label=label, + policy=policy, + ) ) - continue - - # Per-point violations (True = regression) - flags = [policy.is_violation(v, baseline_value) for v in latest_vals] - label = classify_flags(flags, min_points=min_points) - results.append( - { - "group_info": gi, - "baseline": baseline_value, - "values": latest_vals, - "flags": flags, - "label": label, - "policy": policy, + if label == "regression": + is_any_regression = True + return results, is_any_regression + + def _to_latest_data_map( + self, data: "BenchmarkTimeSeriesApiData", field: str = "value" + ) -> Dict[tuple, LatestItem]: + result: Dict[tuple, LatestItem] = {} + for ts_group in data.time_series: + group_keys = tuple(sorted(ts_group.group_info.items())) + points: List[Dict[str, Any]] = [] + for d in 
sorted( + ts_group.data, key=lambda d: isoparse(d["granularity_bucket"]) + ): + if field not in d: + continue + + points.append( + { + "value": float(d[field]), + "commit": d.get("head_sha"), + "branch": d.get("head_branch"), + "timestamp": isoparse(d["granularity_bucket"]), + } + ) + result[group_keys] = { + "group_info": ts_group.group_info, + "values": points, } - ) - if label == "regression": - is_any_regression = True - return results, is_any_regression + return result + + def _to_baseline_map( + self, + baseline: BenchmarkTimeSeriesApiData, + mode: str = "mean", + field: str = "value", + ) -> Dict[tuple, BaselineItem]: + result = {} + for ts_group in baseline.time_series: + group_keys = tuple(sorted(ts_group.group_info.items())) + values = [float(d[field]) for d in ts_group.data if field in d] + if not values: + continue + + if mode == "mean": + val = statistics.fmean(values) + elif mode == "p90": + val = percentile(values, 0.9) + else: + raise ValueError("mode must be 'mean' or 'p90'") + + result[group_keys] = { + "group_info": ts_group.group_info, + "baseline": val, + } + return result + + def classify_flags( + self, flags: list[bool], min_points: int = 3 + ) -> RegressionClassifyLabel: + """ + Classify a sequence of boolean flags to detect regression. + + - regression: last run has >= 2 consecutive True values + - suspicious: there is a run of >= 3 consecutive True values, but not at the end + - no_regression: all other cases + - insufficient_data: not enough data points (< min_points) + + Special case: + - If min_points == 1, then just look at the last flag: + True -> regression + False -> no_regression + """ + n = len(flags) + if n == 0: + return "insufficient_data" + + if min_points == 1: + return "regression" if flags[-1] else "no_regression" + + if n < min_points: + return "insufficient_data" + + # trailing run length + t = 0 + for v in reversed(flags): + if v: + t += 1 + else: + break + if t >= 2: + return "regression" + + # longest run anywhere + longest = cur = 0 + for v in flags: + cur = cur + 1 if v else 0 + longest = max(longest, cur) + + if longest >= 3: + return "suspicious" + + return "no_regression" + + def _resolve_policy( + self, + metric_policies: Dict[str, RegressionPolicy], + metric: str, + ) -> Optional[RegressionPolicy]: + if not metric: + return None + m = metric.lower() + return metric_policies.get(m) diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py index 0c112156a8..ab421a6bc7 100644 --- a/aws/lambda/benchmark_regression_summary_report/lambda_function.py +++ b/aws/lambda/benchmark_regression_summary_report/lambda_function.py @@ -4,37 +4,23 @@ import logging import os import threading -from collections import defaultdict from concurrent.futures import as_completed, ThreadPoolExecutor import datetime as dt -from common.regression_utils import ( - detect_regressions_with_policies, - to_baseline_map, - to_latest_data_map, - to_time_series_item_map, -) +from typing import Optional +from common.regression_utils import BenchmarkRegressionReportGenerator +import clickhouse_connect from common.benchmark_time_series_api_model import ( - BenchmarkTimeSeriesApiData, BenchmarkTimeSeriesApiResponse, - TimeRange, ) from common.config_model import ( BenchmarkApiSource, BenchmarkConfig, Frequency, - Policy, - RangeConfig, ) from common.config import BENCHMARK_REGRESSION_CONFIG -from jinja2 import Template -import requests from dateutil.parser import isoparse -from typing 
import Any, Dict, Iterable, List, Optional, Set -import clickhouse_connect -import yaml -from github import Auth, Github - +from pprint import pprint logging.basicConfig( level=logging.INFO, @@ -77,36 +63,8 @@ def get_clickhouse_client_environment() -> clickhouse_connect.driver.client.Clie password=ENVS["CLICKHOUSE_PASSWORD"], ) - -def write_to_file(data: Any, filename="", path=""): - """ - Writes data to a specified file. If no path is provided, writes to the current directory. - - :param data: The content to write to the file. - :param filename: The name of the file (default: 'output.txt'). - :param path: The directory where the file should be saved (default: current directory). - """ - - if not filename: - filename = "output_snapshot.json" - if not path: - path = "." - - # Ensure the path exists - os.makedirs(path, exist_ok=True) - - # Construct full file path - file_path = os.path.join(path, filename) - - # Write data to file - with open(file_path, "w", encoding="utf-8") as file: - file.write(data) - logger.info(f"File written to: {os.path.abspath(file_path)}") - - BENCHMARK_REGRESSION_SUMMARY_REPORT_TABLE = "benchmark_regression_summary_report" - def get_config(config_id: str) -> BenchmarkConfig: try: config: BenchmarkConfig = BENCHMARK_REGRESSION_CONFIG[config_id] @@ -116,7 +74,6 @@ def get_config(config_id: str) -> BenchmarkConfig: raise e return config - class BenchmarkSummaryProcessor: """ """ @@ -170,7 +127,6 @@ def _get_latest_record_ts( end_time if end_time.tzinfo else end_time.replace(tzinfo=dt.timezone.utc) ) end_utc = end_utc.astimezone(dt.timezone.utc) - cutoff = end_time - freq_delta return latest_record_ts < cutoff @@ -214,24 +170,18 @@ def process( latest = self.get_latest(config, end_time) if not latest: return - - latest_map = to_latest_data_map(latest) baseline = self.get_basline(config, end_time) if not baseline: return - baseline_map = to_baseline_map(baseline) - detect_regressions_with_policies( - baseline_map=baseline_map, - latest_map=latest_map, - metric_policies=config.policy.metrics, - ) - return { - "start_time": to_timestap_str(start_time), - "end_time": to_timestap_str(end_time), - "jobs_count": len(queued_jobs), - "records_count": len(records), - } + generator = BenchmarkRegressionReportGenerator( + config=config, latest_ts=latest, baseline_ts=baseline + ) + result, regression_detected = generator.generate() + if self.is_dry_run: + print("regression_detected: ", regression_detected) + print(json.dumps(result, indent=2, default=str)) + return def get_latest(self, config: BenchmarkConfig, end_time: dt.datetime): data_range = config.policy.range From a0a48c1078a8e43b58ef30f95257b9283522b211 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 2 Sep 2025 23:28:25 -0700 Subject: [PATCH 21/27] addid --- .../common/config.py | 8 + .../lambda_function.py | 171 ++++++++---------- 2 files changed, 85 insertions(+), 94 deletions(-) diff --git a/aws/lambda/benchmark_regression_summary_report/common/config.py b/aws/lambda/benchmark_regression_summary_report/common/config.py index c0420a8d88..6ddf84696a 100644 --- a/aws/lambda/benchmark_regression_summary_report/common/config.py +++ b/aws/lambda/benchmark_regression_summary_report/common/config.py @@ -64,8 +64,16 @@ }, ), ) + BENCHMARK_REGRESSION_CONFIG = BenchmarkRegressionConfigBook( configs={ "compiler_regression": COMPILER_BENCHMARK_CONFIG, } ) + + +def get_benchmark_regression_config(config_id: str) -> BenchmarkConfig: + try: + return BENCHMARK_REGRESSION_CONFIG[config_id] + except KeyError: + raise 
ValueError(f"Invalid config id: {config_id}") diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py index ab421a6bc7..9ba17de7ed 100644 --- a/aws/lambda/benchmark_regression_summary_report/lambda_function.py +++ b/aws/lambda/benchmark_regression_summary_report/lambda_function.py @@ -17,11 +17,9 @@ BenchmarkConfig, Frequency, ) -from common.config import BENCHMARK_REGRESSION_CONFIG +from common.config import get_benchmark_regression_config from dateutil.parser import isoparse -from pprint import pprint - logging.basicConfig( level=logging.INFO, ) @@ -35,7 +33,9 @@ "CLICKHOUSE_USERNAME": os.getenv("CLICKHOUSE_USERNAME", ""), } -BENMARK_REGRESSION_REPORT_DB = "fortesting.benchmark_regression_report" +BENCHMARK_REGRESSION_REPORT_TABLE = "fortesting.benchmark_regression_report" + +BENCHMARK_REGRESSION_TRACKING_CONFIG_IDS = ["compiler_regression"] def truncate_to_hour(ts: dt.datetime) -> dt.datetime: @@ -63,16 +63,11 @@ def get_clickhouse_client_environment() -> clickhouse_connect.driver.client.Clie password=ENVS["CLICKHOUSE_PASSWORD"], ) -BENCHMARK_REGRESSION_SUMMARY_REPORT_TABLE = "benchmark_regression_summary_report" -def get_config(config_id: str) -> BenchmarkConfig: - try: - config: BenchmarkConfig = BENCHMARK_REGRESSION_CONFIG[config_id] - except KeyError: - raise ValueError(f"Invalid config id: {config_id}") - except Exception as e: - raise e - return config +BENCHMARK_REGRESSION_SUMMARY_REPORT_TABLE = ( + "fortesting.benchmark_regression_summary_report" +) + class BenchmarkSummaryProcessor: """ """ @@ -83,53 +78,6 @@ def __init__( ) -> None: self.is_dry_run = is_dry_run - def should_generate_report( - self, - cc: clickhouse_connect.driver.client.Client, - end_time: dt.datetime, - config_id: str, - f: Frequency, - ) -> bool: - """ - decide wether should generate the report based on the frequency in policy - """ - - def _get_latest_record_ts( - cc: clickhouse_connect.driver.Client, - config_id: str, - ) -> Optional[dt.datetime]: - res = cc.query( - """ - SELECT max(last_record_ts) - FROM benchmark_regression_report - WHERE report_id = {config_id:String} - """, - parameters={"config_id": config_id}, - ) - if not res.result_rows or res.result_rows[0][0] is None: - return None - latest: dt.datetime = res.result_rows[0][ - 0 - ] # typically tz-aware UTC from clickhouse_connect - # If not tz-aware, force UTC: - if latest.tzinfo is None: - latest = latest.replace(tzinfo=dt.timezone.utc) - return latest - - freq_delta = f.to_timedelta() - latest_record_ts = _get_latest_record_ts(cc, config_id) - - # No report exists yet, generate - if not latest_record_ts: - return True - - end_utc = ( - end_time if end_time.tzinfo else end_time.replace(tzinfo=dt.timezone.utc) - ) - end_utc = end_utc.astimezone(dt.timezone.utc) - cutoff = end_time - freq_delta - return latest_record_ts < cutoff - def process( self, config_id: str, @@ -151,11 +99,21 @@ def process( else: tlocal.cc = get_clickhouse_client_environment() cc = tlocal.cc - config = get_config(config_id) + + try: + config = get_benchmark_regression_config(config_id) + except ValueError as e: + logger.error(f"Skip process, Invalid config: {e}") + return + except Exception as e: + print( + f"Something else went wrong when call get_benchmark_regression_config: {e}" + ) + return # check if the current time is > policy's time_delta + previous record_ts from summary_table report_freq = config.policy.frequency - should_generate = self.should_generate_report( + 
should_generate = self._should_generate_report( cc, end_time, config_id, report_freq ) if not should_generate: @@ -273,6 +231,50 @@ def _fetch_from_benchmark_ts_api( except Exception as e: raise RuntimeError(f"[{config_id}]Fetch failed:", e) + def _should_generate_report( + self, + cc: clickhouse_connect.driver.client.Client, + end_time: dt.datetime, + config_id: str, + f: Frequency, + ) -> bool: + def _get_latest_record_ts( + cc: clickhouse_connect.driver.Client, + config_id: str, + ) -> Optional[dt.datetime]: + table = BENCHMARK_REGRESSION_REPORT_TABLE + res = cc.query( + f""" + SELECT max(last_record_ts) + FROM {table} + WHERE report_id = {{config_id:String}} + """, + parameters={"config_id": config_id}, + ) + if not res.result_rows or res.result_rows[0][0] is None: + return None + latest: dt.datetime = res.result_rows[0][ + 0 + ] # typically tz-aware UTC from clickhouse_connect + # If not tz-aware, force UTC: + if latest.tzinfo is None: + latest = latest.replace(tzinfo=dt.timezone.utc) + return latest + + freq_delta = f.to_timedelta() + latest_record_ts = _get_latest_record_ts(cc, config_id) + + # No report exists yet, generate + if not latest_record_ts: + return True + + end_utc = ( + end_time if end_time.tzinfo else end_time.replace(tzinfo=dt.timezone.utc) + ) + end_utc = end_utc.astimezone(dt.timezone.utc) + cutoff = end_time - freq_delta + return latest_record_ts < cutoff + class WorkerPoolHandler: """ @@ -284,26 +286,32 @@ class WorkerPoolHandler: def __init__( self, benchmark_summary_processor: BenchmarkSummaryProcessor, - max_workers: int = 4, + max_workers: int = 6, ): self.benchmark_summary_processor = benchmark_summary_processor self.max_workers = max_workers def start( self, - config: Dict[str, Any], + config_ids: list[str], args: Optional[argparse.Namespace] = None, ) -> None: logger.info( - "[WorkerPoolHandler] start to process benchmark summary data with config %s", - config["name"], + "[WorkerPoolHandler] start to process benchmark " + "summary data with config_ids %s", + config_ids, + ) + end_time = dt.datetime.now(dt.timezone.utc).replace( + minute=0, second=0, microsecond=0 ) + logger.info("current time with hour granularity(utc) %s", end_time) with ThreadPoolExecutor(max_workers=self.max_workers) as executor: futures = [] - for interval in time_intervals: + for config_id in config_ids: future = executor.submit( self.benchmark_summary_processor.process, - config, + config_id, + end_time, cc=None, args=args, ) @@ -343,14 +351,12 @@ def main( ) else: cc = get_clickhouse_client_environment() - time_intervals = TimeIntervalGenerator().generate(cc) # get jobs in queue from clickhouse for list of time intervals, in parallel handler = WorkerPoolHandler( - config_retrievers, BenchmarkSummaryProcessor(is_dry_run=is_dry_run), ) - handler.start(time_intervals, args) + handler.start(BENCHMARK_REGRESSION_TRACKING_CONFIG_IDS, args) logger.info(" [Main] Done. work completed.") @@ -395,12 +401,6 @@ def parse_args() -> argparse.Namespace: default=ENVS["GITHUB_ACCESS_TOKEN"], help="the github access token to access github api", ) - parser.add_argument( - "--local-output", - action="store_true", - help="when set, generate json result in local environment. " - + "this is only used for local test environment when dry-run is enabled", - ) parser.add_argument( "--not-dry-run", action="store_true", @@ -408,20 +408,6 @@ def parse_args() -> argparse.Namespace: + "environment. 
By default, we run in dry-run mode for local " + "environment", ) - parser.add_argument( - "--output-file-name", - type=str, - default="job_queue_times_snapshot.json", - help="the name of output file for local environment. this " - + "is only used for local test environment when local-output is enabled", - ) - parser.add_argument( - "--output-file-path", - type=str, - default="", - help="the path of output file for local environment. this is " - + "only used for local test environment when local-output is enabled", - ) args, _ = parser.parse_known_args() return args @@ -442,9 +428,6 @@ def local_run() -> None: args, args.github_access_token, is_dry_run=is_dry_run, - local_output=args.local_output, - output_snapshot_file_name=args.output_file_name, - output_snapshot_file_path=args.output_file_path, ) From a692bb9d88294b70312014a6d9b803354b804dd1 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 3 Sep 2025 02:01:15 -0700 Subject: [PATCH 22/27] addid --- .../common/benchmark_time_series_api_model.py | 2 +- .../common/config_model.py | 1 - .../common/regression_utils.py | 25 ++++---- .../lambda_function.py | 61 ++++++++----------- 4 files changed, 41 insertions(+), 48 deletions(-) diff --git a/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py b/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py index 2fa8960013..7aad397aa3 100644 --- a/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py +++ b/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py @@ -28,7 +28,7 @@ class BenchmarkTimeSeriesApiResponse: @classmethod def from_request( - cls, url: str, query: dict, timeout: int = 60 + cls, url: str, query: dict, timeout: int = 180 ) -> "BenchmarkTimeSeriesApiResponse": """ Send a POST request and parse into BenchmarkTimeSeriesApiResponse. 
diff --git a/aws/lambda/benchmark_regression_summary_report/common/config_model.py b/aws/lambda/benchmark_regression_summary_report/common/config_model.py index 663bf4689d..53e59f80f9 100644 --- a/aws/lambda/benchmark_regression_summary_report/common/config_model.py +++ b/aws/lambda/benchmark_regression_summary_report/common/config_model.py @@ -177,7 +177,6 @@ class Policy: def get_github_notification_config(self) -> Optional[GitHubNotificationConfig]: if not self.notification_config: return None - if self.notification_config and self.notification_config.get("type") == "github": return notification_from_dict(self.notification_config) # type: ignore diff --git a/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py index 11959a3c97..bf66bdd92b 100644 --- a/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py +++ b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py @@ -7,6 +7,9 @@ from common.benchmark_time_series_api_model import ( BenchmarkTimeSeriesApiData, ) +import pprint + +logger = logging.getLogger() RegressionClassifyLabel = Literal[ "regression", "suspicious", "no_regression", "insufficient_data" @@ -27,7 +30,6 @@ class PerGroupResult(TypedDict, total=True): group_info: Dict[str, Any] baseline: Optional[float] points: List[Any] - flags: List[bool] label: RegressionClassifyLabel policy: Optional["RegressionPolicy"] @@ -53,7 +55,7 @@ def __init__( ) -> None: self.metric_policies = config.policy.metrics self.latest_ts = self._to_latest_data_map(latest_ts) - self.baseline_ts = self._to_baseline_map(baseline_ts) + self.baseline_ts = self._to_baseline_map(baseline_ts, mode="max") def generate(self) -> Tuple[List[PerGroupResult], bool]: return self.detect_regressions_with_policies( @@ -82,35 +84,34 @@ def detect_regressions_with_policies( is_any_regression = False for key in sorted(dp_map.keys()): + logger.info("key: %s", key) cur_item = dp_map.get(key) gi = cur_item["group_info"] if cur_item else {} points: List[Any] = cur_item["values"] if cur_item else [] base_item = baseline_map.get(key) + logger.info("base_item for keys(%s):\n%s ",key, pprint.pformat(base_item)) baseline_value = base_item.get("value") if base_item else None - - # policy = self._resolve_policy(metric_policies, gi.get("metric", "")) if not policy: + logger.warning("No policy for %s", gi) results.append( PerGroupResult( group_info=gi, baseline=baseline_value, points=[], - flags=[], label="insufficient_data", policy=None, ) ) continue - if baseline_value is None or len(points) == 0: + logger.warning("baseline_value is %s, len(points) == %s", baseline_value,len(points)) results.append( PerGroupResult( group_info=gi, baseline=baseline_value, points=[], - flags=[], label="insufficient_data", policy=policy, ) @@ -129,7 +130,6 @@ def detect_regressions_with_policies( group_info=gi, baseline=baseline_value, points=enriched_points, - flags=[], label=label, policy=policy, ) @@ -150,12 +150,11 @@ def _to_latest_data_map( ): if field not in d: continue - points.append( { "value": float(d[field]), - "commit": d.get("head_sha"), - "branch": d.get("head_branch"), + "commit": d.get("commit"), + "branch": d.get("branch"), "timestamp": isoparse(d["granularity_bucket"]), } ) @@ -182,12 +181,14 @@ def _to_baseline_map( val = statistics.fmean(values) elif mode == "p90": val = percentile(values, 0.9) + elif mode == "max": + val = max(values) else: raise ValueError("mode must be 'mean' or 'p90'") 
result[group_keys] = { "group_info": ts_group.group_info, - "baseline": val, + "value": val, } return result diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py index 9ba17de7ed..29c2596e0e 100644 --- a/aws/lambda/benchmark_regression_summary_report/lambda_function.py +++ b/aws/lambda/benchmark_regression_summary_report/lambda_function.py @@ -1,12 +1,13 @@ #!/usr/bin/env python import argparse +from concurrent.futures import ThreadPoolExecutor, as_completed import json import logging import os import threading -from concurrent.futures import as_completed, ThreadPoolExecutor +import requests import datetime as dt -from typing import Optional +from typing import Any, Optional from common.regression_utils import BenchmarkRegressionReportGenerator import clickhouse_connect from common.benchmark_time_series_api_model import ( @@ -46,7 +47,8 @@ def get_clickhouse_client( host: str, user: str, password: str ) -> clickhouse_connect.driver.client.Client: # for local testing only, disable SSL verification - # return clickhouse_connect.get_client(host=host, user=user, password=password,secure=True, verify=False) + logger.info("trying to connect with clickhouse") + return clickhouse_connect.get_client(host=host, user=user, password=password,secure=True, verify=False) return clickhouse_connect.get_client( host=host, user=user, password=password, secure=True @@ -63,7 +65,6 @@ def get_clickhouse_client_environment() -> clickhouse_connect.driver.client.Clie password=ENVS["CLICKHOUSE_PASSWORD"], ) - BENCHMARK_REGRESSION_SUMMARY_REPORT_TABLE = ( "fortesting.benchmark_regression_summary_report" ) @@ -87,6 +88,7 @@ def process( ): # ensure each thread has its own clickhouse client. clickhouse client # is not thread-safe. 
+ logger.info("here") if cc is None: tlocal = threading.local() if not hasattr(tlocal, "cc") or tlocal.cc is None: @@ -99,9 +101,11 @@ def process( else: tlocal.cc = get_clickhouse_client_environment() cc = tlocal.cc + logger.info("i'm here") try: config = get_benchmark_regression_config(config_id) + logger.info("found config for config_id %s",config_id) except ValueError as e: logger.error(f"Skip process, Invalid config: {e}") return @@ -118,15 +122,19 @@ def process( ) if not should_generate: logger.info( - "[%s] Skip generate report for date: %s with frequency %s", + "[%s] Skip generate report for date:%s with frequency %s, no data found during [%s,%s]", config_id, end_time.isoformat(), report_freq.get_text(), ) return + else: + logger.info( "[%s] Plan to generate report for time: %s with frequency %s ...", + config_id,end_time,report_freq.get_text()) latest = self.get_latest(config, end_time) if not latest: + logger.info("no latest data found") return baseline = self.get_basline(config, end_time) if not baseline: @@ -151,25 +159,9 @@ def get_latest(self, config: BenchmarkConfig, end_time: dt.datetime): end_time=latest_e, source=config.source, ) - if not latest_data.time_range or latest_data.time_range.end: - logger.info( - "[%s] Skip generate report for date:" - "%s with frequency %s, no data found during [%s,%s]", - config.id, - latest_s.isoformat(), - latest_e.isoformat(), - ) + if not latest_data.time_range or not latest_data.time_range.end: return None - if not self.should_use_data(latest_data.time_range.end, end_time): - logger.info( - "[%s] Skip generate report for date: trying to get_basline" - " with frequency %s, but no data found during for [%s,%s]", - config.id, - config.policy.frequency.get_text(), - latest_s.isoformat(), - latest_e.isoformat(), - ) return None return latest_data @@ -213,23 +205,27 @@ def _fetch_from_benchmark_ts_api( start_time: dt.datetime, source: BenchmarkApiSource, ): - str_end_time = end_time.isoformat() - str_start_time = start_time.isoformat() + str_end_time = end_time.strftime("%Y-%m-%dT%H:%M:%S") + str_start_time = start_time.strftime("%Y-%m-%dT%H:%M:%S") query = source.render( ctx={ "startTime": str_start_time, - "endTime": str_end_time, + "stopTime": str_end_time, } ) url = source.api_query_url + + logger.info("trying to call %s",url) try: resp: BenchmarkTimeSeriesApiResponse = ( BenchmarkTimeSeriesApiResponse.from_request(url, query) ) - return resp.data + except requests.exceptions.HTTPError as e: + logger.error("Server error message: %s", e.response.json().get("error")) + raise except Exception as e: - raise RuntimeError(f"[{config_id}]Fetch failed:", e) + raise RuntimeError(f"[{config_id}]Fetch failed: {e}") def _should_generate_report( self, @@ -344,13 +340,7 @@ def main( raise ValueError("Missing environment variable GITHUB_ACCESS_TOKEN") # get time intervals. - logger.info(" [Main] generating time intervals ....") - if args: - cc = get_clickhouse_client( - args.clickhouse_endpoint, args.clickhouse_username, args.clickhouse_password - ) - else: - cc = get_clickhouse_client_environment() + logger.info("[Main] start work ....") # get jobs in queue from clickhouse for list of time intervals, in parallel handler = WorkerPoolHandler( @@ -419,6 +409,9 @@ def local_run() -> None: args = parse_args() + + logger.info("args: %s",args) + # update environment variables for input parameters # always run in dry-run mode in local environment, unless it's disabled. 
From f2cd0e23adcb68d12a98bdcc7b661af886168e48 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 3 Sep 2025 02:43:49 -0700
Subject: [PATCH 23/27] addid

---
 .../common/config.py                          |  8 +-
 .../common/config_model.py                    | 33 +++++--
 .../common/regression_utils.py                | 94 +++++++++++--------
 3 files changed, 85 insertions(+), 50 deletions(-)

diff --git a/aws/lambda/benchmark_regression_summary_report/common/config.py b/aws/lambda/benchmark_regression_summary_report/common/config.py
index 6ddf84696a..2c5280a63b 100644
--- a/aws/lambda/benchmark_regression_summary_report/common/config.py
+++ b/aws/lambda/benchmark_regression_summary_report/common/config.py
@@ -48,13 +48,13 @@
         ),
         metrics={
             "passrate": RegressionPolicy(
-                name="passrate", condition="greater_than", threshold=0.9
+                name="passrate", condition="greater_than", threshold=0.9, baseline_aggregation="max"
             ),
             "geomean": RegressionPolicy(
-                name="geomean", condition="greater_than", threshold=0.95
+                name="geomean", condition="greater_than", threshold=0.95,baseline_aggregation="max"
             ),
-            "dynamo_peak_mem": RegressionPolicy(
-                name="dynamo_peak_mem", condition="greater_than", threshold=0.9
+            "compression_ratio": RegressionPolicy(
+                name="compression_ratio", condition="greater_than", threshold=0.9, baseline_aggregation="max"
             ),
         },
         notification_config={
diff --git a/aws/lambda/benchmark_regression_summary_report/common/config_model.py b/aws/lambda/benchmark_regression_summary_report/common/config_model.py
index 53e59f80f9..66e9ed04ed 100644
--- a/aws/lambda/benchmark_regression_summary_report/common/config_model.py
+++ b/aws/lambda/benchmark_regression_summary_report/common/config_model.py
@@ -97,30 +97,45 @@ def baseline_timedelta(self) -> timedelta:
 @dataclass
 class RegressionPolicy:
     """
-    - "greater_than": higher is better; violation if value < baseline * threshold
-    - "less_than": lower is better; violation if value > baseline * threshold
-    - "equal_to": value should be ~= baseline * threshold within rel_tol
+    Defines the policy for a given metric.
+    - "greater_than": higher is better; violation if new value < baseline * threshold
+    - "less_than": lower is better; violation if new value > baseline * threshold
+    - "equal_to": new value should be ~= baseline * threshold within rel_tol
+    - "greater_equal": higher is better; violation if new value <= baseline * threshold
+    - "less_equal": lower is better; violation if new value >= baseline * threshold
+
+    """

     name: str
-    condition: Literal["greater_than", "less_than", "equal_to"]
+    condition: Literal["greater_than", "less_than", "equal_to","greater_equal","less_equal"]
     threshold: float
+    baseline_aggregation: Literal["avg", "max", "min", "p50", "p90", "p95","latest","earliest"] = "max"
     rel_tol: float = 1e-3  # used only for "equal_to"

     def is_violation(self, value: float, baseline: float) -> bool:
         target = baseline * self.threshold
         if self.condition == "greater_than":
-            # value should be >= target
+            # value must be strictly greater than target
+            return value <= target
+
+        if self.condition == "greater_equal":
+            # value must be greater or equal to target
             return value < target

         if self.condition == "less_than":
-            # value should be <= target
+            # value must be strictly less than target
+            return value >= target
+
+        if self.condition == "less_equal":
+            # value must be less or equal to target
             return value > target

-        # equal_to: |value - target| should be within rel_tol * max(1, |target|)
-        denom = max(1.0, abs(target))
-        return abs(value - target) > self.rel_tol * denom
+        if self.condition == "equal_to":
+            # |value - target| should be within rel_tol * max(1, |target|)
+            denom = max(1.0, abs(target))
+            return abs(value - target) > self.rel_tol * denom
+        raise ValueError(f"Unknown condition: {self.condition}")


 class BaseNotificationConfig:
     # every subclass must override this
     type_tag: ClassVar[str]
diff --git a/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py
index bf66bdd92b..978fd8b71e 100644
--- a/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py
+++ b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py
@@ -6,6 +6,7 @@
 from common.config_model import BenchmarkConfig, RegressionPolicy
 from common.benchmark_time_series_api_model import (
     BenchmarkTimeSeriesApiData,
+    BenchmarkTimeSeriesItem,
 )
 import pprint

@@ -21,7 +22,7 @@ class BaselineItem(TypedDict):
     value: float


-class LatestItem(TypedDict):
+class BenchmarkValueItem(TypedDict):
     group_info: Dict[str, Any]
     values: List[Dict[str, Any]]

@@ -35,8 +36,6 @@ class PerGroupResult(TypedDict, total=True):


 def percentile(values: list[float], q: float):
-    if not values:
-        return None
     v = sorted(values)
     k = (len(v) - 1) * q
     f = math.floor(k)
@@ -54,20 +53,20 @@ def __init__(
         baseline_ts: BenchmarkTimeSeriesApiData,
     ) -> None:
         self.metric_policies = config.policy.metrics
-        self.latest_ts = self._to_latest_data_map(latest_ts)
-        self.baseline_ts = self._to_baseline_map(baseline_ts, mode="max")
+        self.latest_ts = self._to_data_map(latest_ts)
+        self.baseline_raw = self._to_data_map(baseline_ts)

     def generate(self) -> Tuple[List[PerGroupResult], bool]:
         return self.detect_regressions_with_policies(
-            self.baseline_ts,
+            self.baseline_raw,
             self.latest_ts,
             metric_policies=self.metric_policies,
         )

     def detect_regressions_with_policies(
         self,
-        baseline_map: Dict[tuple, BaselineItem],
-        dp_map: Dict[tuple, LatestItem],
+        baseline_map: Dict[tuple, BenchmarkValueItem],
+        dp_map: Dict[tuple, BenchmarkValueItem],
         *,
         metric_policies: Dict[str, RegressionPolicy],
         min_points: int = 2,
@@ -90,27 +89,41 @@ def detect_regressions_with_policies(
             points: List[Any] = cur_item["values"] if cur_item else []

             base_item = baseline_map.get(key)
+            if not base_item:
+                logger.warning("Skip. No baseline item found for %s", gi)
+                results.append(
+                    PerGroupResult(
+                        group_info=gi,
+                        baseline=None,
+                        points=[],
+                        label="insufficient_data",
+                        policy=None,
+                    )
+                )
+                continue
             logger.info("base_item for keys(%s):\n%s ",key, pprint.pformat(base_item))
-            baseline_value = base_item.get("value") if base_item else None
             policy = self._resolve_policy(metric_policies, gi.get("metric", ""))
             if not policy:
                 logger.warning("No policy for %s", gi)
                 results.append(
                     PerGroupResult(
                         group_info=gi,
-                        baseline=baseline_value,
+                        baseline=None,
                         points=[],
                         label="insufficient_data",
                         policy=None,
                     )
                 )
                 continue
+
+            baseline_aggre_mode = policy.baseline_aggregation
+            baseline_value = self._get_baseline(base_item,baseline_aggre_mode)
             if baseline_value is None or len(points) == 0:
                 logger.warning("baseline_value is %s, len(points) == %s", baseline_value,len(points))
                 results.append(
                     PerGroupResult(
                         group_info=gi,
-                        baseline=baseline_value,
+                        baseline=None,
                         points=[],
                         label="insufficient_data",
                         policy=policy,
                     )
@@ -120,7 +133,7 @@ def detect_regressions_with_policies(

             # Per-point violations (True = regression)
             flags: List[bool] = [
-                policy.is_violation(p["value"], baseline_value) for p in points
+                policy.is_violation(p["value"], baseline_value["value"]) for p in points
             ]

             label = self.classify_flags(flags, min_points=min_points)
@@ -138,10 +151,10 @@ def detect_regressions_with_policies(
                 is_any_regression = True
         return results, is_any_regression

-    def _to_latest_data_map(
+    def _to_data_map(
         self, data: "BenchmarkTimeSeriesApiData", field: str = "value"
-    ) -> Dict[tuple, LatestItem]:
-        result: Dict[tuple, LatestItem] = {}
+    ) -> Dict[tuple, BenchmarkValueItem]:
+        result: Dict[tuple, BenchmarkValueItem] = {}
         for ts_group in data.time_series:
             group_keys = tuple(sorted(ts_group.group_info.items()))
             points: List[Dict[str, Any]] = []
@@ -164,32 +177,39 @@ def _to_latest_data_map(
             }
         return result

-    def _to_baseline_map(
+    def _get_baseline(
         self,
-        baseline: BenchmarkTimeSeriesApiData,
+        data: BenchmarkValueItem,
         mode: str = "mean",
         field: str = "value",
-    ) -> Dict[tuple, BaselineItem]:
-        result = {}
-        for ts_group in baseline.time_series:
-            group_keys = tuple(sorted(ts_group.group_info.items()))
-            values = [float(d[field]) for d in ts_group.data if field in d]
-            if not values:
-                continue
-
-            if mode == "mean":
-                val = statistics.fmean(values)
-            elif mode == "p90":
-                val = percentile(values, 0.9)
-            elif mode == "max":
-                val = max(values)
-            else:
-                raise ValueError("mode must be 'mean' or 'p90'")
+    ) -> Optional[BaselineItem]:
+        values = [float(d[field]) for d in data["values"] if field in d]
+        if not values:
+            return None

-            result[group_keys] = {
-                "group_info": ts_group.group_info,
-                "value": val,
-            }
+        if mode == "mean":
+            val = statistics.fmean(values)
+        elif mode == "p90":
+            val = percentile(values, 0.9)
+        elif mode == "max":
+            val = max(values)
+        elif mode == "min":
+            val = min(values)
+        elif mode == "latest":
+            val = values[-1]
+        elif mode == "earliest":
+            val = values[0]
+        elif mode == "p50":
+            val = percentile(values, 0.5)
+        elif mode == "p95":
+            val = percentile(values, 0.95)
+        else:
+            logger.warning("Unknown mode: %s", mode)
+            return None
+        result:BaselineItem = {
+            "group_info": data["group_info"],
+            "value": val,
+        }
         return result

     def classify_flags(
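To make the new condition semantics concrete before the follow-up patches below, here is a minimal standalone sketch that mirrors the RegressionPolicy.is_violation logic added above. The free function name and the sample numbers are invented for illustration; nothing here imports the lambda's modules.

    def is_violation(condition: str, value: float, baseline: float,
                     threshold: float, rel_tol: float = 1e-3) -> bool:
        # True means the new value regresses relative to baseline * threshold.
        target = baseline * threshold
        if condition == "greater_than":   # higher is better; must be strictly above target
            return value <= target
        if condition == "greater_equal":  # higher is better; hitting target exactly is fine
            return value < target
        if condition == "less_than":      # lower is better; must be strictly below target
            return value >= target
        if condition == "less_equal":     # lower is better; hitting target exactly is fine
            return value > target
        if condition == "equal_to":       # must stay within rel_tol of target
            denom = max(1.0, abs(target))
            return abs(value - target) > rel_tol * denom
        raise ValueError(f"Unknown condition: {condition}")

    # Illustrative numbers: passrate policy with threshold 0.9 against a baseline of 0.95
    # (target is roughly 0.855).
    assert is_violation("greater_than", 0.80, 0.95, 0.9) is True   # 0.80 <= target -> regression
    assert is_violation("greater_than", 0.90, 0.95, 0.9) is False  # 0.90 >  target -> healthy

With this reading, the only difference between "greater_than" and the "greater_equal" variant introduced in the next patch is whether landing exactly on baseline * threshold counts as a regression.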
From f9d7f66f741d682fd286f2a0d151780ad6d066ac Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 3 Sep 2025 02:53:20 -0700
Subject: [PATCH 24/27] addid

---
 .../common/config.py                          |  6 +++---
 .../common/config_model.py                    | 10 +++++-----
 .../lambda_function.py                        |  5 +----
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/aws/lambda/benchmark_regression_summary_report/common/config.py b/aws/lambda/benchmark_regression_summary_report/common/config.py
index 2c5280a63b..d894c5c544 100644
--- a/aws/lambda/benchmark_regression_summary_report/common/config.py
+++ b/aws/lambda/benchmark_regression_summary_report/common/config.py
@@ -48,13 +48,13 @@
         ),
         metrics={
             "passrate": RegressionPolicy(
-                name="passrate", condition="greater_than", threshold=0.9, baseline_aggregation="max"
+                name="passrate", condition="greater_equal", threshold=0.9, baseline_aggregation="max",
             ),
             "geomean": RegressionPolicy(
-                name="geomean", condition="greater_than", threshold=0.95,baseline_aggregation="max"
+                name="geomean", condition="greater_equal", threshold=0.95,baseline_aggregation="max",
             ),
             "compression_ratio": RegressionPolicy(
-                name="compression_ratio", condition="greater_than", threshold=0.9, baseline_aggregation="max"
+                name="compression_ratio", condition="greater_equal", threshold=0.9, baseline_aggregation="max",
             ),
         },
         notification_config={
diff --git a/aws/lambda/benchmark_regression_summary_report/common/config_model.py b/aws/lambda/benchmark_regression_summary_report/common/config_model.py
index 66e9ed04ed..59c2f86d9a 100644
--- a/aws/lambda/benchmark_regression_summary_report/common/config_model.py
+++ b/aws/lambda/benchmark_regression_summary_report/common/config_model.py
@@ -98,12 +98,12 @@ def baseline_timedelta(self) -> timedelta:
 class RegressionPolicy:
     """
     Defines the policy for a given metric.
-    - "greater_than": higher is better; violation if new value < baseline * threshold
-    - "less_than": lower is better; violation if new value > baseline * threshold
+    - the new value must be {x} baseline * threshold:
+    - "greater_than": higher is better; the new value must be strictly greater than baseline * threshold
+    - "less_than": lower is better; the new value must be strictly lower than baseline * threshold
     - "equal_to": new value should be ~= baseline * threshold within rel_tol
-    - "greater_equal": higher is better; violation if new value <= baseline * threshold
-    - "less_equal": lower is better; violation if new value >= baseline * threshold
-
+    - "greater_equal": higher is better; the new value must be greater than or equal to baseline * threshold
+    - "less_equal": lower is better; the new value must be less than or equal to baseline * threshold
     """
     name: str
     condition: Literal["greater_than", "less_than", "equal_to","greater_equal","less_equal"]
     threshold: float
diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py
index 29c2596e0e..adb98d2d9c 100644
--- a/aws/lambda/benchmark_regression_summary_report/lambda_function.py
+++ b/aws/lambda/benchmark_regression_summary_report/lambda_function.py
@@ -88,7 +88,6 @@ def process(
 ):
     # ensure each thread has its own clickhouse client. clickhouse client
     # is not thread-safe.
-    logger.info("here")
     if cc is None:
         tlocal = threading.local()
         if not hasattr(tlocal, "cc") or tlocal.cc is None:
@@ -101,8 +100,6 @@ def process(
             else:
                 tlocal.cc = get_clickhouse_client_environment()
             cc = tlocal.cc
-    logger.info("i'm here")
-
     try:
         config = get_benchmark_regression_config(config_id)
         logger.info("found config for config_id %s",config_id)
@@ -215,7 +212,7 @@ def _fetch_from_benchmark_ts_api(
     )

     url = source.api_query_url
-    logger.info("trying to call %s",url)
+    logger.info("[%s]trying to call %s, with query %s",config_id, url,query)
     try:
         resp: BenchmarkTimeSeriesApiResponse = (
             BenchmarkTimeSeriesApiResponse.from_request(url, query)

From 4bbb803f442cf9317e7500efec15f03664ec5088 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 3 Sep 2025 03:11:19 -0700
Subject: [PATCH 25/27] addid

---
 .../common/regression_utils.py                              | 4 +---
 .../benchmark_regression_summary_report/lambda_function.py  | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py
index 978fd8b71e..10091192b9 100644
--- a/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py
+++ b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py
@@ -83,7 +83,6 @@ def detect_regressions_with_policies(
         is_any_regression = False

         for key in sorted(dp_map.keys()):
-            logger.info("key: %s", key)
             cur_item = dp_map.get(key)
             gi = cur_item["group_info"] if cur_item else {}
             points: List[Any] = cur_item["values"] if cur_item else []
@@ -101,7 +100,6 @@ def detect_regressions_with_policies(
                     )
                 )
                 continue
-            logger.info("base_item for keys(%s):\n%s ",key, pprint.pformat(base_item))
             policy = self._resolve_policy(metric_policies, gi.get("metric", ""))
             if not policy:
                 logger.warning("No policy for %s", gi)
@@ -141,7 +139,7 @@ def detect_regressions_with_policies(
             results.append(
                 PerGroupResult(
                     group_info=gi,
-                    baseline=baseline_value,
+                    baseline= baseline_value["value"],
                     points=enriched_points,
                     label=label,
                     policy=policy,
diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py
index adb98d2d9c..4c8a0f5863 100644
--- a/aws/lambda/benchmark_regression_summary_report/lambda_function.py
+++ b/aws/lambda/benchmark_regression_summary_report/lambda_function.py
@@ -212,7 +212,7 @@ def _fetch_from_benchmark_ts_api(
     )

     url = source.api_query_url
-    logger.info("[%s]trying to call %s, with query %s",config_id, url,query)
+    logger.info("[%s]trying to call %s, with query\n %s",config_id, url,query)
     try:
         resp: BenchmarkTimeSeriesApiResponse = (
             BenchmarkTimeSeriesApiResponse.from_request(url, query)

From c193c8eb71b14b49e17b72913b87a4982df71658 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 3 Sep 2025 13:11:18 -0700
Subject: [PATCH 26/27] addid

---
 .../params.json                               | 17 ------
 .../query.sql                                 | 61 -------------------
 2 files changed, 78 deletions(-)
 delete mode 100644 torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json
 delete mode 100644 torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql

diff --git a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json
deleted file mode 100644
index 95f00e1501..0000000000
--- a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/params.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-  "params": {
"branches": "Array(String)", - "commits": "Array(String)", - "compilers": "Array(String)", - "device": "String", - "arch": "String", - "dtype": "String", - "granularity": "String", - "mode": "String", - "startTime": "DateTime64(3)", - "stopTime": "DateTime64(3)", - "suites": "Array(String)", - "workflowId": "Int64" - }, - "tests": [] -} diff --git a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql b/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql deleted file mode 100644 index bf233f6ff8..0000000000 --- a/torchci/clickhouse_queries/compilers_benchmark_performance_v2/query.sql +++ /dev/null @@ -1,61 +0,0 @@ -WITH benchmarks AS ( - SELECT - workflow_id, - job_id, - suite, - model_name, - metric_name, - value, - metric_extra_info AS extra_info, - DATE_TRUNC({granularity:String}, fromUnixTimestamp(timestamp)) AS granularity_bucket, - benchmark_dtype, - benchmark_mode, - device, - arch, - replaceOne(head_branch, 'refs/heads/', '') AS head_branch, - - benchmark_extra_info['output'] AS output, - - REGEXP_REPLACE( - benchmark_extra_info['output'], - CONCAT('_', suite, '_', {dtype:String}, '_', {mode:String}, '_', {device:String}, '_.*'), - '' - ) AS temp - - FROM benchmark.oss_ci_benchmark_torchinductor - WHERE - timestamp >= toUnixTimestamp({startTime:DateTime64(3)}) AND - timestamp < toUnixTimestamp({stopTime:DateTime64(3)}) AND - (has({commits:Array(String)}, head_sha) OR empty({commits:Array(String)})) AND - (has({suites:Array(String)}, suite) OR empty({suites:Array(String)})) AND - (workflow_id = {workflowId:Int64} OR {workflowId:Int64} = 0) -) - -SELECT - workflow_id, - job_id, - REGEXP_REPLACE(temp, '.*/', '') AS backend, - suite, - model_name AS model, - metric_name AS metric, - value, - output, - granularity_bucket, - extra_info, -FROM benchmarks -WHERE - (has({branches:Array(String)}, head_branch) OR empty({branches:Array(String)})) - AND ( - ( - ({arch:String} = '' OR {arch:String} = 'a100') AND - output LIKE CONCAT('%\_', {dtype:String}, '\_', {mode:String}, '\_', {device:String}, '\_%') - ) OR ( - {arch:String} != '' AND - output LIKE CONCAT('%\_', {dtype:String}, '\_', {mode:String}, '\_', {device:String}, '\_', {arch:String}, '\_%') - ) OR ( - benchmark_dtype = {dtype:String} AND - benchmark_mode = {mode:String} AND - device = {device:String} AND - arch = {arch:String} - ) - ); From 7d882c530180de90bab6828199c28f91a6f62fb2 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 3 Sep 2025 13:12:09 -0700 Subject: [PATCH 27/27] addid --- torchci/lib/benchmark/compilerUtils.ts | 6 - .../pages/api/benchmark/get_time_series.ts | 190 ------------------ 2 files changed, 196 deletions(-) diff --git a/torchci/lib/benchmark/compilerUtils.ts b/torchci/lib/benchmark/compilerUtils.ts index 6d3e0902d9..00212177b3 100644 --- a/torchci/lib/benchmark/compilerUtils.ts +++ b/torchci/lib/benchmark/compilerUtils.ts @@ -100,10 +100,7 @@ export function computePassrate( const [bucket, workflowId, suite, compiler] = key.split("+"); passrate.push({ metric: "passrate", -<<<<<<< HEAD value: p, -======= ->>>>>>> 556c0ef04 (addid) granularity_bucket: bucket, workflow_id: workflowId, suite: suite, @@ -169,10 +166,7 @@ export function computeGeomean( const [bucket, workflowId, suite, compiler] = key.split("+"); returnedGeomean.push({ metric: "geomean", -<<<<<<< HEAD value: Number(gm), -======= ->>>>>>> 556c0ef04 (addid) granularity_bucket: bucket, workflow_id: workflowId, suite: suite, diff --git a/torchci/pages/api/benchmark/get_time_series.ts 
index a51a4f77cd..ce069f5590 100644
--- a/torchci/pages/api/benchmark/get_time_series.ts
+++ b/torchci/pages/api/benchmark/get_time_series.ts
@@ -1,28 +1,6 @@
-<<<<<<< HEAD
 import { getCompilerBenchmarkData } from "lib/benchmark/api_helper/compilers/precompute";
 import { readApiGetParams } from "lib/benchmark/api_helper/utils";
 import type { NextApiRequest, NextApiResponse } from "next";
-=======
-import {
-  computeGeomean,
-  computePassrate,
-  computePeakMemoryUsage,
-  convertToCompilerPerformanceData,
-  getPassingModels,
-} from "lib/benchmark/compilerUtils";
-import { queryClickhouseSaved } from "lib/clickhouse";
-import type { NextApiRequest, NextApiResponse } from "next";
-import { getNestedField } from "./group_data";
-
-type GroupInfo = Record;
-type Subgroup = { group_Info: GroupInfo; data: T[] };
-type GroupedItem = {
-  group_Info: GroupInfo;
-  rows: Record>;
-};
-type Params = Record;
-const BENCNMARK_TABLE_NAME = "compilers_benchmark_performance_v2";
->>>>>>> 556c0ef04 (addid)

 /**
  * API Route: /api/benchmark/get_time_series
@@ -83,171 +61,3 @@ async function getBenmarkTimeSeriesData(
       throw new Error(`Unsupported request_name: ${request_name}`);
   }
 }
-<<<<<<< HEAD
-=======
-
-// Utility to extract params from either GET or POST
-// it accepts both ?parameters= and POST with JSON body
-function readParams(req: NextApiRequest): Params {
-  // 1) If POST with parsed JSON body
-  if (req.method === "POST" && req.body && typeof req.body === "object") {
-    return req.body as Params;
-  }
-
-  // 2) If POST with raw string body
-  if (
-    req.method === "POST" &&
-    typeof req.body === "string" &&
-    req.body.trim()
-  ) {
-    try {
-      return JSON.parse(req.body) as Params;
-    } catch {}
-  }
-
-  // 3) If GET with ?parameters=
-  const raw = req.query.parameters as string | undefined;
-  if (raw) {
-    try {
-      return JSON.parse(raw) as Params;
-    } catch {}
-  }
-
-  // 4) Fallback: use query params directly
-  const q: Params = {};
-  Object.entries(req.query).forEach(([k, v]) => {
-    if (k !== "parameters") q[k] = Array.isArray(v) ? v[0] : v;
-  });
-  return q;
-}
-
-/**
- * Group data by `keys`, and inside each group further subgroup by `subGroupKeys`.
- */
-function groupBy(
-  data: T[],
-  keys: string[],
-  subGroupKeys: string[] = []
-): GroupedItem[] {
-  const groups = new Map>>();
-  const mainInfo = new Map();
-
-  for (const row of data as any[]) {
-    // build main group key
-    const mainKeyParts = keys.map((k) => String(getNestedField(row, k)));
-    const mainKey = mainKeyParts.join("|");
-    if (!mainInfo.has(mainKey)) {
-      const info: GroupInfo = {};
-      keys.forEach((k, i) => (info[k] = mainKeyParts[i]));
-      mainInfo.set(mainKey, info);
-    }
-
-    // build subgroup key
-    const subKeyParts =
-      subGroupKeys.length > 0
-        ? subGroupKeys.map((k) => String(getNestedField(row, k)))
-        : ["__ALL__"]; // default single subgroup if none provided
-    const subKey = subKeyParts.join("|");
-    const subInfo: GroupInfo = {};
-
-    subGroupKeys.forEach((k, i) => (subInfo[k] = subKeyParts[i]));
-
-    if (!groups.has(mainKey)) groups.set(mainKey, new Map());
-    const subMap = groups.get(mainKey)!;
-
-    if (!subMap.has(subKey)) {
-      subMap.set(subKey, { group_Info: subInfo, data: [] });
-    }
-    subMap.get(subKey)!.data.push(row as T);
-  }
-
-  // build result array
-  const result: GroupedItem[] = [];
-  for (const [mainKey, subMap] of groups.entries()) {
-    const rowsObj = Object.fromEntries(subMap.entries());
-    result.push({
-      group_Info: mainInfo.get(mainKey)!,
-      rows: rowsObj,
-    });
-  }
-  return result;
-}
-
-async function getCompilerBenchmarkData(inputparams: any) {
-  const start = Date.now();
-  const rows = await queryClickhouseSaved(BENCNMARK_TABLE_NAME, inputparams);
-  const end = Date.now();
-  const result = toPrecomputeCompiler(rows, inputparams, "time_series");
-  console.log("time to get data", end - start);
-  return result;
-}
-
-function toPrecomputeCompiler(
-  rawData: any[],
-  inputparams: any,
-  type: string = "time_series"
-) {
-  const data = convertToCompilerPerformanceData(rawData);
-  const models = getPassingModels(data);
-
-  const passrate = computePassrate(data, models);
-  const geomean = computeGeomean(data, models);
-  const peakMemory = computePeakMemoryUsage(data, models);
-
-  const all_data = [passrate, geomean, peakMemory].flat();
-
-  all_data.map((row) => {
-    row["dtype"] = inputparams["dtype"];
-    row["arch"] = inputparams["arch"];
-    row["device"] = inputparams["device"];
-    row["mode"] = inputparams["mode"];
-  });
-
-  let res: any[] = [];
-  switch (type) {
-    case "time_series":
-      // grouping data by comipler, device, arch, dtype, suite, metric, mode
-      // then sorted it with granularity_bucket in ascending order
-      const tsd = groupBy(
-        all_data,
-        ["dtype", "arch", "device", "suite", "compiler", "metric", "mode"],
-        ["workflow_id"]
-      );
-      res = tsd.map((group) => {
-        const group_info = group.group_Info;
-        const group_data = group.rows;
-
-        // no need for the group_info for subgroup, directly get the data
-        const ts_list = Object.values(group_data)
-          .filter((item) => item.data.length > 0)
-          .map((item) => item.data[0])
-          .sort(
-            (a, b) =>
-              new Date(a.granularity_bucket).getTime() -
-              new Date(b.granularity_bucket).getTime()
-          );
-        return {
-          group_info,
-          num_of_dp: ts_list.length,
-          result: ts_list,
-        };
-      });
-      return res;
-    case "table":
-      res = groupBy(
-        all_data,
-        [
-          "dtype",
-          "arch",
-          "device",
-          "mode",
-          "workflow_id",
-          "granularity_bucket",
-        ],
-        ["metric", "compiler"]
-      );
-  }
-
-  return res;
-}
->>>>>>> 556c0ef04 (addid)
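As a closing illustration of how the baseline_aggregation modes from PATCH 23 collapse a window of baseline points into the single value the policy compares against, here is a small self-contained sketch. The percentile helper uses linear interpolation like the patched helper; the function names and the sample series below are assumptions made for this example, not code from the repository.

    import math
    import statistics


    def percentile(values: list[float], q: float) -> float:
        # Linear interpolation between the two closest ranks.
        v = sorted(values)
        k = (len(v) - 1) * q
        f = math.floor(k)
        c = math.ceil(k)
        if f == c:
            return v[int(k)]
        return v[f] + (v[c] - v[f]) * (k - f)


    def aggregate_baseline(values: list[float], mode: str = "max") -> float:
        # Collapse a window of baseline points into one number, per the chosen mode.
        if mode == "mean":
            return statistics.fmean(values)
        if mode == "max":
            return max(values)
        if mode == "min":
            return min(values)
        if mode == "latest":
            return values[-1]
        if mode == "earliest":
            return values[0]
        if mode in ("p50", "p90", "p95"):
            return percentile(values, int(mode[1:]) / 100)
        raise ValueError(f"Unknown mode: {mode}")


    baseline_window = [0.93, 0.95, 0.94]  # hypothetical passrate values in the baseline range
    print(aggregate_baseline(baseline_window))         # 0.95 with the default "max"
    print(aggregate_baseline(baseline_window, "p50"))  # 0.94

The aggregated number is what feeds the baseline argument of the policy check, so a "max" aggregation makes the comparison strictest for higher-is-better metrics such as passrate and geomean.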