Commit ebe62e6

bench_tools: run and compare benchmark
1 parent 7e56c55 commit ebe62e6

4 files changed: 193 additions, 1 deletion
crates/bench_tools/src/comparison.rs (new file)

Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
use std::fs;
use std::path::PathBuf;

use crate::types::estimates::Estimates;

/// Result of a benchmark comparison.
#[derive(Debug)]
pub struct BenchmarkComparison {
    pub name: String,
    pub change_percentage: f64,
    pub exceeds_limit: bool,
}

/// Loads change estimates from criterion's change directory for a given benchmark.
/// Panics if the change file doesn't exist.
fn load_change_estimates(bench_name: &str) -> Estimates {
    let change_path =
        PathBuf::from("target/criterion").join(bench_name).join("change/estimates.json");

    if !change_path.exists() {
        panic!(
            "Change file not found for benchmark '{}': {}\nThis likely means no baseline exists. \
             Run the benchmark at least once before using run-and-compare.",
            bench_name,
            change_path.display()
        );
    }

    let data = fs::read_to_string(&change_path)
        .unwrap_or_else(|e| panic!("Failed to read {}: {}", change_path.display(), e));

    serde_json::from_str(&data).unwrap_or_else(|e| {
        panic!("Failed to deserialize {}: {}\nContent: {}", change_path.display(), e, data)
    })
}

/// Converts change estimates to percentage.
/// The mean.point_estimate in change/estimates.json represents fractional change
/// (e.g., 0.0706 = 7.06% change).
fn get_regression_percentage(change_estimates: &Estimates) -> f64 {
    change_estimates.mean.point_estimate * 100.0
}

/// Checks all benchmarks for regressions against a specified limit.
/// Returns a vector of comparison results for all benchmarks.
/// If any benchmark exceeds the regression limit, returns an error with detailed results.
/// Panics if change file is not found for any benchmark.
pub fn check_regressions(
    bench_names: &[&str],
    regression_limit: f64,
) -> Result<Vec<BenchmarkComparison>, (String, Vec<BenchmarkComparison>)> {
    let mut results = Vec::new();
    let mut exceeded_count = 0;

    for bench_name in bench_names {
        let change_estimates = load_change_estimates(bench_name);
        let change_percentage = get_regression_percentage(&change_estimates);
        let exceeds_limit = change_percentage > regression_limit;

        if exceeds_limit {
            exceeded_count += 1;
        }

        results.push(BenchmarkComparison {
            name: bench_name.to_string(),
            change_percentage,
            exceeds_limit,
        });
    }

    if exceeded_count > 0 {
        let error_msg = format!("{} benchmark(s) exceeded regression threshold!", exceeded_count);
        Err((error_msg, results))
    } else {
        Ok(results)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_get_regression_percentage() {
        let estimates = Estimates {
            mean: crate::types::estimates::Stat {
                point_estimate: 0.0706,
                standard_error: 0.01,
                confidence_interval: crate::types::estimates::ConfidenceInterval {
                    confidence_level: 0.95,
                    lower_bound: 0.05,
                    upper_bound: 0.09,
                },
            },
            median: crate::types::estimates::Stat {
                point_estimate: 0.03,
                standard_error: 0.01,
                confidence_interval: crate::types::estimates::ConfidenceInterval {
                    confidence_level: 0.95,
                    lower_bound: 0.01,
                    upper_bound: 0.05,
                },
            },
            std_dev: None,
            median_abs_dev: None,
            slope: None,
        };

        let percentage = get_regression_percentage(&estimates);
        assert!((percentage - 7.06).abs() < 0.01);
    }
}
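
For context on the module above: check_regressions reads target/criterion/<bench>/change/estimates.json (written by criterion once a previous run exists as a baseline) and interprets mean.point_estimate as fractional change, so 0.0706 becomes 7.06%. Since lib.rs below exports the module publicly, it can also be driven outside the runner. A minimal sketch of such a standalone caller follows; the benchmark names, the 5.0% limit, and the main() wrapper are purely illustrative and not part of this commit:

// Hypothetical standalone caller of the comparison module (illustrative only).
fn main() {
    // Placeholder criterion benchmark names; in practice these come from the package's configs.
    let bench_names = ["example_bench_a", "example_bench_b"];
    match bench_tools::comparison::check_regressions(&bench_names, 5.0) {
        Ok(results) => {
            println!("{} benchmark(s) within the 5.0% limit", results.len());
        }
        Err((msg, results)) => {
            // Print only the offenders before failing; runner.rs below prints all results.
            for r in results.iter().filter(|r| r.exceeds_limit) {
                eprintln!("{}: {:+.2}%", r.name, r.change_percentage);
            }
            panic!("{}", msg);
        }
    }
}

Returning Err((String, Vec<BenchmarkComparison>)) rather than panicking inside check_regressions is what lets callers print a full per-benchmark breakdown before aborting, which is exactly how run_and_compare_benchmarks in runner.rs uses it.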

crates/bench_tools/src/lib.rs

Lines changed: 1 addition & 0 deletions
@@ -1,5 +1,6 @@
 #[cfg(test)]
 pub(crate) mod benches;
+pub mod comparison;
 pub mod gcs;
 pub mod runner;
 pub mod types;

crates/bench_tools/src/main.rs

Lines changed: 31 additions & 0 deletions
@@ -30,6 +30,22 @@ enum Commands {
         #[arg(long)]
         input_dir: Option<String>,
     },
+    /// Run benchmarks, compare to previous run, and fail if regression exceeds limit.
+    RunAndCompare {
+        /// Package name to run benchmarks for.
+        #[arg(short, long)]
+        package: String,
+        /// Output directory for results.
+        #[arg(short, long)]
+        out: String,
+        /// Optional: Local directory containing input files. If not provided, inputs will be
+        /// downloaded from GCS for benchmarks that require them.
+        #[arg(long)]
+        input_dir: Option<String>,
+        /// Maximum acceptable regression percentage (e.g., 5.0 for 5%).
+        #[arg(long)]
+        regression_limit: f64,
+    },
     /// List benchmarks for a package.
     List {
         /// Package name to list benchmarks for. If not provided, lists all benchmarks.
@@ -60,6 +76,21 @@ async fn main() {
 
             bench_tools::runner::run_benchmarks(&benchmarks, input_dir.as_deref(), &out).await;
         }
+        Commands::RunAndCompare { package, out, input_dir, regression_limit } => {
+            let benchmarks = find_benchmarks_by_package(&package);
+
+            if benchmarks.is_empty() {
+                panic!("No benchmarks found for package: {}", package);
+            }
+
+            bench_tools::runner::run_and_compare_benchmarks(
+                &benchmarks,
+                input_dir.as_deref(),
+                &out,
+                regression_limit,
+            )
+            .await;
+        }
         Commands::List { package } => match package {
             Some(package_name) => {
                 let benchmarks = find_benchmarks_by_package(&package_name);
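
Given the clap derive above, the new subcommand would presumably be invoked as:

    bench_tools run-and-compare --package <package> --out <dir> --regression-limit 5.0

where the binary name is assumed from the crate name, --input-dir remains optional, and the kebab-case subcommand and flag names follow clap's default derive renaming ("run-and-compare" also matches the hint in the comparison module's panic message).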

crates/bench_tools/src/runner.rs

Lines changed: 49 additions & 1 deletion
@@ -43,7 +43,11 @@ async fn prepare_inputs(bench: &BenchmarkConfig, input_dir: Option<&str>) {
     } else {
         gcs::download_inputs(bench.name, &benchmark_input_dir).await;
         if !benchmark_input_dir.exists() {
-            panic!("Failed to download inputs for {}: {}", bench.name, benchmark_input_dir.display());
+            panic!(
+                "Failed to download inputs for {}: {}",
+                bench.name,
+                benchmark_input_dir.display()
+            );
         }
     }
 }
@@ -116,3 +120,47 @@ pub async fn run_benchmarks(
 
     println!("\n✓ All benchmarks completed! Results saved to: {}", output_dir);
 }
+
+/// Runs benchmarks and compares them against previous results, failing if regression exceeds limit.
+pub async fn run_and_compare_benchmarks(
+    benchmarks: &[&BenchmarkConfig],
+    input_dir: Option<&str>,
+    output_dir: &str,
+    regression_limit: f64,
+) {
+    // Run benchmarks first.
+    run_benchmarks(benchmarks, input_dir, output_dir).await;
+
+    // Collect all criterion benchmark names from configs.
+    let mut bench_names = Vec::new();
+    for bench in benchmarks {
+        bench_names.extend(bench.criterion_benchmark_names.unwrap_or(&[bench.name]));
+    }
+
+    println!("\n📊 Checking for performance regressions (limit: {}%):", regression_limit);
+    let regression_result = crate::comparison::check_regressions(&bench_names, regression_limit);
+
+    match regression_result {
+        Ok(_) => {
+            println!("\n✅ All benchmarks passed regression check!");
+        }
+        Err((error_msg, results)) => {
+            // Some benchmarks exceeded the limit - print detailed results.
+            println!("\nBenchmark Results:");
+            for result in results {
+                if result.exceeds_limit {
+                    println!(
+                        " ❌ {}: {:+.2}% (EXCEEDS {:.1}% limit)",
+                        result.name, result.change_percentage, regression_limit
+                    );
+                } else {
+                    println!(
+                        " ✓ {}: {:+.2}% (within {:.1}% limit)",
+                        result.name, result.change_percentage, regression_limit
+                    );
+                }
+            }
+            panic!("\n{}", error_msg);
+        }
+    }
+}
