Use interleaved_bench for run_example

jansel · jansel · commit 0a8d64770406 · 2025-10-15T14:20:26.000-07:00
stack-info: PR: #945, branch: jansel/stack/194
diff --git a/helion/_testing.py b/helion/_testing.py
@@ -19,12 +19,13 @@
 import torch
 from torch.utils._pytree import tree_map
 import triton
-from triton.testing import do_bench
 
 from ._utils import counters
 from .runtime.config import Config
 import helion
 from helion._compat import get_tensor_descriptor_fn_name
+from helion.autotuner.benchmarking import compute_repeat
+from helion.autotuner.benchmarking import interleaved_bench
 from helion.runtime.ref_mode import is_ref_mode_enabled
 
 if TYPE_CHECKING:
@@ -560,11 +561,11 @@ def run_example(
                     t.grad = None
 
     # Benchmark all functions
-    all_times = {
-        name: do_bench(lambda fn=fn: fn(*args))
-        for name, fn in {**kernels, **baselines}.items()
-    }
-
+    all_benchmarks = {**kernels, **baselines}
+    bench_fns = [functools.partial(fn, *args) for fn in all_benchmarks.values()]
+    repeat = compute_repeat(bench_fns[0])
+    timings = interleaved_bench(bench_fns, repeat=repeat, desc="Benchmarking")
+    all_times = dict(zip(all_benchmarks.keys(), timings, strict=True))
     best_baseline_time = min(all_times[name] for name in baselines)  # pyright: ignore[reportArgumentType]
 
     # Print results
diff --git a/helion/autotuner/benchmarking.py b/helion/autotuner/benchmarking.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import functools
+import math
 import statistics
 from typing import Callable
 
@@ -9,6 +10,43 @@
 from .progress_bar import iter_with_progress
 
 
+def compute_repeat(
+    fn: Callable[[], object],
+    *,
+    target_ms: float = 100.0,
+    min_repeat: int = 10,
+    max_repeat: int = 1000,
+    estimate_runs: int = 5,
+) -> int:
+    """
+    Estimate how many repeats fit in ``target_ms``, mirroring Triton's ``do_bench``
+    heuristic: one warm-up call, then the mean of ``estimate_runs`` timed calls.
+    Clamped to ``[min_repeat, max_repeat]``; ``max_repeat`` on an unusable estimate.
+    """
+    di = runtime.driver.active.get_device_interface()  # type: ignore[attr-defined]
+    cache = runtime.driver.active.get_empty_cache_for_benchmark()  # type: ignore[attr-defined]
+
+    # Run fn once untimed and synchronize before the timed estimate below.
+    fn()
+    di.synchronize()
+
+    start_event = di.Event(enable_timing=True)
+    end_event = di.Event(enable_timing=True)
+    start_event.record()
+    for _ in range(estimate_runs):
+        runtime.driver.active.clear_cache(cache)  # type: ignore[attr-defined]
+        fn()
+    end_event.record()
+    di.synchronize()
+
+    estimate_ms = start_event.elapsed_time(end_event) / max(estimate_runs, 1)
+    if not math.isfinite(estimate_ms) or estimate_ms <= 0:
+        return max_repeat  # non-finite or non-positive estimate: use the cap
+
+    repeat = int(target_ms / estimate_ms)
+    return max(min_repeat, min(max_repeat, max(1, repeat)))  # clamp to bounds
+
+
 def interleaved_bench(
     fns: list[Callable[[], object]], *, repeat: int, desc: str | None = None
 ) -> list[float]: