[Benchmarks] Dry run without phony sycl or ur args

PatKamin · PatKamin · commit 508260e65455 · 2025-06-09T13:31:10.000Z
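Create metadata during dry runs for all benchmarks, even when the --sycl,
--ur, and --umf options are not set. Availability checks move out of the
suites' benchmarks() methods into a per-benchmark enabled() hook.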
diff --git a/devops/scripts/benchmarks/benches/base.py b/devops/scripts/benchmarks/benches/base.py
@@ -56,6 +56,11 @@ def explicit_group(self) -> str:
         Can be modified."""
         return ""
 
+    def enabled(self) -> bool:
+        """Returns whether this benchmark is enabled.
+        By default, it returns True, but can be overridden to disable a benchmark."""
+        return True
+
     @abstractmethod
     def setup(self):
         pass
diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py
@@ -115,37 +115,11 @@ def additional_metadata(self) -> dict[str, BenchmarkMetadata]:
             ),
         }
 
-    def enabled_runtimes(self, supported_runtimes=None, extra_runtimes=None):
-        # all runtimes in the RUNTIMES enum
-        runtimes = supported_runtimes or list(RUNTIMES)
-
-        # filter out SYCL_PREVIEW which is not supported by default in all benchmarks
-        runtimes = [r for r in runtimes if r != RUNTIMES.SYCL_PREVIEW]
-
-        if extra_runtimes is not None:
-            runtimes.extend(extra_runtimes)
-
-        # Filter out UR if not available
-        if options.ur is None:
-            runtimes = [r for r in runtimes if r != RUNTIMES.UR]
-
-        # Filter out L0 if cuda backend
-        if options.ur_adapter == "cuda":
-            runtimes = [r for r in runtimes if r != RUNTIMES.LEVEL_ZERO]
-
-        return runtimes
-
     def benchmarks(self) -> list[Benchmark]:
-        if options.sycl is None:
-            return []
-
-        if options.ur_adapter == "hip":
-            return []
-
         benches = []
 
-        # Add SubmitKernel benchmarks using loops
-        for runtime in self.enabled_runtimes(extra_runtimes=[RUNTIMES.SYCL_PREVIEW]):
+        for runtime in list(RUNTIMES):
+            # Add SubmitKernel benchmarks using loops
             for in_order_queue in [0, 1]:
                 for measure_completion in [0, 1]:
                     for use_events in [0, 1]:
@@ -161,21 +135,18 @@ def benchmarks(self) -> list[Benchmark]:
                                 )
                             )
 
-        # Add SinKernelGraph benchmarks
-        for runtime in self.enabled_runtimes():
+            # Add SinKernelGraph benchmarks
             for with_graphs in [0, 1]:
                 for num_kernels in [5, 100]:
                     benches.append(
                         GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels)
                     )
 
-        # Add ULLS benchmarks
-        for runtime in self.enabled_runtimes([RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]):
+            # Add ULLS benchmarks
             benches.append(UllsEmptyKernel(self, runtime, 1000, 256))
             benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1))
 
-        # Add GraphApiSubmitGraph benchmarks
-        for runtime in self.enabled_runtimes():
+            # Add GraphApiSubmitGraph benchmarks
             for in_order_queue in [0, 1]:
                 for num_kernels in [4, 10, 32]:
                     for measure_completion_time in [0, 1]:
@@ -201,24 +172,24 @@ def benchmarks(self) -> list[Benchmark]:
         ]
 
         # Add UR-specific benchmarks
-        if options.ur is not None:
-            benches += [
-                MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 1, 1, 1, 1, 0),
-                MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 0, 1, 1, 1, 0),
-                MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 1, 0),
-                MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 0, 0),
-                MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 0),
-                MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 1),
-                UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256, "Both"),
-                UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256 * 1024, "Both"),
-                UsmBatchMemoryAllocation(self, RUNTIMES.UR, "Device", 128, 256, "Both"),
-                UsmBatchMemoryAllocation(
-                    self, RUNTIMES.UR, "Device", 128, 16 * 1024, "Both"
-                ),
-                UsmBatchMemoryAllocation(
-                    self, RUNTIMES.UR, "Device", 128, 128 * 1024, "Both"
-                ),
-            ]
+        benches += [
+            MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 1, 1, 1, 1, 0),
+            MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 0, 1, 1, 1, 0),
+            MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 1, 0),
+            MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 0, 0),
+            MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 0),
+            MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 1),
+            UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256, "Both"),
+            UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256 * 1024, "Both"),
+            UsmBatchMemoryAllocation(self, RUNTIMES.UR, "Device", 128, 256, "Both"),
+            UsmBatchMemoryAllocation(
+                self, RUNTIMES.UR, "Device", 128, 16 * 1024, "Both"
+            ),
+            UsmBatchMemoryAllocation(
+                self, RUNTIMES.UR, "Device", 128, 128 * 1024, "Both"
+            ),
+        ]
+
         benches += [
             MemcpyExecute(
                 self, RUNTIMES.SYCL_PREVIEW, 4096, 1, 1024, 40, 1, 1, 0, 1, 0
@@ -252,6 +223,13 @@ def __init__(self, bench, name, test):
         self.bench_name = name
         self.test = test
 
+    def enabled(self) -> bool:
+        if options.sycl is None:
+            return False
+        if options.ur_adapter == "hip":
+            return False
+        return True
+
     def bin_args(self) -> list[str]:
         return []
 
@@ -269,6 +247,26 @@ def explicit_group(self):
     def description(self) -> str:
         return ""
 
+    def enabled_runtimes(self, supported_runtimes=None, extra_runtimes=None):
+        # all runtimes in the RUNTIMES enum
+        runtimes = supported_runtimes or list(RUNTIMES)
+
+        # filter out SYCL_PREVIEW which is not supported by default in all benchmarks
+        runtimes = [r for r in runtimes if r != RUNTIMES.SYCL_PREVIEW]
+
+        if extra_runtimes is not None:
+            runtimes.extend(extra_runtimes)
+
+        # Filter out UR if not available
+        if options.ur is None:
+            runtimes = [r for r in runtimes if r != RUNTIMES.UR]
+
+        # Filter out L0 if cuda backend
+        if options.ur_adapter == "cuda":
+            runtimes = [r for r in runtimes if r != RUNTIMES.LEVEL_ZERO]
+
+        return runtimes
+
     def run(self, env_vars) -> list[Result]:
         command = [
             f"{self.benchmark_bin}",
@@ -347,6 +345,15 @@ def __init__(
             bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel"
         )
 
+    def enabled(self) -> bool:
+        if not super().enabled():
+            return False
+        if self.runtime not in self.enabled_runtimes(
+            extra_runtimes=[RUNTIMES.SYCL_PREVIEW]
+        ):
+            return False
+        return True
+
     def get_tags(self):
         return ["submit", "latency", runtime_to_tag_name(self.runtime), "micro"]
 
@@ -631,6 +638,13 @@ def __init__(
             bench, f"multithread_benchmark_{self.runtime.value}", "MemcpyExecute"
         )
 
+    def enabled(self) -> bool:
+        if not super().enabled():
+            return False
+        if self.runtime == RUNTIMES.UR and options.ur is None:
+            return False
+        return True
+
     def extra_env_vars(self) -> dict:
         if not self.useCopyOffload:
             return {"UR_L0_V2_FORCE_DISABLE_COPY_OFFLOAD": "1"}
@@ -709,6 +723,13 @@ def __init__(self, bench, runtime: RUNTIMES, withGraphs, numKernels):
             bench, f"graph_api_benchmark_{runtime.value}", "SinKernelGraph"
         )
 
+    def enabled(self) -> bool:
+        if not super().enabled():
+            return False
+        if self.runtime not in self.enabled_runtimes():
+            return False
+        return True
+
     def explicit_group(self):
         return f"SinKernelGraph, numKernels: {self.numKernels}"
 
@@ -761,6 +782,13 @@ def __init__(
         self.measureCompletionTime = measureCompletionTime
         super().__init__(bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph")
 
+    def enabled(self) -> bool:
+        if not super().enabled():
+            return False
+        if self.runtime not in self.enabled_runtimes():
+            return False
+        return True
+
     def explicit_group(self):
         return f"SubmitGraph, numKernels: {self.numKernels}"
 
@@ -805,6 +833,15 @@ def __init__(self, bench, runtime: RUNTIMES, wgc, wgs):
         self.runtime = runtime
         super().__init__(bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel")
 
+    def enabled(self) -> bool:
+        if not super().enabled():
+            return False
+        if self.runtime not in self.enabled_runtimes(
+            [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]
+        ):
+            return False
+        return True
+
     def explicit_group(self):
         return f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}"
 
@@ -851,6 +888,15 @@ def __init__(
         self.ioq = ioq
         super().__init__(bench, f"ulls_benchmark_{runtime.value}", "KernelSwitch")
 
+    def enabled(self) -> bool:
+        if not super().enabled():
+            return False
+        if self.runtime not in self.enabled_runtimes(
+            [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]
+        ):
+            return False
+        return True
+
     def explicit_group(self):
         return f"KernelSwitch, count: {self.count}, kernelTime: {self.kernelTime}"
 
@@ -890,6 +936,13 @@ def __init__(
             bench, f"api_overhead_benchmark_{runtime.value}", "UsmMemoryAllocation"
         )
 
+    def enabled(self) -> bool:
+        if not super().enabled():
+            return False
+        if self.runtime == RUNTIMES.UR and options.ur is None:
+            return False
+        return True
+
     def get_tags(self):
         return [runtime_to_tag_name(self.runtime), "micro", "latency", "memory"]
 
@@ -948,6 +1001,13 @@ def __init__(
             bench, f"api_overhead_benchmark_{runtime.value}", "UsmBatchMemoryAllocation"
         )
 
+    def enabled(self) -> bool:
+        if not super().enabled():
+            return False
+        if self.runtime == RUNTIMES.UR and options.ur is None:
+            return False
+        return True
+
     def get_tags(self):
         return [runtime_to_tag_name(self.runtime), "micro", "latency", "memory"]
 
diff --git a/devops/scripts/benchmarks/benches/llamacpp.py b/devops/scripts/benchmarks/benches/llamacpp.py
@@ -17,9 +17,6 @@
 
 class LlamaCppBench(Suite):
     def __init__(self, directory):
-        if options.sycl is None:
-            return
-
         self.directory = directory
 
     def name(self) -> str:
@@ -80,12 +77,6 @@ def setup(self):
         )
 
     def benchmarks(self) -> list[Benchmark]:
-        if options.sycl is None:
-            return []
-
-        if options.ur_adapter == "cuda" or options.ur_adapter == "hip":
-            return []
-
         return [LlamaBench(self)]
 
 
@@ -94,6 +85,13 @@ def __init__(self, bench):
         super().__init__(bench.directory, bench)
         self.bench = bench
 
+    def enabled(self):
+        if options.sycl is None:
+            return False
+        if options.ur_adapter == "cuda" or options.ur_adapter == "hip":
+            return False
+        return True
+
     def setup(self):
         self.benchmark_bin = os.path.join(self.bench.build_path, "bin", "llama-bench")
 
diff --git a/devops/scripts/benchmarks/benches/syclbench.py b/devops/scripts/benchmarks/benches/syclbench.py
@@ -14,9 +14,6 @@
 
 class SyclBench(Suite):
     def __init__(self, directory):
-        if options.sycl is None:
-            return
-
         self.directory = directory
         return
 
@@ -67,9 +64,6 @@ def setup(self):
         self.built = True
 
     def benchmarks(self) -> list[Benchmark]:
-        if options.sycl is None:
-            return []
-
         return [
             # Blocked_transform(self), # run time < 1ms
             DagTaskI(self),
@@ -117,6 +111,9 @@ def __init__(self, bench, name, test):
         self.bench_name = name
         self.test = test
 
+    def enabled(self) -> bool:
+        return options.sycl is not None
+
     def bin_args(self) -> list[str]:
         return []
 
diff --git a/devops/scripts/benchmarks/benches/umf.py b/devops/scripts/benchmarks/benches/umf.py
@@ -33,9 +33,6 @@ def setup(self):
         self.built = True
 
     def benchmarks(self) -> list[Benchmark]:
-        if not isUMFAvailable():
-            return []
-
         benches = [
             GBench(self),
             GBenchUmfProxy(self),
@@ -52,8 +49,6 @@ def __init__(self, bench):
 
         self.bench = bench
         self.bench_name = "umf-benchmark"
-        self.oneapi = get_oneapi()
-        self.umf_lib = options.umf + "lib"
 
         self.fragmentation_prefix = "FRAGMENTATION_"
 
@@ -80,6 +75,9 @@ def __init__(self, bench):
     def name(self):
         return self.bench_name
 
+    def enabled(self):
+        return isUMFAvailable()
+
     # --benchmark_format describes stdout output
     # --benchmark_out=<file> and --benchmark_out_format=<format>
     # describe output to a file
@@ -98,6 +96,8 @@ def setup(self):
             print("UMF prefix path not provided")
             return
 
+        self.oneapi = get_oneapi()
+        self.umf_lib = options.umf + "lib"
         self.benchmark_bin = os.path.join(options.umf, "benchmark", self.bench_name)
 
     def is_memory_statistics_included(self, data_row):
diff --git a/devops/scripts/benchmarks/benches/velocity.py b/devops/scripts/benchmarks/benches/velocity.py
diff --git a/devops/scripts/benchmarks/main.py b/devops/scripts/benchmarks/main.py