Skip to content

Commit 49407a2

Browse files
PatKamin and pbalcer committed
[Benchmarks] Dry run without phony sycl or ur args
Create metadata during dry runs for all benchmarks, even when the --sycl, --ur, and --umf options are not set.

Co-authored-by: Piotr Balcer <[email protected]>
1 parent 4d4fde2 commit 49407a2

File tree

7 files changed: 123 additions (+123) and 105 deletions (-105).

devops/scripts/benchmarks/benches/base.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,11 @@ def explicit_group(self) -> str:
5656
Can be modified."""
5757
return ""
5858

59+
def enabled(self) -> bool:
60+
"""Returns whether this benchmark is enabled.
61+
By default, it returns True, but can be overridden to disable a benchmark."""
62+
return True
63+
5964
@abstractmethod
6065
def setup(self):
6166
pass

devops/scripts/benchmarks/benches/compute.py

Lines changed: 86 additions & 69 deletions
Original file line number | Diff line number | Diff line change
@@ -115,37 +115,11 @@ def additional_metadata(self) -> dict[str, BenchmarkMetadata]:
115115
),
116116
}
117117

118-
def enabled_runtimes(self, supported_runtimes=None, extra_runtimes=None):
119-
# all runtimes in the RUNTIMES enum
120-
runtimes = supported_runtimes or list(RUNTIMES)
121-
122-
# filter out SYCL_PREVIEW which is not supported by default in all benchmarks
123-
runtimes = [r for r in runtimes if r != RUNTIMES.SYCL_PREVIEW]
124-
125-
if extra_runtimes is not None:
126-
runtimes.extend(extra_runtimes)
127-
128-
# Filter out UR if not available
129-
if options.ur is None:
130-
runtimes = [r for r in runtimes if r != RUNTIMES.UR]
131-
132-
# Filter out L0 if cuda backend
133-
if options.ur_adapter == "cuda":
134-
runtimes = [r for r in runtimes if r != RUNTIMES.LEVEL_ZERO]
135-
136-
return runtimes
137-
138118
def benchmarks(self) -> list[Benchmark]:
139-
if options.sycl is None:
140-
return []
141-
142-
if options.ur_adapter == "hip":
143-
return []
144-
145119
benches = []
146120

147-
# Add SubmitKernel benchmarks using loops
148-
for runtime in self.enabled_runtimes(extra_runtimes=[RUNTIMES.SYCL_PREVIEW]):
121+
for runtime in list(RUNTIMES):
122+
# Add SubmitKernel benchmarks using loops
149123
for in_order_queue in [0, 1]:
150124
for measure_completion in [0, 1]:
151125
for use_events in [0, 1]:
@@ -161,21 +135,18 @@ def benchmarks(self) -> list[Benchmark]:
161135
)
162136
)
163137

164-
# Add SinKernelGraph benchmarks
165-
for runtime in self.enabled_runtimes():
138+
# Add SinKernelGraph benchmarks
166139
for with_graphs in [0, 1]:
167140
for num_kernels in [5, 100]:
168141
benches.append(
169142
GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels)
170143
)
171144

172-
# Add ULLS benchmarks
173-
for runtime in self.enabled_runtimes([RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]):
145+
# Add ULLS benchmarks
174146
benches.append(UllsEmptyKernel(self, runtime, 1000, 256))
175147
benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1))
176148

177-
# Add GraphApiSubmitGraph benchmarks
178-
for runtime in self.enabled_runtimes():
149+
# Add GraphApiSubmitGraph benchmarks
179150
for in_order_queue in [0, 1]:
180151
for num_kernels in [4, 10, 32]:
181152
for measure_completion_time in [0, 1]:
@@ -201,24 +172,24 @@ def benchmarks(self) -> list[Benchmark]:
201172
]
202173

203174
# Add UR-specific benchmarks
204-
if options.ur is not None:
205-
benches += [
206-
MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 1, 1, 1, 1, 0),
207-
MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 0, 1, 1, 1, 0),
208-
MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 1, 0),
209-
MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 0, 0),
210-
MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 0),
211-
MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 1),
212-
UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256, "Both"),
213-
UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256 * 1024, "Both"),
214-
UsmBatchMemoryAllocation(self, RUNTIMES.UR, "Device", 128, 256, "Both"),
215-
UsmBatchMemoryAllocation(
216-
self, RUNTIMES.UR, "Device", 128, 16 * 1024, "Both"
217-
),
218-
UsmBatchMemoryAllocation(
219-
self, RUNTIMES.UR, "Device", 128, 128 * 1024, "Both"
220-
),
221-
]
175+
benches += [
176+
MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 1, 1, 1, 1, 0),
177+
MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 0, 1, 1, 1, 0),
178+
MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 1, 0),
179+
MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 0, 0),
180+
MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 0),
181+
MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 1),
182+
UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256, "Both"),
183+
UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256 * 1024, "Both"),
184+
UsmBatchMemoryAllocation(self, RUNTIMES.UR, "Device", 128, 256, "Both"),
185+
UsmBatchMemoryAllocation(
186+
self, RUNTIMES.UR, "Device", 128, 16 * 1024, "Both"
187+
),
188+
UsmBatchMemoryAllocation(
189+
self, RUNTIMES.UR, "Device", 128, 128 * 1024, "Both"
190+
),
191+
]
192+
222193
benches += [
223194
MemcpyExecute(
224195
self, RUNTIMES.SYCL_PREVIEW, 4096, 1, 1024, 40, 1, 1, 0, 1, 0
@@ -246,11 +217,44 @@ def parse_unit_type(compute_unit):
246217

247218

248219
class ComputeBenchmark(Benchmark):
249-
def __init__(self, bench, name, test):
220+
def __init__(self, bench, name, test, runtime: RUNTIMES = None):
250221
super().__init__(bench.directory, bench)
251222
self.bench = bench
252223
self.bench_name = name
253224
self.test = test
225+
self.runtime = runtime
226+
227+
def supported_runtimes(self) -> list[RUNTIMES]:
228+
"""Base runtimes supported by this benchmark, can be overridden."""
229+
# By default, support all runtimes except SYCL_PREVIEW
230+
return [r for r in RUNTIMES if r != RUNTIMES.SYCL_PREVIEW]
231+
232+
def enabled_runtimes(self) -> list[RUNTIMES]:
233+
"""Runtimes available given the current configuration."""
234+
# Start with all supported runtimes and apply configuration filters
235+
runtimes = self.supported_runtimes()
236+
237+
# Remove UR if not available
238+
if options.ur is None:
239+
runtimes = [r for r in runtimes if r != RUNTIMES.UR]
240+
241+
# Remove Level Zero if using CUDA backend
242+
if options.ur_adapter == "cuda":
243+
runtimes = [r for r in runtimes if r != RUNTIMES.LEVEL_ZERO]
244+
245+
return runtimes
246+
247+
def enabled(self) -> bool:
248+
# SYCL is required for all benchmarks
249+
if options.sycl is None:
250+
return False
251+
252+
# HIP adapter is not supported
253+
if options.ur_adapter == "hip":
254+
return False
255+
256+
# Check if the specific runtime is enabled (or no specific runtime required)
257+
return self.runtime is None or self.runtime in self.enabled_runtimes()
254258

255259
def bin_args(self) -> list[str]:
256260
return []
@@ -338,15 +342,17 @@ def __init__(
338342
KernelExecTime=1,
339343
):
340344
self.ioq = ioq
341-
self.runtime = runtime
342345
self.MeasureCompletion = MeasureCompletion
343346
self.UseEvents = UseEvents
344347
self.KernelExecTime = KernelExecTime
345348
self.NumKernels = 10
346349
super().__init__(
347-
bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel"
350+
bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel", runtime
348351
)
349352

353+
def supported_runtimes(self) -> list[RUNTIMES]:
354+
return super().supported_runtimes() + [RUNTIMES.SYCL_PREVIEW]
355+
350356
def get_tags(self):
351357
return ["submit", "latency", runtime_to_tag_name(self.runtime), "micro"]
352358

@@ -617,7 +623,6 @@ def __init__(
617623
useCopyOffload,
618624
useBarrier,
619625
):
620-
self.runtime = runtime
621626
self.numOpsPerThread = numOpsPerThread
622627
self.numThreads = numThreads
623628
self.allocSize = allocSize
@@ -628,7 +633,7 @@ def __init__(
628633
self.useCopyOffload = useCopyOffload
629634
self.useBarrier = useBarrier
630635
super().__init__(
631-
bench, f"multithread_benchmark_{self.runtime.value}", "MemcpyExecute"
636+
bench, f"multithread_benchmark_{runtime.value}", "MemcpyExecute", runtime
632637
)
633638

634639
def extra_env_vars(self) -> dict:
@@ -704,9 +709,8 @@ class GraphApiSinKernelGraph(ComputeBenchmark):
704709
def __init__(self, bench, runtime: RUNTIMES, withGraphs, numKernels):
705710
self.withGraphs = withGraphs
706711
self.numKernels = numKernels
707-
self.runtime = runtime
708712
super().__init__(
709-
bench, f"graph_api_benchmark_{runtime.value}", "SinKernelGraph"
713+
bench, f"graph_api_benchmark_{runtime.value}", "SinKernelGraph", runtime
710714
)
711715

712716
def explicit_group(self):
@@ -757,9 +761,10 @@ def __init__(
757761
):
758762
self.inOrderQueue = inOrderQueue
759763
self.numKernels = numKernels
760-
self.runtime = runtime
761764
self.measureCompletionTime = measureCompletionTime
762-
super().__init__(bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph")
765+
super().__init__(
766+
bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph", runtime
767+
)
763768

764769
def explicit_group(self):
765770
return f"SubmitGraph, numKernels: {self.numKernels}"
@@ -802,8 +807,12 @@ class UllsEmptyKernel(ComputeBenchmark):
802807
def __init__(self, bench, runtime: RUNTIMES, wgc, wgs):
803808
self.wgc = wgc
804809
self.wgs = wgs
805-
self.runtime = runtime
806-
super().__init__(bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel")
810+
super().__init__(
811+
bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel", runtime
812+
)
813+
814+
def supported_runtimes(self) -> list[RUNTIMES]:
815+
return [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]
807816

808817
def explicit_group(self):
809818
return f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}"
@@ -847,9 +856,13 @@ def __init__(
847856
self.barrier = barrier
848857
self.hostVisible = hostVisible
849858
self.ctrBasedEvents = ctrBasedEvents
850-
self.runtime = runtime
851859
self.ioq = ioq
852-
super().__init__(bench, f"ulls_benchmark_{runtime.value}", "KernelSwitch")
860+
super().__init__(
861+
bench, f"ulls_benchmark_{runtime.value}", "KernelSwitch", runtime
862+
)
863+
864+
def supported_runtimes(self):
865+
return [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]
853866

854867
def explicit_group(self):
855868
return f"KernelSwitch, count: {self.count}, kernelTime: {self.kernelTime}"
@@ -882,12 +895,14 @@ class UsmMemoryAllocation(ComputeBenchmark):
882895
def __init__(
883896
self, bench, runtime: RUNTIMES, usm_memory_placement, size, measure_mode
884897
):
885-
self.runtime = runtime
886898
self.usm_memory_placement = usm_memory_placement
887899
self.size = size
888900
self.measure_mode = measure_mode
889901
super().__init__(
890-
bench, f"api_overhead_benchmark_{runtime.value}", "UsmMemoryAllocation"
902+
bench,
903+
f"api_overhead_benchmark_{runtime.value}",
904+
"UsmMemoryAllocation",
905+
runtime,
891906
)
892907

893908
def get_tags(self):
@@ -939,13 +954,15 @@ def __init__(
939954
size,
940955
measure_mode,
941956
):
942-
self.runtime = runtime
943957
self.usm_memory_placement = usm_memory_placement
944958
self.allocation_count = allocation_count
945959
self.size = size
946960
self.measure_mode = measure_mode
947961
super().__init__(
948-
bench, f"api_overhead_benchmark_{runtime.value}", "UsmBatchMemoryAllocation"
962+
bench,
963+
f"api_overhead_benchmark_{runtime.value}",
964+
"UsmBatchMemoryAllocation",
965+
runtime,
949966
)
950967

951968
def get_tags(self):

devops/scripts/benchmarks/benches/llamacpp.py

Lines changed: 7 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -17,9 +17,6 @@
1717

1818
class LlamaCppBench(Suite):
1919
def __init__(self, directory):
20-
if options.sycl is None:
21-
return
22-
2320
self.directory = directory
2421

2522
def name(self) -> str:
@@ -80,12 +77,6 @@ def setup(self):
8077
)
8178

8279
def benchmarks(self) -> list[Benchmark]:
83-
if options.sycl is None:
84-
return []
85-
86-
if options.ur_adapter == "cuda" or options.ur_adapter == "hip":
87-
return []
88-
8980
return [LlamaBench(self)]
9081

9182

@@ -94,6 +85,13 @@ def __init__(self, bench):
9485
super().__init__(bench.directory, bench)
9586
self.bench = bench
9687

88+
def enabled(self):
89+
if options.sycl is None:
90+
return False
91+
if options.ur_adapter == "cuda" or options.ur_adapter == "hip":
92+
return False
93+
return True
94+
9795
def setup(self):
9896
self.benchmark_bin = os.path.join(self.bench.build_path, "bin", "llama-bench")
9997

devops/scripts/benchmarks/benches/syclbench.py

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -14,9 +14,6 @@
1414

1515
class SyclBench(Suite):
1616
def __init__(self, directory):
17-
if options.sycl is None:
18-
return
19-
2017
self.directory = directory
2118
return
2219

@@ -67,9 +64,6 @@ def setup(self):
6764
self.built = True
6865

6966
def benchmarks(self) -> list[Benchmark]:
70-
if options.sycl is None:
71-
return []
72-
7367
return [
7468
# Blocked_transform(self), # run time < 1ms
7569
DagTaskI(self),
@@ -117,6 +111,9 @@ def __init__(self, bench, name, test):
117111
self.bench_name = name
118112
self.test = test
119113

114+
def enabled(self) -> bool:
115+
return options.sycl is not None
116+
120117
def bin_args(self) -> list[str]:
121118
return []
122119

devops/scripts/benchmarks/benches/umf.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -33,9 +33,6 @@ def setup(self):
3333
self.built = True
3434

3535
def benchmarks(self) -> list[Benchmark]:
36-
if not isUMFAvailable():
37-
return []
38-
3936
benches = [
4037
GBench(self),
4138
GBenchUmfProxy(self),
@@ -52,8 +49,6 @@ def __init__(self, bench):
5249

5350
self.bench = bench
5451
self.bench_name = "umf-benchmark"
55-
self.oneapi = get_oneapi()
56-
self.umf_lib = options.umf + "lib"
5752

5853
self.fragmentation_prefix = "FRAGMENTATION_"
5954

@@ -80,6 +75,9 @@ def __init__(self, bench):
8075
def name(self):
8176
return self.bench_name
8277

78+
def enabled(self):
79+
return isUMFAvailable()
80+
8381
# --benchmark_format describes stdout output
8482
# --benchmark_out=<file> and --benchmark_out_format=<format>
8583
# describe output to a file
@@ -98,6 +96,8 @@ def setup(self):
9896
print("UMF prefix path not provided")
9997
return
10098

99+
self.oneapi = get_oneapi()
100+
self.umf_lib = options.umf + "lib"
101101
self.benchmark_bin = os.path.join(options.umf, "benchmark", self.bench_name)
102102

103103
def is_memory_statistics_included(self, data_row):

0 commit comments

Comments
 (0)