Skip to content

Commit 508260e

Browse files
committed
[Benchmarks] Dry run without phony sycl or ur args
Create metadata during dry runs for all benchmarks even when --sycl, --ur, and --umf options are not set
1 parent 4d4fde2 commit 508260e

File tree

7 files changed

+149
-88
lines changed

devops/scripts/benchmarks/benches/base.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,11 @@ def explicit_group(self) -> str:
5656
Can be modified."""
5757
return ""
5858

59+
def enabled(self) -> bool:
60+
"""Returns whether this benchmark is enabled.
61+
By default, it returns True, but can be overridden to disable a benchmark."""
62+
return True
63+
5964
@abstractmethod
6065
def setup(self):
6166
pass

devops/scripts/benchmarks/benches/compute.py

Lines changed: 112 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -115,37 +115,11 @@ def additional_metadata(self) -> dict[str, BenchmarkMetadata]:
115115
),
116116
}
117117

118-
def enabled_runtimes(self, supported_runtimes=None, extra_runtimes=None):
119-
# all runtimes in the RUNTIMES enum
120-
runtimes = supported_runtimes or list(RUNTIMES)
121-
122-
# filter out SYCL_PREVIEW which is not supported by default in all benchmarks
123-
runtimes = [r for r in runtimes if r != RUNTIMES.SYCL_PREVIEW]
124-
125-
if extra_runtimes is not None:
126-
runtimes.extend(extra_runtimes)
127-
128-
# Filter out UR if not available
129-
if options.ur is None:
130-
runtimes = [r for r in runtimes if r != RUNTIMES.UR]
131-
132-
# Filter out L0 if cuda backend
133-
if options.ur_adapter == "cuda":
134-
runtimes = [r for r in runtimes if r != RUNTIMES.LEVEL_ZERO]
135-
136-
return runtimes
137-
138118
def benchmarks(self) -> list[Benchmark]:
139-
if options.sycl is None:
140-
return []
141-
142-
if options.ur_adapter == "hip":
143-
return []
144-
145119
benches = []
146120

147-
# Add SubmitKernel benchmarks using loops
148-
for runtime in self.enabled_runtimes(extra_runtimes=[RUNTIMES.SYCL_PREVIEW]):
121+
for runtime in list(RUNTIMES):
122+
# Add SubmitKernel benchmarks using loops
149123
for in_order_queue in [0, 1]:
150124
for measure_completion in [0, 1]:
151125
for use_events in [0, 1]:
@@ -161,21 +135,18 @@ def benchmarks(self) -> list[Benchmark]:
161135
)
162136
)
163137

164-
# Add SinKernelGraph benchmarks
165-
for runtime in self.enabled_runtimes():
138+
# Add SinKernelGraph benchmarks
166139
for with_graphs in [0, 1]:
167140
for num_kernels in [5, 100]:
168141
benches.append(
169142
GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels)
170143
)
171144

172-
# Add ULLS benchmarks
173-
for runtime in self.enabled_runtimes([RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]):
145+
# Add ULLS benchmarks
174146
benches.append(UllsEmptyKernel(self, runtime, 1000, 256))
175147
benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1))
176148

177-
# Add GraphApiSubmitGraph benchmarks
178-
for runtime in self.enabled_runtimes():
149+
# Add GraphApiSubmitGraph benchmarks
179150
for in_order_queue in [0, 1]:
180151
for num_kernels in [4, 10, 32]:
181152
for measure_completion_time in [0, 1]:
@@ -201,24 +172,24 @@ def benchmarks(self) -> list[Benchmark]:
201172
]
202173

203174
# Add UR-specific benchmarks
204-
if options.ur is not None:
205-
benches += [
206-
MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 1, 1, 1, 1, 0),
207-
MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 0, 1, 1, 1, 0),
208-
MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 1, 0),
209-
MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 0, 0),
210-
MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 0),
211-
MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 1),
212-
UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256, "Both"),
213-
UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256 * 1024, "Both"),
214-
UsmBatchMemoryAllocation(self, RUNTIMES.UR, "Device", 128, 256, "Both"),
215-
UsmBatchMemoryAllocation(
216-
self, RUNTIMES.UR, "Device", 128, 16 * 1024, "Both"
217-
),
218-
UsmBatchMemoryAllocation(
219-
self, RUNTIMES.UR, "Device", 128, 128 * 1024, "Both"
220-
),
221-
]
175+
benches += [
176+
MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 1, 1, 1, 1, 0),
177+
MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 0, 1, 1, 1, 0),
178+
MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 1, 0),
179+
MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 0, 0),
180+
MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 0),
181+
MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 1),
182+
UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256, "Both"),
183+
UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256 * 1024, "Both"),
184+
UsmBatchMemoryAllocation(self, RUNTIMES.UR, "Device", 128, 256, "Both"),
185+
UsmBatchMemoryAllocation(
186+
self, RUNTIMES.UR, "Device", 128, 16 * 1024, "Both"
187+
),
188+
UsmBatchMemoryAllocation(
189+
self, RUNTIMES.UR, "Device", 128, 128 * 1024, "Both"
190+
),
191+
]
192+
222193
benches += [
223194
MemcpyExecute(
224195
self, RUNTIMES.SYCL_PREVIEW, 4096, 1, 1024, 40, 1, 1, 0, 1, 0
@@ -252,6 +223,13 @@ def __init__(self, bench, name, test):
252223
self.bench_name = name
253224
self.test = test
254225

226+
def enabled(self) -> bool:
227+
if options.sycl is None:
228+
return False
229+
if options.ur_adapter == "hip":
230+
return False
231+
return True
232+
255233
def bin_args(self) -> list[str]:
256234
return []
257235

@@ -269,6 +247,26 @@ def explicit_group(self):
269247
def description(self) -> str:
270248
return ""
271249

250+
def enabled_runtimes(self, supported_runtimes=None, extra_runtimes=None):
251+
# all runtimes in the RUNTIMES enum
252+
runtimes = supported_runtimes or list(RUNTIMES)
253+
254+
# filter out SYCL_PREVIEW which is not supported by default in all benchmarks
255+
runtimes = [r for r in runtimes if r != RUNTIMES.SYCL_PREVIEW]
256+
257+
if extra_runtimes is not None:
258+
runtimes.extend(extra_runtimes)
259+
260+
# Filter out UR if not available
261+
if options.ur is None:
262+
runtimes = [r for r in runtimes if r != RUNTIMES.UR]
263+
264+
# Filter out L0 if cuda backend
265+
if options.ur_adapter == "cuda":
266+
runtimes = [r for r in runtimes if r != RUNTIMES.LEVEL_ZERO]
267+
268+
return runtimes
269+
272270
def run(self, env_vars) -> list[Result]:
273271
command = [
274272
f"{self.benchmark_bin}",
@@ -347,6 +345,15 @@ def __init__(
347345
bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel"
348346
)
349347

348+
def enabled(self) -> bool:
349+
if not super().enabled():
350+
return False
351+
if self.runtime not in self.enabled_runtimes(
352+
extra_runtimes=[RUNTIMES.SYCL_PREVIEW]
353+
):
354+
return False
355+
return True
356+
350357
def get_tags(self):
351358
return ["submit", "latency", runtime_to_tag_name(self.runtime), "micro"]
352359

@@ -631,6 +638,13 @@ def __init__(
631638
bench, f"multithread_benchmark_{self.runtime.value}", "MemcpyExecute"
632639
)
633640

641+
def enabled(self) -> bool:
642+
if not super().enabled():
643+
return False
644+
if self.runtime == RUNTIMES.UR and options.ur is None:
645+
return False
646+
return True
647+
634648
def extra_env_vars(self) -> dict:
635649
if not self.useCopyOffload:
636650
return {"UR_L0_V2_FORCE_DISABLE_COPY_OFFLOAD": "1"}
@@ -709,6 +723,13 @@ def __init__(self, bench, runtime: RUNTIMES, withGraphs, numKernels):
709723
bench, f"graph_api_benchmark_{runtime.value}", "SinKernelGraph"
710724
)
711725

726+
def enabled(self) -> bool:
727+
if not super().enabled():
728+
return False
729+
if self.runtime not in self.enabled_runtimes():
730+
return False
731+
return True
732+
712733
def explicit_group(self):
713734
return f"SinKernelGraph, numKernels: {self.numKernels}"
714735

@@ -761,6 +782,13 @@ def __init__(
761782
self.measureCompletionTime = measureCompletionTime
762783
super().__init__(bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph")
763784

785+
def enabled(self) -> bool:
786+
if not super().enabled():
787+
return False
788+
if self.runtime not in self.enabled_runtimes():
789+
return False
790+
return True
791+
764792
def explicit_group(self):
765793
return f"SubmitGraph, numKernels: {self.numKernels}"
766794

@@ -805,6 +833,15 @@ def __init__(self, bench, runtime: RUNTIMES, wgc, wgs):
805833
self.runtime = runtime
806834
super().__init__(bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel")
807835

836+
def enabled(self) -> bool:
837+
if not super().enabled():
838+
return False
839+
if self.runtime not in self.enabled_runtimes(
840+
[RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]
841+
):
842+
return False
843+
return True
844+
808845
def explicit_group(self):
809846
return f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}"
810847

@@ -851,6 +888,15 @@ def __init__(
851888
self.ioq = ioq
852889
super().__init__(bench, f"ulls_benchmark_{runtime.value}", "KernelSwitch")
853890

891+
def enabled(self) -> bool:
892+
if not super().enabled():
893+
return False
894+
if self.runtime not in self.enabled_runtimes(
895+
[RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]
896+
):
897+
return False
898+
return True
899+
854900
def explicit_group(self):
855901
return f"KernelSwitch, count: {self.count}, kernelTime: {self.kernelTime}"
856902

@@ -890,6 +936,13 @@ def __init__(
890936
bench, f"api_overhead_benchmark_{runtime.value}", "UsmMemoryAllocation"
891937
)
892938

939+
def enabled(self) -> bool:
940+
if not super().enabled():
941+
return False
942+
if self.runtime == RUNTIMES.UR and options.ur is None:
943+
return False
944+
return True
945+
893946
def get_tags(self):
894947
return [runtime_to_tag_name(self.runtime), "micro", "latency", "memory"]
895948

@@ -948,6 +1001,13 @@ def __init__(
9481001
bench, f"api_overhead_benchmark_{runtime.value}", "UsmBatchMemoryAllocation"
9491002
)
9501003

1004+
def enabled(self) -> bool:
1005+
if not super().enabled():
1006+
return False
1007+
if self.runtime == RUNTIMES.UR and options.ur is None:
1008+
return False
1009+
return True
1010+
9511011
def get_tags(self):
9521012
return [runtime_to_tag_name(self.runtime), "micro", "latency", "memory"]
9531013

devops/scripts/benchmarks/benches/llamacpp.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,6 @@
1717

1818
class LlamaCppBench(Suite):
1919
def __init__(self, directory):
20-
if options.sycl is None:
21-
return
22-
2320
self.directory = directory
2421

2522
def name(self) -> str:
@@ -80,12 +77,6 @@ def setup(self):
8077
)
8178

8279
def benchmarks(self) -> list[Benchmark]:
83-
if options.sycl is None:
84-
return []
85-
86-
if options.ur_adapter == "cuda" or options.ur_adapter == "hip":
87-
return []
88-
8980
return [LlamaBench(self)]
9081

9182

@@ -94,6 +85,13 @@ def __init__(self, bench):
9485
super().__init__(bench.directory, bench)
9586
self.bench = bench
9687

88+
def enabled(self):
89+
if options.sycl is None:
90+
return False
91+
if options.ur_adapter == "cuda" or options.ur_adapter == "hip":
92+
return False
93+
return True
94+
9795
def setup(self):
9896
self.benchmark_bin = os.path.join(self.bench.build_path, "bin", "llama-bench")
9997

devops/scripts/benchmarks/benches/syclbench.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,6 @@
1414

1515
class SyclBench(Suite):
1616
def __init__(self, directory):
17-
if options.sycl is None:
18-
return
19-
2017
self.directory = directory
2118
return
2219

@@ -67,9 +64,6 @@ def setup(self):
6764
self.built = True
6865

6966
def benchmarks(self) -> list[Benchmark]:
70-
if options.sycl is None:
71-
return []
72-
7367
return [
7468
# Blocked_transform(self), # run time < 1ms
7569
DagTaskI(self),
@@ -117,6 +111,9 @@ def __init__(self, bench, name, test):
117111
self.bench_name = name
118112
self.test = test
119113

114+
def enabled(self) -> bool:
115+
return options.sycl is not None
116+
120117
def bin_args(self) -> list[str]:
121118
return []
122119

devops/scripts/benchmarks/benches/umf.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,6 @@ def setup(self):
3333
self.built = True
3434

3535
def benchmarks(self) -> list[Benchmark]:
36-
if not isUMFAvailable():
37-
return []
38-
3936
benches = [
4037
GBench(self),
4138
GBenchUmfProxy(self),
@@ -52,8 +49,6 @@ def __init__(self, bench):
5249

5350
self.bench = bench
5451
self.bench_name = "umf-benchmark"
55-
self.oneapi = get_oneapi()
56-
self.umf_lib = options.umf + "lib"
5752

5853
self.fragmentation_prefix = "FRAGMENTATION_"
5954

@@ -80,6 +75,9 @@ def __init__(self, bench):
8075
def name(self):
8176
return self.bench_name
8277

78+
def enabled(self):
79+
return isUMFAvailable()
80+
8381
# --benchmark_format describes stdout output
8482
# --benchmark_out=<file> and --benchmark_out_format=<format>
8583
# describe output to a file
@@ -98,6 +96,8 @@ def setup(self):
9896
print("UMF prefix path not provided")
9997
return
10098

99+
self.oneapi = get_oneapi()
100+
self.umf_lib = options.umf + "lib"
101101
self.benchmark_bin = os.path.join(options.umf, "benchmark", self.bench_name)
102102

103103
def is_memory_statistics_included(self, data_row):

0 commit comments

Comments (0)