@@ -115,37 +115,11 @@ def additional_metadata(self) -> dict[str, BenchmarkMetadata]:
115
115
),
116
116
}
117
117
118
- def enabled_runtimes (self , supported_runtimes = None , extra_runtimes = None ):
119
- # all runtimes in the RUNTIMES enum
120
- runtimes = supported_runtimes or list (RUNTIMES )
121
-
122
- # filter out SYCL_PREVIEW which is not supported by default in all benchmarks
123
- runtimes = [r for r in runtimes if r != RUNTIMES .SYCL_PREVIEW ]
124
-
125
- if extra_runtimes is not None :
126
- runtimes .extend (extra_runtimes )
127
-
128
- # Filter out UR if not available
129
- if options .ur is None :
130
- runtimes = [r for r in runtimes if r != RUNTIMES .UR ]
131
-
132
- # Filter out L0 if cuda backend
133
- if options .ur_adapter == "cuda" :
134
- runtimes = [r for r in runtimes if r != RUNTIMES .LEVEL_ZERO ]
135
-
136
- return runtimes
137
-
138
118
def benchmarks (self ) -> list [Benchmark ]:
139
- if options .sycl is None :
140
- return []
141
-
142
- if options .ur_adapter == "hip" :
143
- return []
144
-
145
119
benches = []
146
120
147
- # Add SubmitKernel benchmarks using loops
148
- for runtime in self . enabled_runtimes ( extra_runtimes = [ RUNTIMES . SYCL_PREVIEW ]):
121
+ for runtime in list ( RUNTIMES ):
122
+ # Add SubmitKernel benchmarks using loops
149
123
for in_order_queue in [0 , 1 ]:
150
124
for measure_completion in [0 , 1 ]:
151
125
for use_events in [0 , 1 ]:
@@ -161,21 +135,18 @@ def benchmarks(self) -> list[Benchmark]:
161
135
)
162
136
)
163
137
164
- # Add SinKernelGraph benchmarks
165
- for runtime in self .enabled_runtimes ():
138
+ # Add SinKernelGraph benchmarks
166
139
for with_graphs in [0 , 1 ]:
167
140
for num_kernels in [5 , 100 ]:
168
141
benches .append (
169
142
GraphApiSinKernelGraph (self , runtime , with_graphs , num_kernels )
170
143
)
171
144
172
- # Add ULLS benchmarks
173
- for runtime in self .enabled_runtimes ([RUNTIMES .SYCL , RUNTIMES .LEVEL_ZERO ]):
145
+ # Add ULLS benchmarks
174
146
benches .append (UllsEmptyKernel (self , runtime , 1000 , 256 ))
175
147
benches .append (UllsKernelSwitch (self , runtime , 8 , 200 , 0 , 0 , 1 , 1 ))
176
148
177
- # Add GraphApiSubmitGraph benchmarks
178
- for runtime in self .enabled_runtimes ():
149
+ # Add GraphApiSubmitGraph benchmarks
179
150
for in_order_queue in [0 , 1 ]:
180
151
for num_kernels in [4 , 10 , 32 ]:
181
152
for measure_completion_time in [0 , 1 ]:
@@ -201,24 +172,24 @@ def benchmarks(self) -> list[Benchmark]:
201
172
]
202
173
203
174
# Add UR-specific benchmarks
204
- if options . ur is not None :
205
- benches += [
206
- MemcpyExecute (self , RUNTIMES .UR , 400 , 1 , 102400 , 10 , 1 , 1 , 1 , 1 , 0 ),
207
- MemcpyExecute (self , RUNTIMES .UR , 400 , 1 , 102400 , 10 , 0 , 1 , 1 , 1 , 0 ),
208
- MemcpyExecute (self , RUNTIMES .UR , 100 , 4 , 102400 , 10 , 1 , 1 , 0 , 1 , 0 ),
209
- MemcpyExecute (self , RUNTIMES .UR , 100 , 4 , 102400 , 10 , 1 , 1 , 0 , 0 , 0 ),
210
- MemcpyExecute (self , RUNTIMES .UR , 4096 , 4 , 1024 , 10 , 0 , 1 , 0 , 1 , 0 ),
211
- MemcpyExecute (self , RUNTIMES .UR , 4096 , 4 , 1024 , 10 , 0 , 1 , 0 , 1 , 1 ),
212
- UsmMemoryAllocation (self , RUNTIMES .UR , "Device" , 256 , "Both" ),
213
- UsmMemoryAllocation (self , RUNTIMES .UR , "Device" , 256 * 1024 , "Both" ),
214
- UsmBatchMemoryAllocation (self , RUNTIMES . UR , "Device" , 128 , 256 , "Both" ),
215
- UsmBatchMemoryAllocation (
216
- self , RUNTIMES . UR , "Device" , 128 , 16 * 1024 , "Both"
217
- ),
218
- UsmBatchMemoryAllocation (
219
- self , RUNTIMES . UR , "Device" , 128 , 128 * 1024 , "Both"
220
- ),
221
- ]
175
+ benches += [
176
+ MemcpyExecute ( self , RUNTIMES . UR , 400 , 1 , 102400 , 10 , 1 , 1 , 1 , 1 , 0 ),
177
+ MemcpyExecute (self , RUNTIMES .UR , 400 , 1 , 102400 , 10 , 0 , 1 , 1 , 1 , 0 ),
178
+ MemcpyExecute (self , RUNTIMES .UR , 100 , 4 , 102400 , 10 , 1 , 1 , 0 , 1 , 0 ),
179
+ MemcpyExecute (self , RUNTIMES .UR , 100 , 4 , 102400 , 10 , 1 , 1 , 0 , 0 , 0 ),
180
+ MemcpyExecute (self , RUNTIMES .UR , 4096 , 4 , 1024 , 10 , 0 , 1 , 0 , 1 , 0 ),
181
+ MemcpyExecute (self , RUNTIMES .UR , 4096 , 4 , 1024 , 10 , 0 , 1 , 0 , 1 , 1 ),
182
+ UsmMemoryAllocation (self , RUNTIMES .UR , "Device" , 256 , "Both" ),
183
+ UsmMemoryAllocation (self , RUNTIMES .UR , "Device" , 256 * 1024 , "Both" ),
184
+ UsmBatchMemoryAllocation (self , RUNTIMES .UR , "Device" , 128 , 256 , "Both" ),
185
+ UsmBatchMemoryAllocation (
186
+ self , RUNTIMES . UR , "Device" , 128 , 16 * 1024 , "Both"
187
+ ),
188
+ UsmBatchMemoryAllocation (
189
+ self , RUNTIMES . UR , "Device" , 128 , 128 * 1024 , "Both"
190
+ ),
191
+ ]
192
+
222
193
benches += [
223
194
MemcpyExecute (
224
195
self , RUNTIMES .SYCL_PREVIEW , 4096 , 1 , 1024 , 40 , 1 , 1 , 0 , 1 , 0
@@ -252,6 +223,13 @@ def __init__(self, bench, name, test):
252
223
self .bench_name = name
253
224
self .test = test
254
225
226
def enabled(self) -> bool:
    """Report whether this benchmark can run under the current options.

    Returns:
        False when ``options.sycl`` is unset or the UR adapter is ``"hip"``;
        True otherwise.
    """
    # Either condition alone rules the benchmark out, so one negated check
    # covers both guards.
    return not (options.sycl is None or options.ur_adapter == "hip")
232
+
255
233
def bin_args (self ) -> list [str ]:
256
234
return []
257
235
@@ -269,6 +247,26 @@ def explicit_group(self):
269
247
def description(self) -> str:
    """Return the description text; this implementation provides none."""
    return ""
271
249
250
def enabled_runtimes(self, supported_runtimes=None, extra_runtimes=None):
    """Build the list of runtimes usable under the current options.

    Args:
        supported_runtimes: Runtimes to start from. When omitted (or empty),
            every member of ``RUNTIMES`` is used.
        extra_runtimes: Optional runtimes appended after the default
            ``SYCL_PREVIEW`` exclusion, letting a caller opt back in.

    Returns:
        A new list of runtimes; the ``supported_runtimes`` argument is not
        modified.
    """
    candidates = supported_runtimes if supported_runtimes else list(RUNTIMES)

    # SYCL_PREVIEW is not supported by default in all benchmarks, so it is
    # dropped here and only comes back through ``extra_runtimes``.
    selected = [rt for rt in candidates if rt != RUNTIMES.SYCL_PREVIEW]

    if extra_runtimes is not None:
        selected.extend(extra_runtimes)

    # Option-dependent exclusions, applied in order:
    #   * UR is unavailable when no UR build is configured;
    #   * Level Zero is dropped when the cuda adapter is selected.
    exclusions = (
        (options.ur is None, RUNTIMES.UR),
        (options.ur_adapter == "cuda", RUNTIMES.LEVEL_ZERO),
    )
    for applies, banned in exclusions:
        if applies:
            selected = [rt for rt in selected if rt != banned]

    return selected
269
+
272
270
def run (self , env_vars ) -> list [Result ]:
273
271
command = [
274
272
f"{ self .benchmark_bin } " ,
@@ -347,6 +345,15 @@ def __init__(
347
345
bench , f"api_overhead_benchmark_{ runtime .value } " , "SubmitKernel"
348
346
)
349
347
348
def enabled(self) -> bool:
    """Report whether this benchmark should run for its runtime.

    The parent check must pass, and ``self.runtime`` must be among the
    runtimes returned by ``enabled_runtimes``; SYCL_PREVIEW is requested
    explicitly through ``extra_runtimes``.
    """
    if not super().enabled():
        return False
    return self.runtime in self.enabled_runtimes(
        extra_runtimes=[RUNTIMES.SYCL_PREVIEW]
    )
356
+
350
357
def get_tags (self ):
351
358
return ["submit" , "latency" , runtime_to_tag_name (self .runtime ), "micro" ]
352
359
@@ -631,6 +638,13 @@ def __init__(
631
638
bench , f"multithread_benchmark_{ self .runtime .value } " , "MemcpyExecute"
632
639
)
633
640
641
def enabled(self) -> bool:
    """Report whether this benchmark should run for its runtime.

    The parent check must pass; in addition, a benchmark targeting the UR
    runtime is disabled when ``options.ur`` is unset.
    """
    if not super().enabled():
        return False
    ur_unavailable = self.runtime == RUNTIMES.UR and options.ur is None
    return not ur_unavailable
647
+
634
648
def extra_env_vars (self ) -> dict :
635
649
if not self .useCopyOffload :
636
650
return {"UR_L0_V2_FORCE_DISABLE_COPY_OFFLOAD" : "1" }
@@ -709,6 +723,13 @@ def __init__(self, bench, runtime: RUNTIMES, withGraphs, numKernels):
709
723
bench , f"graph_api_benchmark_{ runtime .value } " , "SinKernelGraph"
710
724
)
711
725
726
def enabled(self) -> bool:
    """Report whether this benchmark should run for its runtime.

    The parent check must pass, and ``self.runtime`` must be among the
    runtimes returned by ``enabled_runtimes()`` with its default arguments.
    """
    if not super().enabled():
        return False
    return self.runtime in self.enabled_runtimes()
732
+
712
733
def explicit_group (self ):
713
734
return f"SinKernelGraph, numKernels: { self .numKernels } "
714
735
@@ -761,6 +782,13 @@ def __init__(
761
782
self .measureCompletionTime = measureCompletionTime
762
783
super ().__init__ (bench , f"graph_api_benchmark_{ runtime .value } " , "SubmitGraph" )
763
784
785
def enabled(self) -> bool:
    """Report whether this benchmark should run for its runtime.

    Requires the parent check to pass and ``self.runtime`` to appear in the
    default ``enabled_runtimes()`` list.
    """
    if not super().enabled():
        return False
    return self.runtime in self.enabled_runtimes()
791
+
764
792
def explicit_group (self ):
765
793
return f"SubmitGraph, numKernels: { self .numKernels } "
766
794
@@ -805,6 +833,15 @@ def __init__(self, bench, runtime: RUNTIMES, wgc, wgs):
805
833
self .runtime = runtime
806
834
super ().__init__ (bench , f"ulls_benchmark_{ runtime .value } " , "EmptyKernel" )
807
835
836
def enabled(self) -> bool:
    """Report whether this benchmark should run for its runtime.

    The parent check must pass, and ``self.runtime`` must survive the
    option-based filtering of the SYCL / Level Zero candidate list passed
    to ``enabled_runtimes``.
    """
    if not super().enabled():
        return False
    return self.runtime in self.enabled_runtimes(
        [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]
    )
844
+
808
845
def explicit_group (self ):
809
846
return f"EmptyKernel, wgc: { self .wgc } , wgs: { self .wgs } "
810
847
@@ -851,6 +888,15 @@ def __init__(
851
888
self .ioq = ioq
852
889
super ().__init__ (bench , f"ulls_benchmark_{ runtime .value } " , "KernelSwitch" )
853
890
891
def enabled(self) -> bool:
    """Report whether this benchmark should run for its runtime.

    Requires the parent check to pass and ``self.runtime`` to be one of the
    runtimes ``enabled_runtimes`` keeps from the SYCL / Level Zero
    candidate list.
    """
    if not super().enabled():
        return False
    return self.runtime in self.enabled_runtimes(
        [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]
    )
899
+
854
900
def explicit_group (self ):
855
901
return f"KernelSwitch, count: { self .count } , kernelTime: { self .kernelTime } "
856
902
@@ -890,6 +936,13 @@ def __init__(
890
936
bench , f"api_overhead_benchmark_{ runtime .value } " , "UsmMemoryAllocation"
891
937
)
892
938
939
def enabled(self) -> bool:
    """Report whether this benchmark should run for its runtime.

    The parent check must pass; a benchmark targeting the UR runtime is
    additionally disabled when ``options.ur`` is unset.
    """
    if not super().enabled():
        return False
    needs_missing_ur = self.runtime == RUNTIMES.UR and options.ur is None
    return not needs_missing_ur
945
+
893
946
def get_tags (self ):
894
947
return [runtime_to_tag_name (self .runtime ), "micro" , "latency" , "memory" ]
895
948
@@ -948,6 +1001,13 @@ def __init__(
948
1001
bench , f"api_overhead_benchmark_{ runtime .value } " , "UsmBatchMemoryAllocation"
949
1002
)
950
1003
1004
def enabled(self) -> bool:
    """Report whether this benchmark should run for its runtime.

    Requires the parent check to pass, and rejects the UR runtime when
    ``options.ur`` is unset.
    """
    if not super().enabled():
        return False
    ur_not_configured = self.runtime == RUNTIMES.UR and options.ur is None
    return not ur_not_configured
1010
+
951
1011
def get_tags (self ):
952
1012
return [runtime_to_tag_name (self .runtime ), "micro" , "latency" , "memory" ]
953
1013
0 commit comments