@@ -115,37 +115,11 @@ def additional_metadata(self) -> dict[str, BenchmarkMetadata]:
             ),
         }

-    def enabled_runtimes(self, supported_runtimes=None, extra_runtimes=None):
-        # all runtimes in the RUNTIMES enum
-        runtimes = supported_runtimes or list(RUNTIMES)
-
-        # filter out SYCL_PREVIEW which is not supported by default in all benchmarks
-        runtimes = [r for r in runtimes if r != RUNTIMES.SYCL_PREVIEW]
-
-        if extra_runtimes is not None:
-            runtimes.extend(extra_runtimes)
-
-        # Filter out UR if not available
-        if options.ur is None:
-            runtimes = [r for r in runtimes if r != RUNTIMES.UR]
-
-        # Filter out L0 if cuda backend
-        if options.ur_adapter == "cuda":
-            runtimes = [r for r in runtimes if r != RUNTIMES.LEVEL_ZERO]
-
-        return runtimes
-
     def benchmarks(self) -> list[Benchmark]:
-        if options.sycl is None:
-            return []
-
-        if options.ur_adapter == "hip":
-            return []
-
         benches = []

-        # Add SubmitKernel benchmarks using loops
-        for runtime in self.enabled_runtimes(extra_runtimes=[RUNTIMES.SYCL_PREVIEW]):
+        for runtime in list(RUNTIMES):
+            # Add SubmitKernel benchmarks using loops
             for in_order_queue in [0, 1]:
                 for measure_completion in [0, 1]:
                     for use_events in [0, 1]:
@@ -161,21 +135,18 @@ def benchmarks(self) -> list[Benchmark]:
                             )
                         )

-        # Add SinKernelGraph benchmarks
-        for runtime in self.enabled_runtimes():
+            # Add SinKernelGraph benchmarks
             for with_graphs in [0, 1]:
                 for num_kernels in [5, 100]:
                     benches.append(
                         GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels)
                     )

-        # Add ULLS benchmarks
-        for runtime in self.enabled_runtimes([RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]):
+            # Add ULLS benchmarks
             benches.append(UllsEmptyKernel(self, runtime, 1000, 256))
             benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1))

-        # Add GraphApiSubmitGraph benchmarks
-        for runtime in self.enabled_runtimes():
+            # Add GraphApiSubmitGraph benchmarks
             for in_order_queue in [0, 1]:
                 for num_kernels in [4, 10, 32]:
                     for measure_completion_time in [0, 1]:
@@ -201,24 +172,24 @@ def benchmarks(self) -> list[Benchmark]:
         ]

         # Add UR-specific benchmarks
-        if options.ur is not None:
-            benches += [
-                MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 1, 1, 1, 1, 0),
-                MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 0, 1, 1, 1, 0),
-                MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 1, 0),
-                MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 0, 0),
-                MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 0),
-                MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 1),
-                UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256, "Both"),
-                UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256 * 1024, "Both"),
-                UsmBatchMemoryAllocation(self, RUNTIMES.UR, "Device", 128, 256, "Both"),
-                UsmBatchMemoryAllocation(
-                    self, RUNTIMES.UR, "Device", 128, 16 * 1024, "Both"
-                ),
-                UsmBatchMemoryAllocation(
-                    self, RUNTIMES.UR, "Device", 128, 128 * 1024, "Both"
-                ),
-            ]
+        benches += [
+            MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 1, 1, 1, 1, 0),
+            MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 0, 1, 1, 1, 0),
+            MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 1, 0),
+            MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 0, 0),
+            MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 0),
+            MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 1),
+            UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256, "Both"),
+            UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256 * 1024, "Both"),
+            UsmBatchMemoryAllocation(self, RUNTIMES.UR, "Device", 128, 256, "Both"),
+            UsmBatchMemoryAllocation(
+                self, RUNTIMES.UR, "Device", 128, 16 * 1024, "Both"
+            ),
+            UsmBatchMemoryAllocation(
+                self, RUNTIMES.UR, "Device", 128, 128 * 1024, "Both"
+            ),
+        ]
+
         benches += [
             MemcpyExecute(
                 self, RUNTIMES.SYCL_PREVIEW, 4096, 1, 1024, 40, 1, 1, 0, 1, 0
@@ -246,11 +217,44 @@ def parse_unit_type(compute_unit):


 class ComputeBenchmark(Benchmark):
-    def __init__(self, bench, name, test):
+    def __init__(self, bench, name, test, runtime: RUNTIMES = None):
         super().__init__(bench.directory, bench)
         self.bench = bench
         self.bench_name = name
         self.test = test
+        self.runtime = runtime
+
+    def supported_runtimes(self) -> list[RUNTIMES]:
+        """Base runtimes supported by this benchmark, can be overridden."""
+        # By default, support all runtimes except SYCL_PREVIEW
+        return [r for r in RUNTIMES if r != RUNTIMES.SYCL_PREVIEW]
+
+    def enabled_runtimes(self) -> list[RUNTIMES]:
+        """Runtimes available given the current configuration."""
+        # Start with all supported runtimes and apply configuration filters
+        runtimes = self.supported_runtimes()
+
+        # Remove UR if not available
+        if options.ur is None:
+            runtimes = [r for r in runtimes if r != RUNTIMES.UR]
+
+        # Remove Level Zero if using CUDA backend
+        if options.ur_adapter == "cuda":
+            runtimes = [r for r in runtimes if r != RUNTIMES.LEVEL_ZERO]
+
+        return runtimes
+
+    def enabled(self) -> bool:
+        # SYCL is required for all benchmarks
+        if options.sycl is None:
+            return False
+
+        # HIP adapter is not supported
+        if options.ur_adapter == "hip":
+            return False
+
+        # Check if the specific runtime is enabled (or no specific runtime required)
+        return self.runtime is None or self.runtime in self.enabled_runtimes()

     def bin_args(self) -> list[str]:
         return []
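
The gating added in this hunk can be exercised on its own. Below is a minimal, self-contained sketch, assuming a stripped-down RUNTIMES enum and a stand-in options namespace in place of the framework's real imports; the class name ComputeBenchmarkSketch and the enum values are illustrative only, not the project's actual definitions.

# Sketch of the per-benchmark runtime gating introduced above.
# RUNTIMES values and the `options` namespace are simplified stand-ins.
from enum import Enum
from types import SimpleNamespace


class RUNTIMES(Enum):
    SYCL_PREVIEW = "syclpreview"
    SYCL = "sycl"
    LEVEL_ZERO = "l0"
    UR = "ur"


# Stand-in for the framework's global options (assumed fields only).
options = SimpleNamespace(sycl="/opt/sycl", ur=None, ur_adapter=None)


class ComputeBenchmarkSketch:
    def __init__(self, runtime: RUNTIMES = None):
        self.runtime = runtime

    def supported_runtimes(self) -> list[RUNTIMES]:
        # Default: everything except the preview runtime.
        return [r for r in RUNTIMES if r != RUNTIMES.SYCL_PREVIEW]

    def enabled_runtimes(self) -> list[RUNTIMES]:
        runtimes = self.supported_runtimes()
        if options.ur is None:
            runtimes = [r for r in runtimes if r != RUNTIMES.UR]
        if options.ur_adapter == "cuda":
            runtimes = [r for r in runtimes if r != RUNTIMES.LEVEL_ZERO]
        return runtimes

    def enabled(self) -> bool:
        if options.sycl is None:
            return False
        if options.ur_adapter == "hip":
            return False
        return self.runtime is None or self.runtime in self.enabled_runtimes()


if __name__ == "__main__":
    # With UR unavailable, a UR-bound benchmark reports disabled,
    # while a Level Zero one stays enabled.
    print(ComputeBenchmarkSketch(RUNTIMES.UR).enabled())          # False
    print(ComputeBenchmarkSketch(RUNTIMES.LEVEL_ZERO).enabled())  # True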
@@ -338,15 +342,17 @@ def __init__(
         KernelExecTime=1,
     ):
         self.ioq = ioq
-        self.runtime = runtime
         self.MeasureCompletion = MeasureCompletion
         self.UseEvents = UseEvents
         self.KernelExecTime = KernelExecTime
         self.NumKernels = 10
         super().__init__(
-            bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel"
+            bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel", runtime
         )

+    def supported_runtimes(self) -> list[RUNTIMES]:
+        return super().supported_runtimes() + [RUNTIMES.SYCL_PREVIEW]
+
     def get_tags(self):
         return ["submit", "latency", runtime_to_tag_name(self.runtime), "micro"]

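
The override added here composes with the base-class default from the previous hunk. A rough sketch, again using stand-in RUNTIMES values and hypothetical class names (SubmitKernelLike, UllsLike), shows how a subclass widens or narrows the supported set:

# Sketch of per-class overrides of supported_runtimes(); trimmed stand-ins,
# not the real framework classes.
from enum import Enum


class RUNTIMES(Enum):
    SYCL_PREVIEW = "syclpreview"
    SYCL = "sycl"
    LEVEL_ZERO = "l0"
    UR = "ur"


class Base:
    def supported_runtimes(self) -> list[RUNTIMES]:
        return [r for r in RUNTIMES if r != RUNTIMES.SYCL_PREVIEW]


class SubmitKernelLike(Base):
    # Opt back in to the preview runtime, as the hunk above does.
    def supported_runtimes(self) -> list[RUNTIMES]:
        return super().supported_runtimes() + [RUNTIMES.SYCL_PREVIEW]


class UllsLike(Base):
    # Restrict to the two runtimes the ULLS benchmarks build for.
    def supported_runtimes(self) -> list[RUNTIMES]:
        return [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]


print([r.value for r in SubmitKernelLike().supported_runtimes()])
# ['sycl', 'l0', 'ur', 'syclpreview']
print([r.value for r in UllsLike().supported_runtimes()])
# ['sycl', 'l0']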
@@ -617,7 +623,6 @@ def __init__(
         useCopyOffload,
         useBarrier,
     ):
-        self.runtime = runtime
         self.numOpsPerThread = numOpsPerThread
         self.numThreads = numThreads
         self.allocSize = allocSize
@@ -628,7 +633,7 @@ def __init__(
         self.useCopyOffload = useCopyOffload
         self.useBarrier = useBarrier
         super().__init__(
-            bench, f"multithread_benchmark_{self.runtime.value}", "MemcpyExecute"
+            bench, f"multithread_benchmark_{runtime.value}", "MemcpyExecute", runtime
         )

     def extra_env_vars(self) -> dict:
@@ -704,9 +709,8 @@ class GraphApiSinKernelGraph(ComputeBenchmark):
     def __init__(self, bench, runtime: RUNTIMES, withGraphs, numKernels):
         self.withGraphs = withGraphs
         self.numKernels = numKernels
-        self.runtime = runtime
         super().__init__(
-            bench, f"graph_api_benchmark_{runtime.value}", "SinKernelGraph"
+            bench, f"graph_api_benchmark_{runtime.value}", "SinKernelGraph", runtime
         )

     def explicit_group(self):
@@ -757,9 +761,10 @@ def __init__(
     ):
         self.inOrderQueue = inOrderQueue
         self.numKernels = numKernels
-        self.runtime = runtime
         self.measureCompletionTime = measureCompletionTime
-        super().__init__(bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph")
+        super().__init__(
+            bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph", runtime
+        )

     def explicit_group(self):
         return f"SubmitGraph, numKernels: {self.numKernels}"
@@ -802,8 +807,12 @@ class UllsEmptyKernel(ComputeBenchmark):
     def __init__(self, bench, runtime: RUNTIMES, wgc, wgs):
         self.wgc = wgc
         self.wgs = wgs
-        self.runtime = runtime
-        super().__init__(bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel")
+        super().__init__(
+            bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel", runtime
+        )
+
+    def supported_runtimes(self) -> list[RUNTIMES]:
+        return [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]

     def explicit_group(self):
         return f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}"
@@ -847,9 +856,13 @@ def __init__(
         self.barrier = barrier
         self.hostVisible = hostVisible
         self.ctrBasedEvents = ctrBasedEvents
-        self.runtime = runtime
         self.ioq = ioq
-        super().__init__(bench, f"ulls_benchmark_{runtime.value}", "KernelSwitch")
+        super().__init__(
+            bench, f"ulls_benchmark_{runtime.value}", "KernelSwitch", runtime
+        )
+
+    def supported_runtimes(self):
+        return [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]

     def explicit_group(self):
         return f"KernelSwitch, count: {self.count}, kernelTime: {self.kernelTime}"
@@ -882,12 +895,14 @@ class UsmMemoryAllocation(ComputeBenchmark):
     def __init__(
         self, bench, runtime: RUNTIMES, usm_memory_placement, size, measure_mode
     ):
-        self.runtime = runtime
         self.usm_memory_placement = usm_memory_placement
         self.size = size
         self.measure_mode = measure_mode
         super().__init__(
-            bench, f"api_overhead_benchmark_{runtime.value}", "UsmMemoryAllocation"
+            bench,
+            f"api_overhead_benchmark_{runtime.value}",
+            "UsmMemoryAllocation",
+            runtime,
         )

     def get_tags(self):
@@ -939,13 +954,15 @@ def __init__(
         size,
         measure_mode,
     ):
-        self.runtime = runtime
         self.usm_memory_placement = usm_memory_placement
         self.allocation_count = allocation_count
         self.size = size
         self.measure_mode = measure_mode
         super().__init__(
-            bench, f"api_overhead_benchmark_{runtime.value}", "UsmBatchMemoryAllocation"
+            bench,
+            f"api_overhead_benchmark_{runtime.value}",
+            "UsmBatchMemoryAllocation",
+            runtime,
         )

     def get_tags(self):