Commit db861ba

[Benchmarks] Measure cpu instructions optionally
It makes sense to measure either elapsed time or CPU instructions retired, but not both. This change adds an argument to the benchmark scripts so that Compute Benchmarks scenarios run with only one of the two profilers, which significantly lowers the number of benchmark scenarios. As a result the tests take less time to complete, and the user can choose to produce either time or CPU instruction count results, where applicable.
1 parent dfcd9c5 commit db861ba
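
To illustrate the mechanism described above: the scripts gain an options.profiler_type field (set from the new --profiler-type argument) and each affected benchmark checks it in its enabled() hook. The sketch below is a simplified, hypothetical stand-in (the class name QueueMemcpyLike and the stripped-down Options are illustrative, not the real classes in devops/scripts/benchmarks), but it shows the selection logic the diff adds.

from dataclasses import dataclass
from enum import Enum


class PROFILERS(Enum):
    TIMER = "timer"
    CPU_COUNTER = "cpuCounter"


@dataclass
class Options:
    # Filled in from the new --profiler-type command-line argument.
    profiler_type: str = "timer"


options = Options()


class QueueMemcpyLike:
    # Hypothetical stand-in for a benchmark that supports both profilers.
    def __init__(self, profiler_type: PROFILERS):
        self.profiler_type = profiler_type

    def enabled(self) -> bool:
        # Only the variant matching the requested profiler runs, so each
        # scenario is executed once per invocation instead of twice.
        return options.profiler_type == self.profiler_type.value


Only one of QueueMemcpyLike(PROFILERS.TIMER) and QueueMemcpyLike(PROFILERS.CPU_COUNTER) is enabled in a given run, which is what reduces the number of Compute Benchmarks scenarios executed.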

8 files changed (+116, -32 lines)

.github/workflows/sycl-linux-precommit.yml

Lines changed: 1 addition & 0 deletions
@@ -241,6 +241,7 @@ jobs:
       benchmark_upload_results: false
       benchmark_preset: 'Minimal'
       benchmark_dry_run: true
+      benchmark_profiler_type: 'cpuCounter'
       repo_ref: ${{ github.sha }}
       toolchain_artifact: ${{ needs.build.outputs.toolchain_artifact }}
       toolchain_artifact_filename: ${{ needs.build.outputs.toolchain_artifact_filename }}

.github/workflows/sycl-linux-run-tests.yml

Lines changed: 8 additions & 0 deletions
@@ -140,6 +140,13 @@ on:
         type: string
         default: 'false'
         required: False
+      benchmark_profiler_type:
+        description: |
+          Type of profiler to use for benchmarks. Options are "timer" and
+          "cpuCounter". Default is "cpuCounter".
+        type: string
+        default: 'cpuCounter'
+        required: False

   workflow_dispatch:
     inputs:
@@ -359,6 +366,7 @@ jobs:
         preset: ${{ inputs.benchmark_preset }}
         dry_run: ${{ inputs.benchmark_dry_run }}
         build_ref: ${{ inputs.repo_ref }}
+        profiler_type: ${{ inputs.benchmark_profiler_type }}
       env:
         RUNNER_TAG: ${{ inputs.runner }}
         GITHUB_TOKEN: ${{ secrets.LLVM_SYCL_BENCHMARK_TOKEN }}

.github/workflows/sycl-nightly-benchmarking.yml

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ jobs:
       benchmark_upload_results: true
       benchmark_save_name: ${{ matrix.save_name }}
       benchmark_preset: ${{ matrix.preset }}
+      benchmark_profiler_type: cpuCounter
       repo_ref: ${{ matrix.ref }}
       toolchain_artifact: ${{ needs.ubuntu2204_build.outputs.toolchain_artifact }}
       toolchain_artifact_filename: ${{ needs.ubuntu2204_build.outputs.toolchain_artifact_filename }}

.github/workflows/sycl-ur-perf-benchmarking.yml

Lines changed: 10 additions & 0 deletions
@@ -61,6 +61,15 @@ on:
           - Normal
           - Test
         default: 'Minimal' # Only compute-benchmarks
+      benchmark_profiler_type:
+        description: |
+          Type of profiler to use for benchmarks. Options are "timer" and
+          "cpuCounter". Default is "cpuCounter".
+        type: choice
+        options:
+          - timer
+          - cpuCounter
+        default: 'cpuCounter'
       pr_no:
         type: string
         description: |
@@ -192,6 +201,7 @@ jobs:
       benchmark_upload_results: ${{ inputs.upload_results }}
       benchmark_save_name: ${{ needs.sanitize_inputs.outputs.benchmark_save_name }}
       benchmark_preset: ${{ inputs.preset }}
+      benchmark_profiler_type: ${{ inputs.benchmark_profiler_type }}
       repo_ref: ${{ needs.sanitize_inputs.outputs.build_ref }}
       toolchain_artifact: ${{ needs.build_sycl.outputs.toolchain_artifact }}
       toolchain_artifact_filename: ${{ needs.build_sycl.outputs.toolchain_artifact_filename }}

devops/actions/run-tests/benchmark/action.yml

Lines changed: 18 additions & 1 deletion
@@ -32,6 +32,10 @@ inputs:
   dry_run:
     type: string
     required: False
+  profiler_type:
+    type: string
+    required: False
+    default: "cpuCounter" # Other option is "timer"

 runs:
   using: "composite"
@@ -41,6 +45,7 @@ runs:
       env:
         TARGET_DEVICE: ${{ inputs.target_devices }}
         PRESET: ${{ inputs.preset }}
+        PROFILER_TYPE: ${{ inputs.profiler_type }}
       run: |
         case "$RUNNER_TAG" in
           '["PVC_PERF"]' ) ;;
@@ -75,6 +80,17 @@ runs:
         python3 ./devops/scripts/benchmarks/presets.py query "$PRESET"
         [ "$?" -ne 0 ] && exit 1 # Stop workflow if invalid preset
         echo "PRESET=$PRESET" >> $GITHUB_ENV
+
+        # Validate profiler type input
+        case "$PROFILER_TYPE" in
+          "timer") PROFILER_TYPE="timer" ;;
+          "cpuCounter") PROFILER_TYPE="cpuCounter" ;;
+          *)
+            echo "Invalid profiler type specified: $PROFILER_TYPE"
+            exit 1
+            ;;
+        esac
+        echo "PROFILER_TYPE=$PROFILER_TYPE" >> $GITHUB_ENV
     - name: Compute CPU core range to run benchmarks on
       shell: bash
       run: |
@@ -203,7 +219,8 @@ runs:
           --output-dir "./llvm-ci-perf-results/" \
           --preset "$PRESET" \
           --timestamp-override "$SAVE_TIMESTAMP" \
-          --detect-version sycl,compute_runtime
+          --detect-version sycl,compute_runtime \
+          --profiler-type "$PROFILER_TYPE"

         echo "-----"
         python3 ./devops/scripts/benchmarks/compare.py to_hist \

devops/scripts/benchmarks/benches/compute.py

Lines changed: 68 additions & 31 deletions
@@ -370,6 +370,13 @@ def benchmark_bin(self) -> Path:
         """Returns the path to the benchmark binary"""
         return self.bench.project.build_dir / "bin" / self.bench_name

+    def cpu_count_str(self, separator: str = " ") -> str:
+        return (
+            f"{separator}CPU count"
+            if self.profiler_type == PROFILERS.CPU_COUNTER
+            else ""
+        )
+
     def get_iters(self, run_trace: TracingType):
         """Returns the number of iterations to run for the given tracing type."""
         return (
@@ -539,11 +546,16 @@ def supported_runtimes(self) -> list[RUNTIMES]:
         return super().supported_runtimes() + [RUNTIMES.SYCL_PREVIEW]

     def enabled(self) -> bool:
-        # This is a workaround for the BMG server where we have old results for self.KernelExecTime=20
-        # The benchmark instance gets created just to make metadata for these old results
         if not super().enabled():
             return False

+        if (
+            self.runtime in (RUNTIMES.SYCL, RUNTIMES.UR)
+        ) and options.profiler_type != self.profiler_type.value:
+            return False
+
+        # This is a workaround for the BMG server where we have old results for self.KernelExecTime=20
+        # The benchmark instance gets created just to make metadata for these old results
         device_arch = getattr(options, "device_architecture", "")
         if "bmg" in device_arch and self.KernelExecTime == 20:
             # Disable this benchmark for BMG server, just create metadata
@@ -568,7 +580,7 @@ def name(self):
             f" KernelExecTime={self.KernelExecTime}" if self.KernelExecTime != 1 else ""
         )

-        return f"api_overhead_benchmark_{self.runtime.value} SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}"
+        return f"api_overhead_benchmark_{self.runtime.value} SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}{self.cpu_count_str()}"

     def display_name(self) -> str:
         order = "in order" if self.ioq else "out of order"
@@ -580,7 +592,7 @@ def display_name(self) -> str:
         if self.KernelExecTime != 1:
             info.append(f"KernelExecTime={self.KernelExecTime}")
         additional_info = f" {' '.join(info)}" if info else ""
-        return f"{self.runtime.value.upper()} SubmitKernel {order}{additional_info}, NumKernels {self.NumKernels}"
+        return f"{self.runtime.value.upper()} SubmitKernel {order}{additional_info}, NumKernels {self.NumKernels}{self.cpu_count_str(', ')}"

     def explicit_group(self):
         order = "in order" if self.ioq else "out of order"
@@ -589,7 +601,7 @@ def explicit_group(self):

         kernel_exec_time_str = f" long kernel" if self.KernelExecTime != 1 else ""

-        return f"SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}"
+        return f"SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}{self.cpu_count_str(', ')}"

     def description(self) -> str:
         order = "in-order" if self.ioq else "out-of-order"
@@ -607,18 +619,16 @@ def range(self) -> tuple[float, float]:

     def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
         iters = self.get_iters(run_trace)
-        bin_args = [
+        return [
             f"--iterations={iters}",
             f"--Ioq={self.ioq}",
             f"--MeasureCompletion={self.MeasureCompletion}",
             "--Profiling=0",
             f"--NumKernels={self.NumKernels}",
             f"--KernelExecTime={self.KernelExecTime}",
             f"--UseEvents={self.UseEvents}",
+            f"--profilerType={self.profiler_type.value}",
         ]
-        if self.runtime == RUNTIMES.SYCL or self.runtime == RUNTIMES.UR:
-            bin_args.append(f"--profilerType={self.profiler_type.value}")
-        return bin_args

     def get_metadata(self) -> dict[str, BenchmarkMetadata]:
         metadata_dict = super().get_metadata()
@@ -656,13 +666,18 @@ def __init__(
             profiler_type=profiler_type,
         )

+    def enabled(self) -> bool:
+        if options.profiler_type != self.profiler_type.value:
+            return False
+        return super().enabled()
+
     def name(self):
         order = "in order" if self.ioq else "out of order"
-        return f"api_overhead_benchmark_sycl ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}"
+        return f"api_overhead_benchmark_sycl ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str()}"

     def display_name(self) -> str:
         order = "in order" if self.ioq else "out of order"
-        return f"SYCL ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}"
+        return f"SYCL ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(', ')}"

     def description(self) -> str:
         order = "in-order" if self.ioq else "out-of-order"
@@ -706,11 +721,16 @@ def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
             profiler_type=profiler_type,
         )

+    def enabled(self) -> bool:
+        if options.profiler_type != self.profiler_type.value:
+            return False
+        return super().enabled()
+
     def name(self):
-        return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"
+        return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str()}"

     def display_name(self) -> str:
-        return f"SYCL QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"
+        return f"SYCL QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(', ')}"

     def description(self) -> str:
         operation = "copy-only" if self.isCopyOnly else "copy and command submission"
@@ -748,11 +768,16 @@ def __init__(self, bench, source, destination, size, profiler_type):
             bench, "memory_benchmark_sycl", "QueueMemcpy", profiler_type=profiler_type
         )

+    def enabled(self) -> bool:
+        if options.profiler_type != self.profiler_type.value:
+            return False
+        return super().enabled()
+
     def name(self):
-        return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}"
+        return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str()}"

     def display_name(self) -> str:
-        return f"SYCL QueueMemcpy from {self.source} to {self.destination}, size {self.size}"
+        return f"SYCL QueueMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(', ')}"

     def description(self) -> str:
         return (
@@ -1038,8 +1063,16 @@ def __init__(
     def supported_runtimes(self) -> list[RUNTIMES]:
         return super().supported_runtimes() + [RUNTIMES.SYCL_PREVIEW]

+    def enabled(self) -> bool:
+        if (
+            self.runtime == RUNTIMES.SYCL
+            and options.profiler_type != self.profiler_type.value
+        ):
+            return False
+        return super().enabled()
+
     def explicit_group(self):
-        return f"SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels"
+        return f"SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels{self.cpu_count_str(', ')}"

     def description(self) -> str:
         return (
@@ -1048,10 +1081,10 @@ def description(self) -> str:
         )

     def name(self):
-        return f"graph_api_benchmark_{self.runtime.value} SubmitGraph{self.use_events_str}{self.host_tasks_str} numKernels:{self.numKernels} ioq {self.inOrderQueue} measureCompletion {self.measureCompletionTime}"
+        return f"graph_api_benchmark_{self.runtime.value} SubmitGraph{self.use_events_str}{self.host_tasks_str} numKernels:{self.numKernels} ioq {self.inOrderQueue} measureCompletion {self.measureCompletionTime}{self.cpu_count_str()}"

     def display_name(self) -> str:
-        return f"{self.runtime.value.upper()} SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels"
+        return f"{self.runtime.value.upper()} SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels{self.cpu_count_str(', ')}"

     def get_tags(self):
         return [
@@ -1064,7 +1097,7 @@ def get_tags(self):

     def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
         iters = self.get_iters(run_trace)
-        bin_args = [
+        return [
             f"--iterations={iters}",
             f"--NumKernels={self.numKernels}",
             f"--MeasureCompletionTime={self.measureCompletionTime}",
@@ -1074,10 +1107,8 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--UseEvents={self.useEvents}",
             "--UseExplicit=0",
             f"--UseHostTasks={self.useHostTasks}",
+            f"--profilerType={self.profiler_type.value}",
         ]
-        if self.runtime == RUNTIMES.SYCL:
-            bin_args.append(f"--profilerType={self.profiler_type.value}")
-        return bin_args

     def get_metadata(self) -> dict[str, BenchmarkMetadata]:
         metadata_dict = super().get_metadata()
@@ -1116,33 +1147,39 @@ def __init__(
     def supported_runtimes(self) -> list[RUNTIMES]:
         return [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]

+    def enabled(self) -> bool:
+        if (
+            self.runtime == RUNTIMES.SYCL
+            and options.profiler_type != self.profiler_type.value
+        ):
+            return False
+        return super().enabled()
+
     def explicit_group(self):
-        return f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}"
+        return (
+            f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}{self.cpu_count_str(', ')}"
+        )

     def description(self) -> str:
         return ""

     def name(self):
-        return f"ulls_benchmark_{self.runtime.value} EmptyKernel wgc:{self.wgc}, wgs:{self.wgs}"
+        return f"ulls_benchmark_{self.runtime.value} EmptyKernel wgc:{self.wgc}, wgs:{self.wgs}{self.cpu_count_str()}"

     def display_name(self) -> str:
-        return (
-            f"{self.runtime.value.upper()} EmptyKernel, wgc {self.wgc}, wgs {self.wgs}"
-        )
+        return f"{self.runtime.value.upper()} EmptyKernel, wgc {self.wgc}, wgs {self.wgs}{self.cpu_count_str(', ')}"

     def get_tags(self):
         return [runtime_to_tag_name(self.runtime), "micro", "latency", "submit"]

     def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
         iters = self.get_iters(run_trace)
-        bin_args = [
+        return [
             f"--iterations={iters}",
             f"--wgs={self.wgs}",
             f"--wgc={self.wgc}",
+            f"--profilerType={self.profiler_type.value}",
         ]
-        if self.runtime == RUNTIMES.SYCL:
-            bin_args.append(f"--profilerType={self.profiler_type.value}")
-        return bin_args


 class UllsKernelSwitch(ComputeBenchmark):

devops/scripts/benchmarks/main.py

Lines changed: 9 additions & 0 deletions
@@ -689,6 +689,14 @@ def validate_and_parse_env_args(env_args):
         help="Set the logging level",
         default="info",
     )
+    parser.add_argument(
+        "--profiler-type",
+        type=str,
+        choices=["timer", "cpuCounter"],
+        help="Set the profiler type for benchmarks. 'timer' measures execution time, "
+        "'cpuCounter' measures CPU instruction count for supported benchmarks.",
+        default="timer",
+    )

     args = parser.parse_args()
     additional_env_vars = validate_and_parse_env_args(args.env)
@@ -720,6 +728,7 @@ def validate_and_parse_env_args(env_args):
     options.build_jobs = args.build_jobs
     options.hip_arch = args.hip_arch
     options.flamegraph = args.flamegraph is not None
+    options.profiler_type = args.profiler_type

     # Initialize logger with command line arguments
     log.initialize(args.verbose, args.log_level)

devops/scripts/benchmarks/options.py

Lines changed: 1 addition & 0 deletions
@@ -74,6 +74,7 @@ class Options:
     exit_on_failure: bool = False
     flamegraph: bool = False
     unitrace: bool = False
+    profiler_type: str = "timer"

     # Options intended for CI:
