Commit db861ba

[Benchmarks] Measure cpu instructions optionally
It makes sense to measure either elapsed time or CPU instructions retired, but not both. This change adds an argument to the benchmark scripts so that Compute Benchmarks scenarios run with only one of the two profilers, which significantly lowers the number of benchmark scenarios. As a result the tests take less time to complete, and the user can choose to produce either time or CPU instruction count results, where applicable.
1 parent dfcd9c5 commit db861ba
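
To illustrate the mechanism described above: the scripts gain an options.profiler_type field (set from the new --profiler-type argument) and each affected benchmark checks it in its enabled() hook. The sketch below is a simplified, hypothetical stand-in (the class name QueueMemcpyLike and the stripped-down Options are illustrative, not the real classes in devops/scripts/benchmarks), but it shows the selection logic the diff adds.

from dataclasses import dataclass
from enum import Enum


class PROFILERS(Enum):
    TIMER = "timer"
    CPU_COUNTER = "cpuCounter"


@dataclass
class Options:
    # Filled in from the new --profiler-type command-line argument.
    profiler_type: str = "timer"


options = Options()


class QueueMemcpyLike:
    # Hypothetical stand-in for a benchmark that supports both profilers.
    def __init__(self, profiler_type: PROFILERS):
        self.profiler_type = profiler_type

    def enabled(self) -> bool:
        # Only the variant matching the requested profiler runs, so each
        # scenario is executed once per invocation instead of twice.
        return options.profiler_type == self.profiler_type.value


Only one of QueueMemcpyLike(PROFILERS.TIMER) and QueueMemcpyLike(PROFILERS.CPU_COUNTER) is enabled in a given run, which is what reduces the number of Compute Benchmarks scenarios executed.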

8 files changed (+116, -32 lines)

.github/workflows/sycl-linux-precommit.yml

Lines changed: 1 addition & 0 deletions
@@ -241,6 +241,7 @@ jobs:
       benchmark_upload_results: false
       benchmark_preset: 'Minimal'
       benchmark_dry_run: true
+      benchmark_profiler_type: 'cpuCounter'
       repo_ref: ${{ github.sha }}
       toolchain_artifact: ${{ needs.build.outputs.toolchain_artifact }}
       toolchain_artifact_filename: ${{ needs.build.outputs.toolchain_artifact_filename }}

.github/workflows/sycl-linux-run-tests.yml

Lines changed: 8 additions & 0 deletions
@@ -140,6 +140,13 @@ on:
         type: string
         default: 'false'
         required: False
+      benchmark_profiler_type:
+        description: |
+          Type of profiler to use for benchmarks. Options are "timer" and
+          "cpuCounter". Default is "cpuCounter".
+        type: string
+        default: 'cpuCounter'
+        required: False

   workflow_dispatch:
     inputs:
@@ -359,6 +366,7 @@ jobs:
         preset: ${{ inputs.benchmark_preset }}
         dry_run: ${{ inputs.benchmark_dry_run }}
         build_ref: ${{ inputs.repo_ref }}
+        profiler_type: ${{ inputs.benchmark_profiler_type }}
       env:
         RUNNER_TAG: ${{ inputs.runner }}
         GITHUB_TOKEN: ${{ secrets.LLVM_SYCL_BENCHMARK_TOKEN }}

.github/workflows/sycl-nightly-benchmarking.yml

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ jobs:
       benchmark_upload_results: true
       benchmark_save_name: ${{ matrix.save_name }}
       benchmark_preset: ${{ matrix.preset }}
+      benchmark_profiler_type: cpuCounter
       repo_ref: ${{ matrix.ref }}
       toolchain_artifact: ${{ needs.ubuntu2204_build.outputs.toolchain_artifact }}
       toolchain_artifact_filename: ${{ needs.ubuntu2204_build.outputs.toolchain_artifact_filename }}

.github/workflows/sycl-ur-perf-benchmarking.yml

Lines changed: 10 additions & 0 deletions
@@ -61,6 +61,15 @@ on:
           - Normal
           - Test
         default: 'Minimal' # Only compute-benchmarks
+      benchmark_profiler_type:
+        description: |
+          Type of profiler to use for benchmarks. Options are "timer" and
+          "cpuCounter". Default is "cpuCounter".
+        type: choice
+        options:
+          - timer
+          - cpuCounter
+        default: 'cpuCounter'
       pr_no:
         type: string
         description: |
@@ -192,6 +201,7 @@ jobs:
       benchmark_upload_results: ${{ inputs.upload_results }}
       benchmark_save_name: ${{ needs.sanitize_inputs.outputs.benchmark_save_name }}
       benchmark_preset: ${{ inputs.preset }}
+      benchmark_profiler_type: ${{ inputs.benchmark_profiler_type }}
       repo_ref: ${{ needs.sanitize_inputs.outputs.build_ref }}
       toolchain_artifact: ${{ needs.build_sycl.outputs.toolchain_artifact }}
       toolchain_artifact_filename: ${{ needs.build_sycl.outputs.toolchain_artifact_filename }}

devops/actions/run-tests/benchmark/action.yml

Lines changed: 18 additions & 1 deletion
@@ -32,6 +32,10 @@ inputs:
   dry_run:
     type: string
     required: False
+  profiler_type:
+    type: string
+    required: False
+    default: "cpuCounter" # Other option is "timer"

 runs:
   using: "composite"
@@ -41,6 +45,7 @@ runs:
       env:
         TARGET_DEVICE: ${{ inputs.target_devices }}
         PRESET: ${{ inputs.preset }}
+        PROFILER_TYPE: ${{ inputs.profiler_type }}
       run: |
         case "$RUNNER_TAG" in
           '["PVC_PERF"]' ) ;;
@@ -75,6 +80,17 @@ runs:
         python3 ./devops/scripts/benchmarks/presets.py query "$PRESET"
         [ "$?" -ne 0 ] && exit 1 # Stop workflow if invalid preset
         echo "PRESET=$PRESET" >> $GITHUB_ENV
+
+        # Validate profiler type input
+        case "$PROFILER_TYPE" in
+          "timer") PROFILER_TYPE="timer" ;;
+          "cpuCounter") PROFILER_TYPE="cpuCounter" ;;
+          *)
+            echo "Invalid profiler type specified: $PROFILER_TYPE"
+            exit 1
+            ;;
+        esac
+        echo "PROFILER_TYPE=$PROFILER_TYPE" >> $GITHUB_ENV
     - name: Compute CPU core range to run benchmarks on
       shell: bash
       run: |
@@ -203,7 +219,8 @@ runs:
           --output-dir "./llvm-ci-perf-results/" \
           --preset "$PRESET" \
           --timestamp-override "$SAVE_TIMESTAMP" \
-          --detect-version sycl,compute_runtime
+          --detect-version sycl,compute_runtime \
+          --profiler-type "$PROFILER_TYPE"

         echo "-----"
         python3 ./devops/scripts/benchmarks/compare.py to_hist \

devops/scripts/benchmarks/benches/compute.py

Lines changed: 68 additions & 31 deletions
@@ -370,6 +370,13 @@ def benchmark_bin(self) -> Path:
         """Returns the path to the benchmark binary"""
         return self.bench.project.build_dir / "bin" / self.bench_name

+    def cpu_count_str(self, separator: str = " ") -> str:
+        return (
+            f"{separator}CPU count"
+            if self.profiler_type == PROFILERS.CPU_COUNTER
+            else ""
+        )
+
     def get_iters(self, run_trace: TracingType):
         """Returns the number of iterations to run for the given tracing type."""
         return (
@@ -539,11 +546,16 @@ def supported_runtimes(self) -> list[RUNTIMES]:
         return super().supported_runtimes() + [RUNTIMES.SYCL_PREVIEW]

     def enabled(self) -> bool:
-        # This is a workaround for the BMG server where we have old results for self.KernelExecTime=20
-        # The benchmark instance gets created just to make metadata for these old results
         if not super().enabled():
             return False

+        if (
+            self.runtime in (RUNTIMES.SYCL, RUNTIMES.UR)
+        ) and options.profiler_type != self.profiler_type.value:
+            return False
+
+        # This is a workaround for the BMG server where we have old results for self.KernelExecTime=20
+        # The benchmark instance gets created just to make metadata for these old results
         device_arch = getattr(options, "device_architecture", "")
         if "bmg" in device_arch and self.KernelExecTime == 20:
             # Disable this benchmark for BMG server, just create metadata
@@ -568,7 +580,7 @@ def name(self):
             f" KernelExecTime={self.KernelExecTime}" if self.KernelExecTime != 1 else ""
         )

-        return f"api_overhead_benchmark_{self.runtime.value} SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}"
+        return f"api_overhead_benchmark_{self.runtime.value} SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}{self.cpu_count_str()}"

     def display_name(self) -> str:
         order = "in order" if self.ioq else "out of order"
@@ -580,7 +592,7 @@ def display_name(self) -> str:
         if self.KernelExecTime != 1:
             info.append(f"KernelExecTime={self.KernelExecTime}")
         additional_info = f" {' '.join(info)}" if info else ""
-        return f"{self.runtime.value.upper()} SubmitKernel {order}{additional_info}, NumKernels {self.NumKernels}"
+        return f"{self.runtime.value.upper()} SubmitKernel {order}{additional_info}, NumKernels {self.NumKernels}{self.cpu_count_str(', ')}"

     def explicit_group(self):
         order = "in order" if self.ioq else "out of order"
@@ -589,7 +601,7 @@ def explicit_group(self):

         kernel_exec_time_str = f" long kernel" if self.KernelExecTime != 1 else ""

-        return f"SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}"
+        return f"SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}{self.cpu_count_str(', ')}"

     def description(self) -> str:
         order = "in-order" if self.ioq else "out-of-order"
@@ -607,18 +619,16 @@ def range(self) -> tuple[float, float]:

     def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
         iters = self.get_iters(run_trace)
-        bin_args = [
+        return [
             f"--iterations={iters}",
             f"--Ioq={self.ioq}",
             f"--MeasureCompletion={self.MeasureCompletion}",
             "--Profiling=0",
             f"--NumKernels={self.NumKernels}",
             f"--KernelExecTime={self.KernelExecTime}",
             f"--UseEvents={self.UseEvents}",
+            f"--profilerType={self.profiler_type.value}",
         ]
-        if self.runtime == RUNTIMES.SYCL or self.runtime == RUNTIMES.UR:
-            bin_args.append(f"--profilerType={self.profiler_type.value}")
-        return bin_args

     def get_metadata(self) -> dict[str, BenchmarkMetadata]:
         metadata_dict = super().get_metadata()
@@ -656,13 +666,18 @@ def __init__(
             profiler_type=profiler_type,
         )

+    def enabled(self) -> bool:
+        if options.profiler_type != self.profiler_type.value:
+            return False
+        return super().enabled()
+
     def name(self):
         order = "in order" if self.ioq else "out of order"
-        return f"api_overhead_benchmark_sycl ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}"
+        return f"api_overhead_benchmark_sycl ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str()}"

     def display_name(self) -> str:
         order = "in order" if self.ioq else "out of order"
-        return f"SYCL ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}"
+        return f"SYCL ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(', ')}"

     def description(self) -> str:
         order = "in-order" if self.ioq else "out-of-order"
@@ -706,11 +721,16 @@ def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
             profiler_type=profiler_type,
         )

+    def enabled(self) -> bool:
+        if options.profiler_type != self.profiler_type.value:
+            return False
+        return super().enabled()
+
     def name(self):
-        return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"
+        return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str()}"

     def display_name(self) -> str:
-        return f"SYCL QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"
+        return f"SYCL QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(', ')}"

     def description(self) -> str:
         operation = "copy-only" if self.isCopyOnly else "copy and command submission"
@@ -748,11 +768,16 @@ def __init__(self, bench, source, destination, size, profiler_type):
             bench, "memory_benchmark_sycl", "QueueMemcpy", profiler_type=profiler_type
         )

+    def enabled(self) -> bool:
+        if options.profiler_type != self.profiler_type.value:
+            return False
+        return super().enabled()
+
     def name(self):
-        return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}"
+        return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str()}"

     def display_name(self) -> str:
-        return f"SYCL QueueMemcpy from {self.source} to {self.destination}, size {self.size}"
+        return f"SYCL QueueMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(', ')}"

     def description(self) -> str:
         return (
@@ -1038,8 +1063,16 @@ def __init__(
     def supported_runtimes(self) -> list[RUNTIMES]:
         return super().supported_runtimes() + [RUNTIMES.SYCL_PREVIEW]

+    def enabled(self) -> bool:
+        if (
+            self.runtime == RUNTIMES.SYCL
+            and options.profiler_type != self.profiler_type.value
+        ):
+            return False
+        return super().enabled()
+
     def explicit_group(self):
-        return f"SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels"
+        return f"SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels{self.cpu_count_str(', ')}"

     def description(self) -> str:
         return (
@@ -1048,10 +1081,10 @@ def description(self) -> str:
         )

     def name(self):
-        return f"graph_api_benchmark_{self.runtime.value} SubmitGraph{self.use_events_str}{self.host_tasks_str} numKernels:{self.numKernels} ioq {self.inOrderQueue} measureCompletion {self.measureCompletionTime}"
+        return f"graph_api_benchmark_{self.runtime.value} SubmitGraph{self.use_events_str}{self.host_tasks_str} numKernels:{self.numKernels} ioq {self.inOrderQueue} measureCompletion {self.measureCompletionTime}{self.cpu_count_str()}"

     def display_name(self) -> str:
-        return f"{self.runtime.value.upper()} SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels"
+        return f"{self.runtime.value.upper()} SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels{self.cpu_count_str(', ')}"

     def get_tags(self):
         return [
@@ -1064,7 +1097,7 @@ def get_tags(self):

     def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
         iters = self.get_iters(run_trace)
-        bin_args = [
+        return [
             f"--iterations={iters}",
             f"--NumKernels={self.numKernels}",
             f"--MeasureCompletionTime={self.measureCompletionTime}",
@@ -1074,10 +1107,8 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--UseEvents={self.useEvents}",
             "--UseExplicit=0",
             f"--UseHostTasks={self.useHostTasks}",
+            f"--profilerType={self.profiler_type.value}",
         ]
-        if self.runtime == RUNTIMES.SYCL:
-            bin_args.append(f"--profilerType={self.profiler_type.value}")
-        return bin_args

     def get_metadata(self) -> dict[str, BenchmarkMetadata]:
         metadata_dict = super().get_metadata()
@@ -1116,33 +1147,39 @@ def __init__(
     def supported_runtimes(self) -> list[RUNTIMES]:
         return [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]

+    def enabled(self) -> bool:
+        if (
+            self.runtime == RUNTIMES.SYCL
+            and options.profiler_type != self.profiler_type.value
+        ):
+            return False
+        return super().enabled()
+
     def explicit_group(self):
-        return f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}"
+        return (
+            f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}{self.cpu_count_str(', ')}"
+        )

     def description(self) -> str:
         return ""

     def name(self):
-        return f"ulls_benchmark_{self.runtime.value} EmptyKernel wgc:{self.wgc}, wgs:{self.wgs}"
+        return f"ulls_benchmark_{self.runtime.value} EmptyKernel wgc:{self.wgc}, wgs:{self.wgs}{self.cpu_count_str()}"

     def display_name(self) -> str:
-        return (
-            f"{self.runtime.value.upper()} EmptyKernel, wgc {self.wgc}, wgs {self.wgs}"
-        )
+        return f"{self.runtime.value.upper()} EmptyKernel, wgc {self.wgc}, wgs {self.wgs}{self.cpu_count_str(', ')}"

     def get_tags(self):
         return [runtime_to_tag_name(self.runtime), "micro", "latency", "submit"]

     def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
         iters = self.get_iters(run_trace)
-        bin_args = [
+        return [
             f"--iterations={iters}",
             f"--wgs={self.wgs}",
             f"--wgc={self.wgc}",
+            f"--profilerType={self.profiler_type.value}",
         ]
-        if self.runtime == RUNTIMES.SYCL:
-            bin_args.append(f"--profilerType={self.profiler_type.value}")
-        return bin_args


 class UllsKernelSwitch(ComputeBenchmark):

devops/scripts/benchmarks/main.py

Lines changed: 9 additions & 0 deletions
@@ -689,6 +689,14 @@ def validate_and_parse_env_args(env_args):
         help="Set the logging level",
         default="info",
     )
+    parser.add_argument(
+        "--profiler-type",
+        type=str,
+        choices=["timer", "cpuCounter"],
+        help="Set the profiler type for benchmarks. 'timer' measures execution time, "
+        "'cpuCounter' measures CPU instruction count for supported benchmarks.",
+        default="timer",
+    )

     args = parser.parse_args()
     additional_env_vars = validate_and_parse_env_args(args.env)
@@ -720,6 +728,7 @@ def validate_and_parse_env_args(env_args):
     options.build_jobs = args.build_jobs
     options.hip_arch = args.hip_arch
     options.flamegraph = args.flamegraph is not None
+    options.profiler_type = args.profiler_type

     # Initialize logger with command line arguments
     log.initialize(args.verbose, args.log_level)

devops/scripts/benchmarks/options.py

Lines changed: 1 addition & 0 deletions
@@ -74,6 +74,7 @@ class Options:
     exit_on_failure: bool = False
     flamegraph: bool = False
     unitrace: bool = False
+    profiler_type: str = "timer"

     # Options intended for CI:
