Skip to content

Conversation

@christiangnrd
Copy link
Member

Marked as not a draft so that benchmarks also run.

@github-actions
Copy link
Contributor

github-actions bot commented Oct 22, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic main) to apply these changes.

Click here to view the suggested changes.
diff --git a/src/MetalKernels.jl b/src/MetalKernels.jl
index 4e856194..7573c5e1 100644
--- a/src/MetalKernels.jl
+++ b/src/MetalKernels.jl
@@ -136,26 +136,26 @@ end
 
 KI.argconvert(::MetalBackend, arg) = mtlconvert(arg)
 
-function KI.kernel_function(::MetalBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::MetalBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = mtlfunction(f, tt; name, kwargs...)
-    KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
+    return KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
 end
 
-function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups=1, workgroupsize=1)
+function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups = 1, workgroupsize = 1)
     KI.check_launch_args(numworkgroups, workgroupsize)
 
-    obj.kern(args...; threads=workgroupsize, groups=numworkgroups)
+    return obj.kern(args...; threads = workgroupsize, groups = numworkgroups)
 end
 
 
-function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int=typemax(Int))::Int
-    Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
+function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int = typemax(Int))::Int
+    return Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
 end
 function KI.max_work_group_size(::MetalBackend)::Int
-    Int(device().maxThreadsPerThreadgroup.width)
+    return Int(device().maxThreadsPerThreadgroup.width)
 end
 function KI.multiprocessor_count(::MetalBackend)::Int
-    Metal.num_gpu_cores()
+    return Metal.num_gpu_cores()
 end
 
 
diff --git a/src/broadcast.jl b/src/broadcast.jl
index 72ced3ed..e90f5826 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -66,8 +66,8 @@ end
     if _broadcast_shapes[Is] > BROADCAST_SPECIALIZATION_THRESHOLD
         ## COV_EXCL_START
         function broadcast_cartesian_static(dest, bc, Is)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
              while 1 <= i <= length(dest)
                 I = @inbounds Is[i]
                 @inbounds dest[I] = bc[I]
@@ -91,8 +91,8 @@ end
        (isa(IndexStyle(dest), IndexLinear) && isa(IndexStyle(bc), IndexLinear))
         ## COV_EXCL_START
         function broadcast_linear(dest, bc)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
              while 1 <= i <= length(dest)
                  @inbounds dest[i] = bc[i]
                  i += stride
@@ -150,8 +150,8 @@ end
     else
         ## COV_EXCL_START
         function broadcast_cartesian(dest, bc)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
              while 1 <= i <= length(dest)
                 I = @inbounds CartesianIndices(dest)[i]
                 @inbounds dest[I] = bc[I]
diff --git a/src/device/random.jl b/src/device/random.jl
index 12b053a2..edc999cd 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -89,8 +89,8 @@ end
         @inbounds global_random_counters()[simdgroupId]
     elseif field === :ctr2
         globalId = KI.get_global_id().x +
-                   (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
-                   (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+            (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+            (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
         globalId % UInt32
     end::UInt32
 end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 7be5ef43..a737e8d0 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -224,7 +224,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
     # we might not be able to launch all those threads to reduce each slice in one go.
     # that's why each threads also loops across their inputs, processing multiple values
     # so that we can span the entire reduction dimension using a single item group.
-    kernel = KI.@kernel backend launch = false partial_mapreduce_device(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
+    kernel = KI.@kernel backend launch = false partial_mapreduce_device(
+        f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
                                                           Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A)
 
     # how many threads do we want?
@@ -260,7 +261,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
         # we can cover the dimensions to reduce using a single group
         kernel(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
                Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A;
-               workgroupsize = threads, numworkgroups = groups)
+            workgroupsize = threads, numworkgroups = groups
+        )
     else
         # temporary empty array whose type will match the final partial array
 	    partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1)))
@@ -287,7 +289,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
         partial_kernel(f, op, init, Val(threads), Val(Rreduce),
                         Val(Rother), Val(UInt64(length(Rother))),
                         Val(grain), Val(shuffle), partial, A;
-                        numworkgroups = partial_groups, workgroupsize = partial_threads)
+            numworkgroups = partial_groups, workgroupsize = partial_threads
+        )
 
         GPUArrays.mapreducedim!(identity, op, R, partial; init=init)
     end
diff --git a/test/kernelabstractions.jl b/test/kernelabstractions.jl
index 221ee680..6f9d5a2c 100644
--- a/test/kernelabstractions.jl
+++ b/test/kernelabstractions.jl
@@ -7,6 +7,6 @@ Testsuite.testsuite(()->MetalBackend(), "Metal", Metal, MtlArray, Metal.MtlDevic
     "Convert",           # depends on https://github.com/JuliaGPU/Metal.jl/issues/69
     "SpecialFunctions",  # no equivalent Metal intrinsics for gamma, erf, etc
     "sparse",            # not supported yet
-    "CPU synchronization",
-    "fallback test: callable types",
+            "CPU synchronization",
+            "fallback test: callable types",
 ]))
diff --git a/test/runtests.jl b/test/runtests.jl
index 6805205e..99ccadac 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
 @static if VERSION < v"1.11"
     using Pkg
-    Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+    Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
 end
 
 using Metal

@christiangnrd christiangnrd force-pushed the kaintr branch 3 times, most recently from 9ac3d49 to 6314372 Compare October 22, 2025 04:31
Copy link
Contributor

@github-actions github-actions bot left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Metal Benchmarks

Benchmark suite Current: ce67b4c Previous: 1232d45 Ratio
latency/precompile 29018226583 ns 24992031042 ns 1.16
latency/ttfp 2316989604.5 ns 2131427167 ns 1.09
latency/import 1389818917 ns 1225716666 ns 1.13
integration/metaldevrt 848875 ns 834458 ns 1.02
integration/byval/slices=1 1598042 ns 1547145.5 ns 1.03
integration/byval/slices=3 19072604 ns 8632292 ns 2.21
integration/byval/reference 1575125 ns 1525292 ns 1.03
integration/byval/slices=2 2686750 ns 2592229 ns 1.04
kernel/indexing 454500.5 ns 624250 ns 0.73
kernel/indexing_checked 507750 ns 613041 ns 0.83
kernel/launch 12666 ns 12000 ns 1.06
kernel/rand 532292 ns 560375 ns 0.95
array/construct 6375 ns 6375 ns 1
array/broadcast 529604 ns 596750 ns 0.89
array/random/randn/Float32 895500 ns 814458.5 ns 1.10
array/random/randn!/Float32 578416 ns 620208 ns 0.93
array/random/rand!/Int64 538333 ns 552375 ns 0.97
array/random/rand!/Float32 532812.5 ns 583750 ns 0.91
array/random/rand/Int64 911604.5 ns 756208.5 ns 1.21
array/random/rand/Float32 850145.5 ns 583604 ns 1.46
array/accumulate/Int64/1d 1374375 ns 1252709 ns 1.10
array/accumulate/Int64/dims=1 1963500 ns 1815125 ns 1.08
array/accumulate/Int64/dims=2 2340750 ns 2160708 ns 1.08
array/accumulate/Int64/dims=1L 12117709 ns 11656187 ns 1.04
array/accumulate/Int64/dims=2L 10014854 ns 9645833 ns 1.04
array/accumulate/Float32/1d 1114125 ns 1067958 ns 1.04
array/accumulate/Float32/dims=1 1718459 ns 1541250 ns 1.11
array/accumulate/Float32/dims=2 2159917 ns 1840146 ns 1.17
array/accumulate/Float32/dims=1L 10564334 ns 9832792 ns 1.07
array/accumulate/Float32/dims=2L 7796354 ns 7218458.5 ns 1.08
array/reductions/reduce/Int64/1d 1251833 ns 1536834 ns 0.81
array/reductions/reduce/Int64/dims=1 1114500 ns 1068916 ns 1.04
array/reductions/reduce/Int64/dims=2 1346250 ns 1114812.5 ns 1.21
array/reductions/reduce/Int64/dims=1L 2059959 ns 2015792 ns 1.02
array/reductions/reduce/Int64/dims=2L 4082500 ns 4220062.5 ns 0.97
array/reductions/reduce/Float32/1d 757125 ns 1050021 ns 0.72
array/reductions/reduce/Float32/dims=1 832104 ns 814312.5 ns 1.02
array/reductions/reduce/Float32/dims=2 861792 ns 840000 ns 1.03
array/reductions/reduce/Float32/dims=1L 1371458 ns 1300250 ns 1.05
array/reductions/reduce/Float32/dims=2L 1883979.5 ns 1785812.5 ns 1.05
array/reductions/mapreduce/Int64/1d 1241917 ns 1544645.5 ns 0.80
array/reductions/mapreduce/Int64/dims=1 1102729 ns 1075416 ns 1.03
array/reductions/mapreduce/Int64/dims=2 1347375 ns 1124750 ns 1.20
array/reductions/mapreduce/Int64/dims=1L 1949020.5 ns 2014958 ns 0.97
array/reductions/mapreduce/Int64/dims=2L 3882666 ns 3606874.5 ns 1.08
array/reductions/mapreduce/Float32/1d 795187 ns 1030000 ns 0.77
array/reductions/mapreduce/Float32/dims=1 813042 ns 804459 ns 1.01
array/reductions/mapreduce/Float32/dims=2 875063 ns 816145.5 ns 1.07
array/reductions/mapreduce/Float32/dims=1L 1374708 ns 1307375 ns 1.05
array/reductions/mapreduce/Float32/dims=2L 1882291 ns 1894042 ns 0.99
array/private/copyto!/gpu_to_gpu 535604.5 ns 641459 ns 0.83
array/private/copyto!/cpu_to_gpu 742062 ns 784042 ns 0.95
array/private/copyto!/gpu_to_cpu 728375 ns 798458 ns 0.91
array/private/iteration/findall/int 1649042 ns 1556333 ns 1.06
array/private/iteration/findall/bool 1563770.5 ns 1422437.5 ns 1.10
array/private/iteration/findfirst/int 2163500 ns 2047375 ns 1.06
array/private/iteration/findfirst/bool 2084958 ns 2034375 ns 1.02
array/private/iteration/scalar 2989917 ns 3970666 ns 0.75
array/private/iteration/logical 2755750 ns 2580125 ns 1.07
array/private/iteration/findmin/1d 2333333 ns 2224833 ns 1.05
array/private/iteration/findmin/2d 1591875 ns 1504083 ns 1.06
array/private/copy 806438 ns 579459 ns 1.39
array/shared/copyto!/gpu_to_gpu 85209 ns 85625 ns 1.00
array/shared/copyto!/cpu_to_gpu 82291.5 ns 82000 ns 1.00
array/shared/copyto!/gpu_to_cpu 83167 ns 84417 ns 0.99
array/shared/iteration/findall/int 1666417 ns 1572333 ns 1.06
array/shared/iteration/findall/bool 1568541.5 ns 1439166.5 ns 1.09
array/shared/iteration/findfirst/int 1765645.5 ns 1659750 ns 1.06
array/shared/iteration/findfirst/bool 1710375 ns 1615833 ns 1.06
array/shared/iteration/scalar 204625 ns 206709 ns 0.99
array/shared/iteration/logical 2561208 ns 2254958 ns 1.14
array/shared/iteration/findmin/1d 1959417 ns 1802791 ns 1.09
array/shared/iteration/findmin/2d 1586271 ns 1506104.5 ns 1.05
array/shared/copy 210042 ns 251209 ns 0.84
array/permutedims/4d 3132333 ns 2351833 ns 1.33
array/permutedims/2d 1176417 ns 1144750.5 ns 1.03
array/permutedims/3d 1820208 ns 1650917 ns 1.10
metal/synchronization/stream 19334 ns 19209 ns 1.01
metal/synchronization/context 19750 ns 20125 ns 0.98

This comment was automatically generated by workflow using github-action-benchmark.

@christiangnrd christiangnrd force-pushed the kaintr branch 3 times, most recently from 22e754e to 68db9c2 Compare October 22, 2025 13:38
@christiangnrd christiangnrd force-pushed the kaintr branch 2 times, most recently from 2b8dce1 to 0e76668 Compare November 2, 2025 21:16
@christiangnrd christiangnrd force-pushed the kaintr branch 9 times, most recently from db9a7dc to c802ccc Compare November 6, 2025 22:27
@christiangnrd christiangnrd force-pushed the kaintr branch 3 times, most recently from 6405cd5 to 5fd2378 Compare November 13, 2025 15:22
skip scripts tests on 1.10

Project.toml
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants