Skip to content

Conversation

@christiangnrd
Copy link
Member

Marked as not a draft so that benchmarks also run.

@github-actions
Copy link
Contributor

github-actions bot commented Oct 22, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic main) to apply these changes.

Click here to view the suggested changes.
diff --git a/src/MetalKernels.jl b/src/MetalKernels.jl
index 4e856194..7573c5e1 100644
--- a/src/MetalKernels.jl
+++ b/src/MetalKernels.jl
@@ -136,26 +136,26 @@ end
 
 KI.argconvert(::MetalBackend, arg) = mtlconvert(arg)
 
-function KI.kernel_function(::MetalBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::MetalBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = mtlfunction(f, tt; name, kwargs...)
-    KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
+    return KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
 end
 
-function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups=1, workgroupsize=1)
+function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups = 1, workgroupsize = 1)
     KI.check_launch_args(numworkgroups, workgroupsize)
 
-    obj.kern(args...; threads=workgroupsize, groups=numworkgroups)
+    return obj.kern(args...; threads = workgroupsize, groups = numworkgroups)
 end
 
 
-function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int=typemax(Int))::Int
-    Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
+function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int = typemax(Int))::Int
+    return Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
 end
 function KI.max_work_group_size(::MetalBackend)::Int
-    Int(device().maxThreadsPerThreadgroup.width)
+    return Int(device().maxThreadsPerThreadgroup.width)
 end
 function KI.multiprocessor_count(::MetalBackend)::Int
-    Metal.num_gpu_cores()
+    return Metal.num_gpu_cores()
 end
 
 
diff --git a/src/broadcast.jl b/src/broadcast.jl
index 72ced3ed..e90f5826 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -66,8 +66,8 @@ end
     if _broadcast_shapes[Is] > BROADCAST_SPECIALIZATION_THRESHOLD
         ## COV_EXCL_START
         function broadcast_cartesian_static(dest, bc, Is)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
              while 1 <= i <= length(dest)
                 I = @inbounds Is[i]
                 @inbounds dest[I] = bc[I]
@@ -91,8 +91,8 @@ end
        (isa(IndexStyle(dest), IndexLinear) && isa(IndexStyle(bc), IndexLinear))
         ## COV_EXCL_START
         function broadcast_linear(dest, bc)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
              while 1 <= i <= length(dest)
                  @inbounds dest[i] = bc[i]
                  i += stride
@@ -150,8 +150,8 @@ end
     else
         ## COV_EXCL_START
         function broadcast_cartesian(dest, bc)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
              while 1 <= i <= length(dest)
                 I = @inbounds CartesianIndices(dest)[i]
                 @inbounds dest[I] = bc[I]
diff --git a/src/device/random.jl b/src/device/random.jl
index 12b053a2..edc999cd 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -89,8 +89,8 @@ end
         @inbounds global_random_counters()[simdgroupId]
     elseif field === :ctr2
         globalId = KI.get_global_id().x +
-                   (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
-                   (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+            (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+            (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
         globalId % UInt32
     end::UInt32
 end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 7be5ef43..a737e8d0 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -224,7 +224,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
     # we might not be able to launch all those threads to reduce each slice in one go.
     # that's why each threads also loops across their inputs, processing multiple values
     # so that we can span the entire reduction dimension using a single item group.
-    kernel = KI.@kernel backend launch = false partial_mapreduce_device(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
+    kernel = KI.@kernel backend launch = false partial_mapreduce_device(
+        f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
                                                           Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A)
 
     # how many threads do we want?
@@ -260,7 +261,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
         # we can cover the dimensions to reduce using a single group
         kernel(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
                Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A;
-               workgroupsize = threads, numworkgroups = groups)
+            workgroupsize = threads, numworkgroups = groups
+        )
     else
         # temporary empty array whose type will match the final partial array
 	    partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1)))
@@ -287,7 +289,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
         partial_kernel(f, op, init, Val(threads), Val(Rreduce),
                         Val(Rother), Val(UInt64(length(Rother))),
                         Val(grain), Val(shuffle), partial, A;
-                        numworkgroups = partial_groups, workgroupsize = partial_threads)
+            numworkgroups = partial_groups, workgroupsize = partial_threads
+        )
 
         GPUArrays.mapreducedim!(identity, op, R, partial; init=init)
     end
diff --git a/test/kernelabstractions.jl b/test/kernelabstractions.jl
index 221ee680..6f9d5a2c 100644
--- a/test/kernelabstractions.jl
+++ b/test/kernelabstractions.jl
@@ -7,6 +7,6 @@ Testsuite.testsuite(()->MetalBackend(), "Metal", Metal, MtlArray, Metal.MtlDevic
     "Convert",           # depends on https://github.com/JuliaGPU/Metal.jl/issues/69
     "SpecialFunctions",  # no equivalent Metal intrinsics for gamma, erf, etc
     "sparse",            # not supported yet
-    "CPU synchronization",
-    "fallback test: callable types",
+            "CPU synchronization",
+            "fallback test: callable types",
 ]))
diff --git a/test/runtests.jl b/test/runtests.jl
index 6805205e..99ccadac 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
 @static if VERSION < v"1.11"
     using Pkg
-    Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+    Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
 end
 
 using Metal

@christiangnrd christiangnrd force-pushed the kaintr branch 3 times, most recently from 9ac3d49 to 6314372 Compare October 22, 2025 04:31
Copy link
Contributor

@github-actions github-actions bot left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Metal Benchmarks

Benchmark suite Current: ce67b4c Previous: 1232d45 Ratio
latency/precompile 29018226583 ns 24992031042 ns 1.16
latency/ttfp 2316989604.5 ns 2131427167 ns 1.09
latency/import 1389818917 ns 1225716666 ns 1.13
integration/metaldevrt 848875 ns 834458 ns 1.02
integration/byval/slices=1 1598042 ns 1547145.5 ns 1.03
integration/byval/slices=3 19072604 ns 8632292 ns 2.21
integration/byval/reference 1575125 ns 1525292 ns 1.03
integration/byval/slices=2 2686750 ns 2592229 ns 1.04
kernel/indexing 454500.5 ns 624250 ns 0.73
kernel/indexing_checked 507750 ns 613041 ns 0.83
kernel/launch 12666 ns 12000 ns 1.06
kernel/rand 532292 ns 560375 ns 0.95
array/construct 6375 ns 6375 ns 1
array/broadcast 529604 ns 596750 ns 0.89
array/random/randn/Float32 895500 ns 814458.5 ns 1.10
array/random/randn!/Float32 578416 ns 620208 ns 0.93
array/random/rand!/Int64 538333 ns 552375 ns 0.97
array/random/rand!/Float32 532812.5 ns 583750 ns 0.91
array/random/rand/Int64 911604.5 ns 756208.5 ns 1.21
array/random/rand/Float32 850145.5 ns 583604 ns 1.46
array/accumulate/Int64/1d 1374375 ns 1252709 ns 1.10
array/accumulate/Int64/dims=1 1963500 ns 1815125 ns 1.08
array/accumulate/Int64/dims=2 2340750 ns 2160708 ns 1.08
array/accumulate/Int64/dims=1L 12117709 ns 11656187 ns 1.04
array/accumulate/Int64/dims=2L 10014854 ns 9645833 ns 1.04
array/accumulate/Float32/1d 1114125 ns 1067958 ns 1.04
array/accumulate/Float32/dims=1 1718459 ns 1541250 ns 1.11
array/accumulate/Float32/dims=2 2159917 ns 1840146 ns 1.17
array/accumulate/Float32/dims=1L 10564334 ns 9832792 ns 1.07
array/accumulate/Float32/dims=2L 7796354 ns 7218458.5 ns 1.08
array/reductions/reduce/Int64/1d 1251833 ns 1536834 ns 0.81
array/reductions/reduce/Int64/dims=1 1114500 ns 1068916 ns 1.04
array/reductions/reduce/Int64/dims=2 1346250 ns 1114812.5 ns 1.21
array/reductions/reduce/Int64/dims=1L 2059959 ns 2015792 ns 1.02
array/reductions/reduce/Int64/dims=2L 4082500 ns 4220062.5 ns 0.97
array/reductions/reduce/Float32/1d 757125 ns 1050021 ns 0.72
array/reductions/reduce/Float32/dims=1 832104 ns 814312.5 ns 1.02
array/reductions/reduce/Float32/dims=2 861792 ns 840000 ns 1.03
array/reductions/reduce/Float32/dims=1L 1371458 ns 1300250 ns 1.05
array/reductions/reduce/Float32/dims=2L 1883979.5 ns 1785812.5 ns 1.05
array/reductions/mapreduce/Int64/1d 1241917 ns 1544645.5 ns 0.80
array/reductions/mapreduce/Int64/dims=1 1102729 ns 1075416 ns 1.03
array/reductions/mapreduce/Int64/dims=2 1347375 ns 1124750 ns 1.20
array/reductions/mapreduce/Int64/dims=1L 1949020.5 ns 2014958 ns 0.97
array/reductions/mapreduce/Int64/dims=2L 3882666 ns 3606874.5 ns 1.08
array/reductions/mapreduce/Float32/1d 795187 ns 1030000 ns 0.77
array/reductions/mapreduce/Float32/dims=1 813042 ns 804459 ns 1.01
array/reductions/mapreduce/Float32/dims=2 875063 ns 816145.5 ns 1.07
array/reductions/mapreduce/Float32/dims=1L 1374708 ns 1307375 ns 1.05
array/reductions/mapreduce/Float32/dims=2L 1882291 ns 1894042 ns 0.99
array/private/copyto!/gpu_to_gpu 535604.5 ns 641459 ns 0.83
array/private/copyto!/cpu_to_gpu 742062 ns 784042 ns 0.95
array/private/copyto!/gpu_to_cpu 728375 ns 798458 ns 0.91
array/private/iteration/findall/int 1649042 ns 1556333 ns 1.06
array/private/iteration/findall/bool 1563770.5 ns 1422437.5 ns 1.10
array/private/iteration/findfirst/int 2163500 ns 2047375 ns 1.06
array/private/iteration/findfirst/bool 2084958 ns 2034375 ns 1.02
array/private/iteration/scalar 2989917 ns 3970666 ns 0.75
array/private/iteration/logical 2755750 ns 2580125 ns 1.07
array/private/iteration/findmin/1d 2333333 ns 2224833 ns 1.05
array/private/iteration/findmin/2d 1591875 ns 1504083 ns 1.06
array/private/copy 806438 ns 579459 ns 1.39
array/shared/copyto!/gpu_to_gpu 85209 ns 85625 ns 1.00
array/shared/copyto!/cpu_to_gpu 82291.5 ns 82000 ns 1.00
array/shared/copyto!/gpu_to_cpu 83167 ns 84417 ns 0.99
array/shared/iteration/findall/int 1666417 ns 1572333 ns 1.06
array/shared/iteration/findall/bool 1568541.5 ns 1439166.5 ns 1.09
array/shared/iteration/findfirst/int 1765645.5 ns 1659750 ns 1.06
array/shared/iteration/findfirst/bool 1710375 ns 1615833 ns 1.06
array/shared/iteration/scalar 204625 ns 206709 ns 0.99
array/shared/iteration/logical 2561208 ns 2254958 ns 1.14
array/shared/iteration/findmin/1d 1959417 ns 1802791 ns 1.09
array/shared/iteration/findmin/2d 1586271 ns 1506104.5 ns 1.05
array/shared/copy 210042 ns 251209 ns 0.84
array/permutedims/4d 3132333 ns 2351833 ns 1.33
array/permutedims/2d 1176417 ns 1144750.5 ns 1.03
array/permutedims/3d 1820208 ns 1650917 ns 1.10
metal/synchronization/stream 19334 ns 19209 ns 1.01
metal/synchronization/context 19750 ns 20125 ns 0.98

This comment was automatically generated by workflow using github-action-benchmark.

@christiangnrd christiangnrd force-pushed the kaintr branch 3 times, most recently from 22e754e to 68db9c2 Compare October 22, 2025 13:38
@christiangnrd christiangnrd force-pushed the kaintr branch 2 times, most recently from 2b8dce1 to 0e76668 Compare November 2, 2025 21:16
@christiangnrd christiangnrd force-pushed the kaintr branch 9 times, most recently from db9a7dc to c802ccc Compare November 6, 2025 22:27
@christiangnrd christiangnrd force-pushed the kaintr branch 3 times, most recently from 6405cd5 to 5fd2378 Compare November 13, 2025 15:22
skip scripts tests on 1.10

Project.toml
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants