@inline function reduce_group(op, val::T, neutral, shuffle::Val{true}, ::Val{maxthreads}) where {T, maxthreads}
    # shared mem for partial sums
    assume(threads_per_simdgroup() == 32)
-   shared = MtlThreadGroupArray(T, 32)
+   shared = KI.localmemory(T, 32)

    wid = simdgroup_index_in_threadgroup()
    lane = thread_index_in_simdgroup()
@@ -34,10 +34,10 @@
    end

    # wait for all partial reductions
-   threadgroup_barrier(MemoryFlagThreadGroup)
+   KI.barrier()

    # read from shared memory only if that warp existed
-   val = if thread_index_in_threadgroup() <= fld1(threads_per_threadgroup().x, 32)
+   val = if KI.get_local_id().x <= fld1(KI.get_local_size().x, 32)
        @inbounds shared[lane]
    else
        neutral
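The guard above only reads a partial result for the first `fld1(local size, 32)` items, i.e. one shared-memory slot per simdgroup that actually ran. A host-side check of that arithmetic in plain Julia (no GPU required; the thread counts are illustrative):

# fld1 is 1-based flooring division, so for n >= 1 it equals cld(n, 32):
# the number of 32-wide simdgroups needed to cover n items.
for threads in (1, 32, 33, 100, 256)
    nsimdgroups = fld1(threads, 32)
    @assert nsimdgroups == cld(threads, 32)
    println(threads, " threads -> ", nsimdgroups, " partial value(s) in shared memory")
end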
@@ -52,17 +52,17 @@

# Reduce a value across a group, using local memory for communication
@inline function reduce_group(op, val::T, neutral, shuffle::Val{false}, ::Val{maxthreads}) where {T, maxthreads}
-   threads = threads_per_threadgroup().x
-   thread = thread_position_in_threadgroup().x
+   threads = KI.get_local_size().x
+   thread = KI.get_local_id().x

    # local mem for a complete reduction
-   shared = MtlThreadGroupArray(T, (maxthreads,))
+   shared = KI.localmemory(T, (maxthreads,))
    @inbounds shared[thread] = val

    # perform a reduction
    d = 1
    while d < threads
-       threadgroup_barrier(MemoryFlagThreadGroup)
+       KI.barrier()
        index = 2 * d * (thread-1) + 1
        @inbounds if index <= threads
            other_val = if index + d <= threads
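The `while d < threads` loop is a stride-doubling tree reduction in local memory. A serial, host-side model of the same indexing; the kernel's continuation is outside this hunk, so the write-back line is an assumption based on the visible pattern, and `tree_reduce!` with all its values is purely illustrative:

# Host-side model of the stride-doubling reduction: `shared` stands in for the
# local-memory buffer and every "thread" is just a loop index here.
function tree_reduce!(op, neutral, shared::Vector)
    threads = length(shared)
    d = 1
    while d < threads
        # KI.barrier() separates passes in the kernel; a serial loop needs no synchronization
        for thread in 1:threads
            index = 2 * d * (thread - 1) + 1
            if index <= threads
                other_val = index + d <= threads ? shared[index + d] : neutral
                shared[index] = op(shared[index], other_val)
            end
        end
        d *= 2
    end
    return shared[1]
end

@assert tree_reduce!(+, 0.0, collect(1.0:7.0)) == 28.0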
@@ -94,9 +94,9 @@ function partial_mapreduce_device(f, op, neutral, maxthreads, ::Val{Rreduce},
                                  ::Val{Rother}, ::Val{Rlen}, ::Val{grain}, shuffle, R, As...) where {Rreduce, Rother, Rlen, grain}
    # decompose the 1D hardware indices into separate ones for reduction (across items
    # and possibly groups if it doesn't fit) and other elements (remaining groups)
-   localIdx_reduce = thread_position_in_threadgroup().x
-   localDim_reduce = threads_per_threadgroup().x * grain
-   groupIdx_reduce, groupIdx_other = fldmod1(threadgroup_position_in_grid().x, Rlen)
+   localIdx_reduce = KI.get_local_id().x
+   localDim_reduce = KI.get_local_size().x * grain
+   groupIdx_reduce, groupIdx_other = fldmod1(KI.get_group_id().x, Rlen)

    # group-based indexing into the values outside of the reduction dimension
    # (that means we can safely synchronize items within this group)
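`fldmod1` turns the flat group index into a pair: which reduction group this is, and which slice outside the reduction dimension it works on. A quick host-side check with an illustrative `Rlen` (the number of "other" slices):

# Mirrors `fldmod1(KI.get_group_id().x, Rlen)` above, with made-up numbers.
Rlen = 4
for group in 1:8
    groupIdx_reduce, groupIdx_other = fldmod1(group, Rlen)
    println("group ", group, " -> reduce group ", groupIdx_reduce, ", other slice ", groupIdx_other)
end
# groups 1:4 all belong to reduce group 1 and sweep slices 1:4; groups 5:8 repeat that for reduce group 2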
@@ -141,7 +141,7 @@ function partial_mapreduce_device(f, op, neutral, maxthreads, ::Val{Rreduce},
end

function serial_mapreduce_kernel(f, op, neutral, ::Val{Rreduce}, ::Val{Rother}, R, As) where {Rreduce, Rother}
-   grid_idx = thread_position_in_grid().x
+   grid_idx = KI.get_global_id().x

    @inbounds if grid_idx <= length(Rother)
        Iother = Rother[grid_idx]
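In this serial strategy each global work item owns one slice outside the reduction dimensions and loops over the whole reduction range itself, with no barriers or shared memory. A CPU model of that mapping; the function name, `neutral` handling, and index arithmetic are illustrative rather than the exact kernel body:

# CPU model of the serial strategy: one "work item" per output slice,
# each reducing its slice sequentially and independently.
function serial_mapreduce_model(f, op, neutral, A; dims)
    R = fill(neutral, ntuple(d -> d in dims ? 1 : size(A, d), ndims(A)))
    Rother  = CartesianIndices(axes(R))                                            # one entry per work item
    Rreduce = CartesianIndices(ntuple(d -> d in dims ? axes(A, d) : Base.OneTo(1), ndims(A)))
    for Iother in Rother                              # plays the role of grid_idx
        val = neutral
        for Ireduce in Rreduce
            val = op(val, f(A[max(Iother, Ireduce)])) # combine the two index parts elementwise
        end
        R[Iother] = val
    end
    return R
end

A = reshape(1:12, 3, 4)
@assert serial_mapreduce_model(identity, +, 0, A; dims=(2,)) == sum(A; dims=2)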
@@ -166,11 +166,12 @@ end

## COV_EXCL_STOP

-serial_mapreduce_threshold(dev) = dev.maxThreadsPerThreadgroup.width * num_gpu_cores()
+serial_mapreduce_threshold(dev) = KI.max_work_group_size(MetalBackend()) * KI.multiprocessor_count(MetalBackend())

function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
                                 A::Union{AbstractArray,Broadcast.Broadcasted};
                                 init=nothing) where {F, OP, T}
+   backend = MetalBackend()
    Base.check_reducedims(R, A)
    length(A) == 0 && return R # isempty(::Broadcasted) iterates

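The threshold multiplies the backend's maximum work-group size by its multiprocessor count, so the serial kernel is preferred once there are enough independent output slices to fill every core with a full group. With made-up device numbers:

# Illustrative values only; the real ones come from KI.max_work_group_size(MetalBackend())
# and KI.multiprocessor_count(MetalBackend()).
max_work_group_size  = 1024
multiprocessor_count = 8
threshold = max_work_group_size * multiprocessor_count
@assert threshold == 8192   # take the serial path when length(Rother) >= this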
@@ -195,10 +196,10 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},

    # If `Rother` is large enough, then a naive loop is more efficient than partial reductions.
    if length(Rother) >= serial_mapreduce_threshold(device(R))
-       kernel = @metal launch=false serial_mapreduce_kernel(f, op, init, Val(Rreduce), Val(Rother), R, A)
-       threads = min(length(Rother), kernel.pipeline.maxTotalThreadsPerThreadgroup)
+       kernel = KI.KIKernel(backend, serial_mapreduce_kernel, f, op, init, Val(Rreduce), Val(Rother), R, A)
+       threads = KI.kernel_max_work_group_size(backend, kernel; max_work_items=length(Rother))
        groups = cld(length(Rother), threads)
-       kernel(f, op, init, Val(Rreduce), Val(Rother), R, A; threads, groups)
+       kernel(f, op, init, Val(Rreduce), Val(Rother), R, A; numworkgroups=groups, workgroupsize=threads)
        return R
    end

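For reference, the launch pattern the serial branch now uses is: build a `KI.KIKernel` from a kernel function, ask `KI.kernel_max_work_group_size` how many items a group may hold, then call the kernel object with `workgroupsize`/`numworkgroups` keywords. A minimal sketch with a made-up `fill_kernel`, assuming `KI` and `MetalBackend` resolve exactly as they do in this file; only KI calls already visible in this diff are used:

using Metal

function fill_kernel(out, value)
    i = KI.get_global_id().x
    if i <= length(out)
        @inbounds out[i] = value
    end
    return
end

backend = MetalBackend()
out     = MtlArray{Float32}(undef, 4096)
kernel  = KI.KIKernel(backend, fill_kernel, out, 1f0)
threads = KI.kernel_max_work_group_size(backend, kernel; max_work_items=length(out))
groups  = cld(length(out), threads)
kernel(out, 1f0; numworkgroups=groups, workgroupsize=threads)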
@@ -223,17 +224,17 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
    # we might not be able to launch all those threads to reduce each slice in one go.
    # that's why each thread also loops across their inputs, processing multiple values
    # so that we can span the entire reduction dimension using a single item group.
-   kernel = @metal launch=false partial_mapreduce_device(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
+   kernel = KI.KIKernel(backend, partial_mapreduce_device, f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
                                 Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A)

    # how many threads do we want?
    #
    # threads in a group work together to reduce values across the reduction dimensions;
    # we want as many as possible to improve algorithm efficiency and execution occupancy.
-   wanted_threads = shuffle ? nextwarp(kernel.pipeline, length(Rreduce)) : length(Rreduce)
+   wanted_threads = shuffle ? nextwarp(kernel.kern.pipeline, length(Rreduce)) : length(Rreduce)
    function compute_threads(max_threads)
        if wanted_threads > max_threads
-           shuffle ? prevwarp(kernel.pipeline, max_threads) : max_threads
+           shuffle ? prevwarp(kernel.kern.pipeline, max_threads) : max_threads
        else
            wanted_threads
        end
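`nextwarp`/`prevwarp` round a thread count up or down to a multiple of the pipeline's SIMD width so the shuffle path always sees full simdgroups. A plain-integer model of the clamping `compute_threads` performs, with the warp width fixed at 32 and the helper names invented for illustration:

nextwarp_32(n) = 32 * cld(n, 32)   # round up to a full simdgroup
prevwarp_32(n) = 32 * fld(n, 32)   # round down to a full simdgroup

function compute_threads_model(wanted_threads, max_threads; shuffle::Bool)
    if wanted_threads > max_threads
        shuffle ? prevwarp_32(max_threads) : max_threads
    else
        wanted_threads
    end
end

@assert compute_threads_model(2000, 1024; shuffle=true) == 1024   # already a multiple of 32
@assert compute_threads_model(2000, 1000; shuffle=true) == 992    # clamped down to 31 * 32
@assert compute_threads_model(nextwarp_32(100), 1024; shuffle=true) == 128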
@@ -243,7 +244,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
    # kernel above may be greater than the maxTotalThreadsPerThreadgroup of the eventually launched
    # kernel below, causing errors
    # reduce_threads = compute_threads(kernel.pipeline.maxTotalThreadsPerThreadgroup)
-   reduce_threads = compute_threads(512)
+   reduce_threads = compute_threads(KI.kernel_max_work_group_size(backend, kernel))

    # how many groups should we launch?
    #
@@ -262,7 +263,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
        # we can cover the dimensions to reduce using a single group
        kernel(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
               Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A;
-              threads, groups)
+              numworkgroups=groups, workgroupsize=threads)
    else
        # we need multiple steps to cover all values to reduce
        partial = similar(R, (size(R)..., reduce_groups))
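In the multi-step branch each group writes its result into an extra trailing dimension of size `reduce_groups`, and the follow-up `mapreducedim!` at the end of the function folds that dimension back into `R`. Host-side stand-ins for the shapes involved (all sizes made up):

R             = zeros(Float32, 3, 1)                       # final output, e.g. a row reduction of a 3x4 input
reduce_groups = 5
partial       = rand(Float32, size(R)..., reduce_groups)   # stand-in for similar(R, (size(R)..., reduce_groups))
R .= dropdims(sum(partial; dims=3); dims=3)                # role played by the final reduction over `partial`
@assert size(R) == (3, 1)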
@@ -273,9 +274,12 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
        end
        # NOTE: we can't use the previously-compiled kernel, since the type of `partial`
        # might not match the original output container (e.g. if that was a view).
-       @metal threads groups partial_mapreduce_device(
+       KI.KIKernel(backend, partial_mapreduce_device,
            f, op, init, Val(threads), Val(Rreduce), Val(Rother),
-           Val(UInt64(length(Rother))), Val(grain), Val(shuffle), partial, A)
+           Val(UInt64(length(Rother))), Val(grain), Val(shuffle), partial, A)(
+           f, op, init, Val(threads), Val(Rreduce), Val(Rother),
+           Val(UInt64(length(Rother))), Val(grain), Val(shuffle), partial, A;
+           numworkgroups=groups, workgroupsize=threads)

        GPUArrays.mapreducedim!(identity, op, R, partial; init=init)
    end
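None of this changes the user-facing entry points: reductions on `MtlArray`s still reach `GPUArrays.mapreducedim!` through the usual Base API. A quick smoke test; these sizes exercise the partial-reduction path (a much larger number of output slices would take the serial path), and `isapprox` is used because the reduction order can differ from the CPU:

using Metal

A = MtlArray(rand(Float32, 1024, 16))
@assert Array(sum(A; dims=1)) ≈ sum(Array(A); dims=1)   # reduce along the first dimension
@assert Array(sum(A; dims=2)) ≈ sum(Array(A); dims=2)   # reduce along the second dimension
@assert sum(A) ≈ sum(Array(A))                          # full reduction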