
Commit cf7c66f

KernelIntrinsics

1 parent dc13bbc

File tree

Project.toml
src/CUDAKernels.jl
test/base/kernelabstractions.jl

3 files changed: +51 -22 lines


Project.toml

Lines changed: 1 addition & 1 deletion

@@ -67,7 +67,7 @@ ExprTools = "0.1"
 GPUArrays = "11.2.4"
 GPUCompiler = "1.4"
 GPUToolbox = "0.3, 1"
-KernelAbstractions = "0.9.38"
+KernelAbstractions = "0.10"
 LLVM = "9.3.1"
 LLVMLoopInfo = "1"
 LazyArtifacts = "1"

src/CUDAKernels.jl

Lines changed: 45 additions & 18 deletions

@@ -1,9 +1,10 @@
 module CUDAKernels
 
 using ..CUDA
-using ..CUDA: @device_override, CUSPARSE, default_memory, UnifiedMemory
+using ..CUDA: @device_override, CUSPARSE, default_memory, UnifiedMemory, cufunction, cudaconvert
 
 import KernelAbstractions as KA
+import KernelAbstractions: KI
 
 import StaticArrays
 import SparseArrays: AbstractSparseArray
@@ -157,34 +158,59 @@ function (obj::KA.Kernel{CUDABackend})(args...; ndrange=nothing, workgroupsize=n
     return nothing
 end
 
+KI.argconvert(::CUDABackend, arg) = cudaconvert(arg)
+
+function KI.kernel_function(::CUDABackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+    kern = cufunction(f, tt; name, kwargs...)
+    KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
+end
+
+function (obj::KI.Kernel{CUDABackend})(args...; numworkgroups = 1, workgroupsize = 1)
+    KI.check_launch_args(numworkgroups, workgroupsize)
+
+    obj.kern(args...; threads=workgroupsize, blocks=numworkgroups)
+    return nothing
+end
+
+
+function KI.kernel_max_work_group_size(::CUDABackend, kikern::KI.Kernel{<:CUDABackend}; max_work_items::Int=typemax(Int))::Int
+    kernel_config = launch_configuration(kikern.kern.fun)
+
+    Int(min(kernel_config.threads, max_work_items))
+end
+function KI.max_work_group_size(::CUDABackend)::Int
+    Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
+end
+function KI.multiprocessor_count(::CUDABackend)::Int
+    Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
+end
+
 ## indexing
 
 ## COV_EXCL_START
-@device_override @inline function KA.__index_Local_Linear(ctx)
-    return threadIdx().x
+@device_override @inline function KI.get_local_id()
+    return (; x = Int(threadIdx().x), y = Int(threadIdx().y), z = Int(threadIdx().z))
 end
 
 
-@device_override @inline function KA.__index_Group_Linear(ctx)
-    return blockIdx().x
+@device_override @inline function KI.get_group_id()
+    return (; x = Int(blockIdx().x), y = Int(blockIdx().y), z = Int(blockIdx().z))
 end
 
-@device_override @inline function KA.__index_Global_Linear(ctx)
-    I = @inbounds KA.expand(KA.__iterspace(ctx), blockIdx().x, threadIdx().x)
-    # TODO: This is unfortunate, can we get the linear index cheaper
-    @inbounds LinearIndices(KA.__ndrange(ctx))[I]
+@device_override @inline function KI.get_global_id()
+    return (; x = Int((blockIdx().x-1)*blockDim().x + threadIdx().x), y = Int((blockIdx().y-1)*blockDim().y + threadIdx().y), z = Int((blockIdx().z-1)*blockDim().z + threadIdx().z))
 end
 
-@device_override @inline function KA.__index_Local_Cartesian(ctx)
-    @inbounds KA.workitems(KA.__iterspace(ctx))[threadIdx().x]
+@device_override @inline function KI.get_local_size()
    return (; x = Int(blockDim().x), y = Int(blockDim().y), z = Int(blockDim().z))
 end
 
-@device_override @inline function KA.__index_Group_Cartesian(ctx)
-    @inbounds KA.blocks(KA.__iterspace(ctx))[blockIdx().x]
+@device_override @inline function KI.get_num_groups()
+    return (; x = Int(gridDim().x), y = Int(gridDim().y), z = Int(gridDim().z))
 end
 
-@device_override @inline function KA.__index_Global_Cartesian(ctx)
-    return @inbounds KA.expand(KA.__iterspace(ctx), blockIdx().x, threadIdx().x)
+@device_override @inline function KI.get_global_size()
+    return (; x = Int(blockDim().x * gridDim().x), y = Int(blockDim().y * gridDim().y), z = Int(blockDim().z * gridDim().z))
 end
 
 @device_override @inline function KA.__validindex(ctx)
@@ -198,7 +224,8 @@ end
 
 ## shared and scratch memory
 
-@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
+# @device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
+@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
     CuStaticSharedArray(T, Dims)
 end
 
@@ -208,11 +235,11 @@ end
 
 ## synchronization and printing
 
-@device_override @inline function KA.__synchronize()
+@device_override @inline function KI.barrier()
     sync_threads()
 end
 
-@device_override @inline function KA.__print(args...)
+@device_override @inline function KI._print(args...)
     CUDA._cuprint(args...)
 end

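For context, a minimal usage sketch of the KernelIntrinsics (KI) entry points wired up above. The kernels vadd! and reverse_block! and the launch arithmetic are illustrative assumptions, not part of this commit, and the KI surface may still shift with KernelAbstractions 0.10:

    using CUDA
    import KernelAbstractions: KI

    # Hypothetical kernel written against the KI indexing intrinsics.
    function vadd!(c, a, b)
        i = KI.get_global_id().x    # 1-based global thread index
        i <= length(c) && (@inbounds c[i] = a[i] + b[i])
        return nothing
    end

    a = CUDA.rand(Float32, 1024)
    b = CUDA.rand(Float32, 1024)
    c = similar(a)

    # Convert host arguments to their device-side representation, then
    # compile the kernel for exactly those argument types.
    args = map(x -> KI.argconvert(CUDABackend(), x), (c, a, b))
    k = KI.kernel_function(CUDABackend(), vadd!, Tuple{map(typeof, args)...})

    # Size the launch from the kernel's occupancy limit and run it.
    wg = KI.kernel_max_work_group_size(CUDABackend(), k; max_work_items=length(c))
    k(args...; numworkgroups=cld(length(c), wg), workgroupsize=wg)

A kernel exercising the device-side memory and synchronization overrides launches the same way, with workgroupsize=256 to match the static allocation (and assuming length(inp) is a multiple of 256):

    # Hypothetical kernel: reverse each 256-element block in shared memory.
    function reverse_block!(out, inp)
        tmp = KI.localmemory(Float32, Val(256))
        li  = KI.get_local_id().x
        gi  = KI.get_global_id().x
        @inbounds tmp[li] = inp[gi]
        KI.barrier()    # all writes to tmp must land before the reversed reads
        @inbounds out[gi] = tmp[KI.get_local_size().x - li + 1]
        return nothing
    end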
test/base/kernelabstractions.jl

Lines changed: 5 additions & 3 deletions

@@ -4,7 +4,9 @@ using SparseArrays
 
 include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl"))
 
-Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray)
+Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests=Set([
+    "CPU synchronization",
+    "fallback test: callable types",]))
 for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false))
     Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CuDeviceArray)
 end
@@ -16,13 +18,13 @@ end
 @testset "CUDA Backend Adapt Tests" begin
     # CPU → GPU
     A = sprand(Float32, 10, 10, 0.5) #CSC
-    A_d = adapt(CUDABackend(), A)
+    A_d = adapt(CUDABackend(), A)
     @test A_d isa CUSPARSE.CuSparseMatrixCSC
     @test adapt(CUDABackend(), A_d) |> typeof == typeof(A_d)
 
     # GPU → CPU
     B_d = A |> cu # CuCSC
     B = adapt(KA.CPU(), A_d)
     @test B isa SparseMatrixCSC
-    @test adapt(KA.CPU(), B) |> typeof == typeof(B)
+    @test adapt(KA.CPU(), B) |> typeof == typeof(B)
 end
