-
Notifications
You must be signed in to change notification settings - Fork 47
[Do not merge] Test KernelIntrinsics
#688
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
christiangnrd
wants to merge
4
commits into
main
Choose a base branch
from
kaintr
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Contributor
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.

diff --git a/src/MetalKernels.jl b/src/MetalKernels.jl
index 4e856194..7573c5e1 100644
--- a/src/MetalKernels.jl
+++ b/src/MetalKernels.jl
@@ -136,26 +136,26 @@ end
KI.argconvert(::MetalBackend, arg) = mtlconvert(arg)
-function KI.kernel_function(::MetalBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::MetalBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
kern = mtlfunction(f, tt; name, kwargs...)
- KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
+ return KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
end
-function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups=1, workgroupsize=1)
+function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups = 1, workgroupsize = 1)
KI.check_launch_args(numworkgroups, workgroupsize)
- obj.kern(args...; threads=workgroupsize, groups=numworkgroups)
+ return obj.kern(args...; threads = workgroupsize, groups = numworkgroups)
end
-function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int=typemax(Int))::Int
- Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
+function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int = typemax(Int))::Int
+ return Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
end
function KI.max_work_group_size(::MetalBackend)::Int
- Int(device().maxThreadsPerThreadgroup.width)
+ return Int(device().maxThreadsPerThreadgroup.width)
end
function KI.multiprocessor_count(::MetalBackend)::Int
- Metal.num_gpu_cores()
+ return Metal.num_gpu_cores()
end
diff --git a/src/broadcast.jl b/src/broadcast.jl
index 72ced3ed..e90f5826 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -66,8 +66,8 @@ end
if _broadcast_shapes[Is] > BROADCAST_SPECIALIZATION_THRESHOLD
## COV_EXCL_START
function broadcast_cartesian_static(dest, bc, Is)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
while 1 <= i <= length(dest)
I = @inbounds Is[i]
@inbounds dest[I] = bc[I]
@@ -91,8 +91,8 @@ end
(isa(IndexStyle(dest), IndexLinear) && isa(IndexStyle(bc), IndexLinear))
## COV_EXCL_START
function broadcast_linear(dest, bc)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
while 1 <= i <= length(dest)
@inbounds dest[i] = bc[i]
i += stride
@@ -150,8 +150,8 @@ end
else
## COV_EXCL_START
function broadcast_cartesian(dest, bc)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
while 1 <= i <= length(dest)
I = @inbounds CartesianIndices(dest)[i]
@inbounds dest[I] = bc[I]
diff --git a/src/device/random.jl b/src/device/random.jl
index 12b053a2..edc999cd 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -89,8 +89,8 @@ end
@inbounds global_random_counters()[simdgroupId]
elseif field === :ctr2
globalId = KI.get_global_id().x +
- (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
- (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+ (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+ (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
globalId % UInt32
end::UInt32
end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 7be5ef43..a737e8d0 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -224,7 +224,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
# we might not be able to launch all those threads to reduce each slice in one go.
# that's why each threads also loops across their inputs, processing multiple values
# so that we can span the entire reduction dimension using a single item group.
- kernel = KI.@kernel backend launch = false partial_mapreduce_device(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
+ kernel = KI.@kernel backend launch = false partial_mapreduce_device(
+ f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A)
# how many threads do we want?
@@ -260,7 +261,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
# we can cover the dimensions to reduce using a single group
kernel(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A;
- workgroupsize = threads, numworkgroups = groups)
+ workgroupsize = threads, numworkgroups = groups
+ )
else
# temporary empty array whose type will match the final partial array
partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1)))
@@ -287,7 +289,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
partial_kernel(f, op, init, Val(threads), Val(Rreduce),
Val(Rother), Val(UInt64(length(Rother))),
Val(grain), Val(shuffle), partial, A;
- numworkgroups = partial_groups, workgroupsize = partial_threads)
+ numworkgroups = partial_groups, workgroupsize = partial_threads
+ )
GPUArrays.mapreducedim!(identity, op, R, partial; init=init)
end
diff --git a/test/kernelabstractions.jl b/test/kernelabstractions.jl
index 221ee680..6f9d5a2c 100644
--- a/test/kernelabstractions.jl
+++ b/test/kernelabstractions.jl
@@ -7,6 +7,6 @@ Testsuite.testsuite(()->MetalBackend(), "Metal", Metal, MtlArray, Metal.MtlDevic
"Convert", # depends on https://github.com/JuliaGPU/Metal.jl/issues/69
"SpecialFunctions", # no equivalent Metal intrinsics for gamma, erf, etc
"sparse", # not supported yet
- "CPU synchronization",
- "fallback test: callable types",
+ "CPU synchronization",
+ "fallback test: callable types",
]))
diff --git a/test/runtests.jl b/test/runtests.jl
index 6805205e..99ccadac 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
@static if VERSION < v"1.11"
using Pkg
- Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+ Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
end
using Metal
9ac3d49 to
6314372
Compare
Contributor
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Metal Benchmarks
| Benchmark suite | Current: ce67b4c | Previous: 1232d45 | Ratio |
|---|---|---|---|
latency/precompile |
29018226583 ns |
24992031042 ns |
1.16 |
latency/ttfp |
2316989604.5 ns |
2131427167 ns |
1.09 |
latency/import |
1389818917 ns |
1225716666 ns |
1.13 |
integration/metaldevrt |
848875 ns |
834458 ns |
1.02 |
integration/byval/slices=1 |
1598042 ns |
1547145.5 ns |
1.03 |
integration/byval/slices=3 |
19072604 ns |
8632292 ns |
2.21 |
integration/byval/reference |
1575125 ns |
1525292 ns |
1.03 |
integration/byval/slices=2 |
2686750 ns |
2592229 ns |
1.04 |
kernel/indexing |
454500.5 ns |
624250 ns |
0.73 |
kernel/indexing_checked |
507750 ns |
613041 ns |
0.83 |
kernel/launch |
12666 ns |
12000 ns |
1.06 |
kernel/rand |
532292 ns |
560375 ns |
0.95 |
array/construct |
6375 ns |
6375 ns |
1 |
array/broadcast |
529604 ns |
596750 ns |
0.89 |
array/random/randn/Float32 |
895500 ns |
814458.5 ns |
1.10 |
array/random/randn!/Float32 |
578416 ns |
620208 ns |
0.93 |
array/random/rand!/Int64 |
538333 ns |
552375 ns |
0.97 |
array/random/rand!/Float32 |
532812.5 ns |
583750 ns |
0.91 |
array/random/rand/Int64 |
911604.5 ns |
756208.5 ns |
1.21 |
array/random/rand/Float32 |
850145.5 ns |
583604 ns |
1.46 |
array/accumulate/Int64/1d |
1374375 ns |
1252709 ns |
1.10 |
array/accumulate/Int64/dims=1 |
1963500 ns |
1815125 ns |
1.08 |
array/accumulate/Int64/dims=2 |
2340750 ns |
2160708 ns |
1.08 |
array/accumulate/Int64/dims=1L |
12117709 ns |
11656187 ns |
1.04 |
array/accumulate/Int64/dims=2L |
10014854 ns |
9645833 ns |
1.04 |
array/accumulate/Float32/1d |
1114125 ns |
1067958 ns |
1.04 |
array/accumulate/Float32/dims=1 |
1718459 ns |
1541250 ns |
1.11 |
array/accumulate/Float32/dims=2 |
2159917 ns |
1840146 ns |
1.17 |
array/accumulate/Float32/dims=1L |
10564334 ns |
9832792 ns |
1.07 |
array/accumulate/Float32/dims=2L |
7796354 ns |
7218458.5 ns |
1.08 |
array/reductions/reduce/Int64/1d |
1251833 ns |
1536834 ns |
0.81 |
array/reductions/reduce/Int64/dims=1 |
1114500 ns |
1068916 ns |
1.04 |
array/reductions/reduce/Int64/dims=2 |
1346250 ns |
1114812.5 ns |
1.21 |
array/reductions/reduce/Int64/dims=1L |
2059959 ns |
2015792 ns |
1.02 |
array/reductions/reduce/Int64/dims=2L |
4082500 ns |
4220062.5 ns |
0.97 |
array/reductions/reduce/Float32/1d |
757125 ns |
1050021 ns |
0.72 |
array/reductions/reduce/Float32/dims=1 |
832104 ns |
814312.5 ns |
1.02 |
array/reductions/reduce/Float32/dims=2 |
861792 ns |
840000 ns |
1.03 |
array/reductions/reduce/Float32/dims=1L |
1371458 ns |
1300250 ns |
1.05 |
array/reductions/reduce/Float32/dims=2L |
1883979.5 ns |
1785812.5 ns |
1.05 |
array/reductions/mapreduce/Int64/1d |
1241917 ns |
1544645.5 ns |
0.80 |
array/reductions/mapreduce/Int64/dims=1 |
1102729 ns |
1075416 ns |
1.03 |
array/reductions/mapreduce/Int64/dims=2 |
1347375 ns |
1124750 ns |
1.20 |
array/reductions/mapreduce/Int64/dims=1L |
1949020.5 ns |
2014958 ns |
0.97 |
array/reductions/mapreduce/Int64/dims=2L |
3882666 ns |
3606874.5 ns |
1.08 |
array/reductions/mapreduce/Float32/1d |
795187 ns |
1030000 ns |
0.77 |
array/reductions/mapreduce/Float32/dims=1 |
813042 ns |
804459 ns |
1.01 |
array/reductions/mapreduce/Float32/dims=2 |
875063 ns |
816145.5 ns |
1.07 |
array/reductions/mapreduce/Float32/dims=1L |
1374708 ns |
1307375 ns |
1.05 |
array/reductions/mapreduce/Float32/dims=2L |
1882291 ns |
1894042 ns |
0.99 |
array/private/copyto!/gpu_to_gpu |
535604.5 ns |
641459 ns |
0.83 |
array/private/copyto!/cpu_to_gpu |
742062 ns |
784042 ns |
0.95 |
array/private/copyto!/gpu_to_cpu |
728375 ns |
798458 ns |
0.91 |
array/private/iteration/findall/int |
1649042 ns |
1556333 ns |
1.06 |
array/private/iteration/findall/bool |
1563770.5 ns |
1422437.5 ns |
1.10 |
array/private/iteration/findfirst/int |
2163500 ns |
2047375 ns |
1.06 |
array/private/iteration/findfirst/bool |
2084958 ns |
2034375 ns |
1.02 |
array/private/iteration/scalar |
2989917 ns |
3970666 ns |
0.75 |
array/private/iteration/logical |
2755750 ns |
2580125 ns |
1.07 |
array/private/iteration/findmin/1d |
2333333 ns |
2224833 ns |
1.05 |
array/private/iteration/findmin/2d |
1591875 ns |
1504083 ns |
1.06 |
array/private/copy |
806438 ns |
579459 ns |
1.39 |
array/shared/copyto!/gpu_to_gpu |
85209 ns |
85625 ns |
1.00 |
array/shared/copyto!/cpu_to_gpu |
82291.5 ns |
82000 ns |
1.00 |
array/shared/copyto!/gpu_to_cpu |
83167 ns |
84417 ns |
0.99 |
array/shared/iteration/findall/int |
1666417 ns |
1572333 ns |
1.06 |
array/shared/iteration/findall/bool |
1568541.5 ns |
1439166.5 ns |
1.09 |
array/shared/iteration/findfirst/int |
1765645.5 ns |
1659750 ns |
1.06 |
array/shared/iteration/findfirst/bool |
1710375 ns |
1615833 ns |
1.06 |
array/shared/iteration/scalar |
204625 ns |
206709 ns |
0.99 |
array/shared/iteration/logical |
2561208 ns |
2254958 ns |
1.14 |
array/shared/iteration/findmin/1d |
1959417 ns |
1802791 ns |
1.09 |
array/shared/iteration/findmin/2d |
1586271 ns |
1506104.5 ns |
1.05 |
array/shared/copy |
210042 ns |
251209 ns |
0.84 |
array/permutedims/4d |
3132333 ns |
2351833 ns |
1.33 |
array/permutedims/2d |
1176417 ns |
1144750.5 ns |
1.03 |
array/permutedims/3d |
1820208 ns |
1650917 ns |
1.10 |
metal/synchronization/stream |
19334 ns |
19209 ns |
1.01 |
metal/synchronization/context |
19750 ns |
20125 ns |
0.98 |
This comment was automatically generated by workflow using github-action-benchmark.
22e754e to
68db9c2
Compare
2b8dce1 to
0e76668
Compare
db9a7dc to
c802ccc
Compare
6405cd5 to
5fd2378
Compare
5fd2378 to
9d5a475
Compare
9d5a475 to
4b8f026
Compare
4b8f026 to
ce67b4c
Compare
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Not a draft to also run benchmarks