diff --git a/src/sorting.jl b/src/sorting.jl
index a3883a6ee1..527c978604 100644
--- a/src/sorting.jl
+++ b/src/sorting.jl
@@ -21,7 +21,7 @@ depending on the relevant `parity`
 Originally developed by @xaellison (Alex Ellison).
 """
-module Quicksort
+module QuickSortImpl
 
 export quicksort!
 
@@ -485,28 +485,448 @@ end
 end
 
+"""
+This is an iterative bitonic sort that mimics a recursive version to support
+non-power-of-2 lengths.
+
+Credit for the recursive form of this algorithm goes to:
+https://www.inf.hs-flensburg.de/lang/algorithmen/sortieren/bitonic/oddn.htm
+
+CUDA.jl implementation originally by @xaellison
+
+Overview: `comparator_kernel` implements a general layer of sorting-network
+comparators. The sort could run by looping over `comparator_kernel` alone, but
+`comparator_small_kernel` copies values into shmem and loops over several
+comparators that don't need to access any values outside the range held in
+shared memory. This provides a moderate speedup.
+
+Notation:
+`k`, `j` denote the level of the sorting network (equivalently, recursion depth).
+`vals` is the array of values of type `T` that is either being `sort`-ed or `sortperm`-ed.
+`inds` is an array of indices of type `J` that gets permuted in `sortperm!` (standard 1-indexed).
+`i1`, `i2` index either `vals` or `inds` depending on the operation.
+`lo`, `n`, and `m` are integers of type `I` used to denote/calculate ranges as
+    described in the recursive algorithm link above. Note these follow the 0-indexing
+    convention from the above source.
+"""
+module BitonicSortImpl
+
+export bitonic_sort!
+
+using ..CUDA
+using ..CUDA: i32
+
+
+# General functions
+
+@inline two(::Type{Int}) = 2
+@inline two(::Type{Int32}) = 2i32
+
+# greatest power of 2 strictly less than x
+@inline function gp2lt(x::Int)::Int
+    x -= 1
+    x |= x >> 1
+    x |= x >> 2
+    x |= x >> 4
+    x |= x >> 8
+    x |= x >> 16
+    x |= x >> 32
+    xor(x, x >> 1)
+end
+
+@inline function gp2lt(x::Int32)::Int32
+    x -= 1i32
+    x |= x >> 1i32
+    x |= x >> 2i32
+    x |= x >> 4i32
+    x |= x >> 8i32
+    x |= x >> 16i32
+    xor(x, x >> 1i32)
+end
+
+@inline function bisect_range(index::I, lo::I, n::I) where {I}
+    if n <= one(I)
+        return -one(I), -one(I)
+    end
+    m = gp2lt(n)
+    if index < lo + m
+        n = m
+    else
+        lo = lo + m
+        n = n - m
+    end
+    return lo, n
+end
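+# Illustration (host-side, not part of the kernels): `gp2lt` returns the
+# greatest power of 2 strictly less than its argument, e.g. gp2lt(6) == 4 and
+# gp2lt(8) == 4. `bisect_range` uses it to pick the half of the range
+# (lo, lo + n) that `index` falls in, mirroring the recursion linked above:
+#     bisect_range(0, 0, 6) == (0, 4)    # index in left part:  (lo, m)
+#     bisect_range(4, 0, 6) == (4, 2)    # index in right part: (lo + m, n - m)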
+@inline function rev_lt(a::T, b::T, lt, rev::Val{R}) where {T,R}
+    if R
+        return lt(b, a)
+    else
+        return lt(a, b)
+    end
+end
+
+@inline function rev_lt(a::Tuple{T,J}, b::Tuple{T,J}, lt, rev::Val{R}) where {T,J,R}
+    if R
+        # even in reverse sorting, equal values must be tie-broken by
+        # ascending index to keep `sortperm` stable
+        if a[1] == b[1]
+            return a[2] < b[2]
+        else
+            return lt(b[1], a[1])
+        end
+    else
+        return lt(a, b)
+    end
+end
+
+# Functions specifically for "large" bitonic steps (those that cannot use shmem)
+
+@inline function compare!(vals::AbstractArray{T}, i1::I, i2::I, dir::Bool, by, lt, rev) where {T,I}
+    i1′, i2′ = i1 + one(I), i2 + one(I)
+    @inbounds if dir != rev_lt(by(vals[i1′]), by(vals[i2′]), lt, rev)
+        vals[i1′], vals[i2′] = vals[i2′], vals[i1′]
+    end
+end
+
+@inline function compare!(vals_inds::Tuple, i1::I, i2::I, dir::Bool, by, lt, rev) where {I}
+    i1′, i2′ = i1 + one(I), i2 + one(I)
+    vals, inds = vals_inds
+    # comparing tuples of (value, index) guarantees stability of sort
+    @inbounds if dir != rev_lt((by(vals[inds[i1′]]), inds[i1′]), (by(vals[inds[i2′]]), inds[i2′]), lt, rev)
+        inds[i1′], inds[i2′] = inds[i2′], inds[i1′]
+    end
+end
+
+
+@inline function get_range_part1(n::I, index::I, k::I)::Tuple{I,I,Bool} where {I}
+    lo = zero(I)
+    dir = true
+    for iter = one(I):k-one(I)
+        if n <= one(I)
+            return -one(I), -one(I), false
+        end
+
+        if index < lo + n ÷ two(I)
+            n = n ÷ two(I)
+            dir = !dir
+        else
+            lo = lo + n ÷ two(I)
+            n = n - n ÷ two(I)
+        end
+    end
+    return lo, n, dir
+end
+
+@inline function get_range_part2(lo::I, n::I, index::I, j::I)::Tuple{I,I} where {I}
+    for iter = one(I):j-one(I)
+        lo, n = bisect_range(index, lo, n)
+    end
+    return lo, n
+end
+
+"""
+Determines the parameters of a swap for a thread whose grid index directly maps
+to the array index used in the swap.
+"""
+@inline function get_range(n, index, k, j)
+    lo, n, dir = get_range_part1(n, index, k)
+    lo, n = get_range_part2(lo, n, index, j)
+    return lo, n, dir
+end
+
+"""
+Performs a step of bitonic sort requiring swaps between indices further apart
+than the block size allows (e.g. 1 <--> 10000).
+
+The grid index directly maps to the index of `c` that will be used in the swap.
+
+Note that to avoid synchronization issues, only one thread from each pair of
+indices being swapped will actually move data. This does mean half of the threads
+do nothing, but it works for non-power-of-2 arrays while allowing direct indexing.
+"""
+function comparator_kernel(vals, length_vals::I, k::I, j::I, by::F1, lt::F2, rev) where {I,F1,F2}
+    index = (blockDim().x * (blockIdx().x - one(I))) + threadIdx().x - one(I)
+
+    lo, n, dir = get_range(length_vals, index, k, j)
+
+    if !(lo < zero(I) || n < zero(I)) && !(index >= length_vals)
+        m = gp2lt(n)
+        if lo <= index < lo + n - m
+            i1, i2 = index, index + m
+            @inbounds compare!(vals, i1, i2, dir, by, lt, rev)
+        end
+    end
+    return
+end
+
+
+# Functions for "small" bitonic steps (those that can use shmem)
+
+@inline function compare_small!(vals::AbstractArray{T}, i1::I, i2::I, dir::Bool, by, lt, rev) where {T,I}
+    i1′, i2′ = i1 + one(I), i2 + one(I)
+    @inbounds if dir != rev_lt(by(vals[i1′]), by(vals[i2′]), lt, rev)
+        vals[i1′], vals[i2′] = vals[i2′], vals[i1′]
+    end
+end
+
+@inline function compare_small!(vals_inds::Tuple, i1::I, i2::I, dir::Bool, by, lt, rev) where {I}
+    i1′, i2′ = i1 + one(I), i2 + one(I)
+    vals, inds = vals_inds
+    # comparing tuples of (value, index) guarantees stability of sort
+    @inbounds if dir != rev_lt((by(vals[i1′]), inds[i1′]), (by(vals[i2′]), inds[i2′]), lt, rev)
+        vals[i1′], vals[i2′] = vals[i2′], vals[i1′]
+        inds[i1′], inds[i2′] = inds[i2′], inds[i1′]
+    end
+end
+
+"""
+For each thread in the block, "re-compute" the range which would have been
+passed in recursively. This range only depends on the block, and guarantees
+all threads perform swaps accessible using shmem.
+
+Negative return values signal that this pseudo-block has nothing to swap; the
+distinct negative exit points exist only to aid debugging.
+"""
+@inline function block_range(n::I, block_index::I, k::I, j::I)::Tuple{I,I,Bool} where {I}
+    lo = zero(I)
+    dir = true
+    tmp = block_index * two(I)
+
+    # analogous to `get_range_part1`
+    for iter = one(I):(k-one(I))
+        tmp ÷= two(I)
+        if n <= one(I)
+            return -one(I), -one(I), false
+        end
+
+        if tmp % two(I) == zero(I)
+            n = n ÷ two(I)
+            dir = !dir
+        else
+            lo = lo + n ÷ two(I)
+            n = n - n ÷ two(I)
+        end
+    end
+
+    # analogous to `get_range_part2`
+    for iter = one(I):(j-one(I))
+        tmp ÷= two(I)
+        if n <= one(I)
+            return -one(I), -one(I), false
+        end
+
+        m = gp2lt(n)
+        if tmp % two(I) == zero(I)
+            n = m
+        else
+            lo = lo + m
+            n = n - m
+        end
+    end
+
+    if zero(I) <= n <= one(I)
+        return -one(I), -one(I), false
+    end
+    return lo, n, dir
+end
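+# Note (illustrative): `block_range` replays the same left/right descent as
+# `get_range`, but driven by the bits of the pseudo-block index (`tmp`) rather
+# than an element index, so every thread in a pseudo-block agrees on a single
+# (lo, n, dir) range that fits in shared memory.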
+"""
+For sort/sort!, allocate and return a shared-memory view of `c`.
+Each view is indexed along the block x dimension: one view per pseudo-block.
+`index` is expected to be from a 0-indexing context.
+"""
+@inline function initialize_shmem!(vals::AbstractArray{T}, index::I, in_range, offset = zero(I)) where {T,I}
+    swap = CuDynamicSharedArray(T, (blockDim().x, blockDim().y), offset)
+    if in_range
+        @inbounds swap[threadIdx().x, threadIdx().y] = vals[index+one(I)]
+    end
+    sync_threads()
+    return @view swap[:, threadIdx().y]
+end
+
+"""
+For sortperm/sortperm!, allocate and return shared-memory views of `c` and the
+index array. Each view is indexed along the block x dimension: one view per
+pseudo-block. `index` is expected to be from a 0-indexing context, but the
+indices stored in `vals_inds` are expected to be 1-indexed.
+"""
+@inline function initialize_shmem!(vals_inds::Tuple{AbstractArray{T},AbstractArray{J}}, index, in_range) where {T,J}
+    offset = prod(blockDim()) * sizeof(J)
+    vals, inds = vals_inds
+    inds_view = initialize_shmem!(inds, index, in_range)
+    vals_view = initialize_shmem!(vals, inds_view[threadIdx().x] - one(J), in_range, offset)
+    return vals_view, inds_view
+end
+
+"""
+For sort/sort!, copy the shmem view `swap` back into the global array `c`.
+`index` is expected to be from a 0-indexing context.
+"""
+@inline function finalize_shmem!(vals::AbstractArray, swap::AbstractArray, index::I, in_range::Bool) where {I}
+    if in_range
+        @inbounds vals[index+one(I)] = swap[threadIdx().x]
+    end
+end
+
+"""
+For sortperm/sortperm!, copy the shmem view `swap` back to the global index
+array. `index` is expected to be from a 0-indexing context, but the indices
+stored in `vals_inds` are expected to be 1-indexed.
+"""
+@inline function finalize_shmem!(vals_inds::Tuple, swap::Tuple, index, in_range::Bool)
+    vals, inds = vals_inds
+    swap_vals, swap_inds = swap
+    # only the permuted indices are written back; see the note on
+    # `comparator_small_kernel` about values staying in shmem
+    finalize_shmem!(inds, swap_inds, index, in_range)
+end
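+# Shared-memory layout for sortperm (as set up above): a single dynamic shmem
+# allocation holds blockDim().x * blockDim().y indices of type `J`, followed by
+# the same number of values of type `T`; `offset = prod(blockDim()) * sizeof(J)`
+# places the value array just past the index array.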
+""" +function comparator_small_kernel(c, length_c::I, k::I, j_0::I, j_f::I, by::F1, lt::F2, rev) where {I,F1,F2} + pseudo_block_idx = (blockIdx().x - one(I)) * blockDim().y + threadIdx().y - one(I) + # immutable info about the range used by this kernel + _lo, _n, dir = block_range(length_c, pseudo_block_idx, k, j_0) + index = _lo + threadIdx().x - one(I) + in_range = (threadIdx().x <= _n && _lo >= zero(I)) + + swap = initialize_shmem!(c, index, in_range) + + # mutable copies for pseudo-recursion + lo, n = _lo, _n + + for j = j_0:j_f + if !(lo < zero(I) || n < zero(I)) && in_range + m = gp2lt(n) + if lo <= index < lo + n - m + i1, i2 = index - _lo, index - _lo + m + compare_small!(swap, i1, i2, dir, by, lt, rev) + end + end + lo, n = bisect_range(index, lo, n) + sync_threads() + end + + finalize_shmem!(c, swap, index, in_range) + return +end + + +# Host side code +function bitonic_shmem(c::AbstractArray{T}, threads) where {T} + return prod(threads) * sizeof(T) +end + +function bitonic_shmem(c, threads) + return prod(threads) * sum(map(a -> sizeof(eltype(a)), c)) +end + +""" +Call bitonic sort on `c` which can be a CuArray of values to `sort!` or a tuple +of values and an index array for doing `sortperm!`. Cannot provide a stable +`sort!` although `sortperm!` is properly stable. To reverse, set `rev=true` +rather than `lt=!isless` (otherwise stability of sortperm breaks down). +""" +function bitonic_sort!(c; by = identity, lt = isless, rev = false) where {T} + c_len = if typeof(c) <: Tuple + length(c[1]) + else + length(c) + end + k0 = c_len |> log2 |> ceil |> Int + + blocks_per_mp = CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR) + + # These two outer loops are the same as the serial version outlined here: + # https://en.wikipedia.org/wiki/Bitonic_sorter#Example_code + # Notation: our k/j are the base2 logs of k/j in Wikipedia example + + for k = k0:-1:1 + j_final = (1 + k0 - k) + for j = 1:j_final + + # use Int32 args for indexing when possible --> ~10% faster kernels + I = c_len <= typemax(Int32) ? Int32 : Int + args1 = (c, map(I, (c_len, k, j, j_final))..., by, lt, Val(rev)) + kernel1 = @cuda launch=false comparator_small_kernel(args1...) + config1 = launch_configuration(kernel1.fun, shmem = threads -> bitonic_shmem(c, threads)) + threads1 = prevpow(2, config1.threads) + + args2 = (c, map(I, (c_len, k, j))..., by, lt, Val(rev)) + kernel2 = @cuda launch=false comparator_kernel(args2...) 
+            config2 = launch_configuration(kernel2.fun, shmem = threads -> bitonic_shmem(c, threads))
+            threads2 = prevpow(2, config2.threads)
+
+            threads = min(threads1, threads2)
+            min_pseudo_block = threads ÷ blocks_per_mp
+            log_threads = threads |> log2 |> Int
+
+            if k0 - k - j + 2 <= log_threads
+                pseudo_block_size = 1 << abs(j_final + 1 - j)
+                _block_size = if pseudo_block_size <= min_pseudo_block
+                    (pseudo_block_size, min_pseudo_block ÷ pseudo_block_size)
+                else
+                    (pseudo_block_size, 1)
+                end
+                b = nextpow(2, cld(c_len, prod(_block_size)))
+                kernel1(args1...; blocks = b, threads = _block_size, shmem = bitonic_shmem(c, _block_size))
+                break
+            else
+                b = nextpow(2, cld(c_len, threads))
+                kernel2(args2...; blocks = b, threads = threads)
+            end
+        end
+    end
+end
+end
+
 # Base interface implementation
 
-using .Quicksort
+using .BitonicSortImpl
+using .QuickSortImpl
 
-function Base.sort!(c::AnyCuArray; dims::Integer, lt=isless, by=identity, rev=false)
+
+abstract type SortingAlgorithm end
+struct QuickSortAlg <: SortingAlgorithm end
+struct BitonicSortAlg <: SortingAlgorithm end
+
+const QuickSort = QuickSortAlg()
+const BitonicSort = BitonicSortAlg()
+
+
+function Base.sort!(c::AnyCuVector, alg::QuickSortAlg; lt=isless, by=identity, rev=false)
     # for reverse sorting, invert the less-than function
     if rev
         lt = !lt
     end
-    quicksort!(c; lt, by, dims)
+    quicksort!(c; lt, by, dims=1)
     return c
 end
 
-function Base.sort!(c::AnyCuVector; lt=isless, by=identity, rev=false)
-    # for reverse sorting, invert the less-than function
+function Base.sort!(c::AnyCuVector, alg::BitonicSortAlg; kwargs...)
+    return bitonic_sort!(c; kwargs...)
+end
+
+function Base.sort!(c::AnyCuVector; alg::SortingAlgorithm=BitonicSort, kwargs...)
+    return sort!(c, alg; kwargs...)
+end
+
+function Base.sort!(c::AnyCuArray; dims::Integer, lt=isless, by=identity, rev=false)
+    # for multi-dimensional sorting, only quicksort is supported, so no alg keyword
     if rev
         lt = !lt
    end
-    quicksort!(c; lt, by, dims=1)
+    quicksort!(c; lt, by, dims)
     return c
 end
 
@@ -536,3 +956,18 @@ end
 function Base.partialsort(c::AnyCuArray, k::Union{Integer, OrdinalRange}; kwargs...)
     return partialsort!(copy(c), k; kwargs...)
 end
+
+function Base.sortperm!(I::AnyCuArray{T}, c::AnyCuArray; initialized=false, kwargs...) where T
+    if length(I) != length(c)
+        throw(ArgumentError("index vector must have the same length/indices as the source vector"))
+    end
+    if !initialized
+        I .= one(T):T(length(I))
+    end
+    bitonic_sort!((c, I); kwargs...)
+    return I
+end
+
+function Base.sortperm(c::AnyCuArray; kwargs...)
+    return sortperm!(CuArray(1:length(c)), c; initialized=true, kwargs...)
+end
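+
+# Usage sketch (illustrative):
+#
+#     c = CUDA.rand(Float32, 1 << 20)
+#     sort!(c)                        # defaults to alg=BitonicSort
+#     sort!(c; alg=CUDA.QuickSort)    # or select the algorithm explicitly
+#     I = sortperm(c)                 # stable permutation via bitonic_sort!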
diff --git a/test/sorting.jl b/test/sorting.jl
index 02e400e6cb..79152e2d47 100644
--- a/test/sorting.jl
+++ b/test/sorting.jl
@@ -3,7 +3,7 @@ using DataStructures
 
 @testset "quicksort" begin
 
-import CUDA.Quicksort: flex_lt, find_partition, quicksort!,
+import CUDA.QuickSortImpl: flex_lt, find_partition, quicksort!,
                        partition_batches_kernel, consolidate_batch_partition, bubble_sort
 
 @testset "integer functions" begin
@@ -169,7 +169,7 @@ end
 """
 Tests if `c` is a valid sort of `a`
 """
-function check_equivalence(a::Vector, c::Vector; kwargs...)
+function check_equivalence(a::Vector, c::Vector; alg=nothing, kwargs...)
     counter(a) == counter(c) && issorted(c; kwargs...)
 end
 
@@ -239,6 +239,22 @@ function check_partialsort!(T, N, partial_k, f=identity; kwargs...)
     right_size && check_partial_equivalence(original_arr, host_result, partial_k; kwargs...)
 end
 
+function check_sortperm!(i, T, N; kwargs...)
+    I = CuArray(i)
+    a = rand(T, N)
+    c = CuArray(a)
+    CUDA.@sync sortperm!(I, c; kwargs...)
+    return Array(I) == sortperm!(i, a; kwargs...)
+end
+
+
+function check_sortperm(T, N; kwargs...)
+    a = rand(T, N)
+    c = CuArray(a)
+    I = CUDA.@sync sortperm(c; kwargs...)
+    return Array(I) == sortperm(a; kwargs...)
+end
+
 # Makes sure that non-maximally-large block sizes don't result in race conds
 @testset "reduced block sizes" begin
     function init()
@@ -265,58 +281,86 @@ end
 end
 
 @testset "interface" begin
-    # pre-sorted
-    @test check_sort!(Int, 1000000)
-    @test check_sort!(Int32, 1000000)
-    @test check_sort!(Float64, 1000000)
-    @test check_sort!(Float32, 1000000)
-    @test check_sort!(Int32, 1000000; rev=true)
-    @test check_sort!(Float32, 1000000; rev=true)
-
-    # reverse sorted
-    @test check_sort!(Int32, 1000000, x -> -x)
-    @test check_sort!(Float32, 1000000, x -> -x)
-    @test check_sort!(Int32, 1000000, x -> -x; rev=true)
-    @test check_sort!(Float32, 1000000, x -> -x; rev=true)
-
-    @test check_sort!(Int, 10000, x -> rand(Int))
-    @test check_sort!(Int32, 10000, x -> rand(Int32))
-    @test check_sort!(Int8, 10000, x -> rand(Int8))
-    @test check_sort!(Float64, 10000, x -> rand(Float64))
-    @test check_sort!(Float32, 10000, x -> rand(Float32))
-    @test check_sort!(Float16, 10000, x -> rand(Float16))
-
-    # non-uniform distributions
-    @test check_sort!(UInt8, 100000, x -> round(255 * rand() ^ 2))
-    @test check_sort!(UInt8, 100000, x -> round(255 * rand() ^ 3))
-
-    # more copies of each value than can fit in one block
-    @test check_sort!(Int8, 4000000, x -> rand(Int8))
-
-    # multiple dimensions
-    @test check_sort!(Int32, (4, 50000, 4); dims=2)
-    @test check_sort!(Int32, (4, 4, 50000); dims=3, rev=true)
-
-    # large sizes
-    @test check_sort!(Float32, 2^25)
-
-    # various sync depths
-    for depth in 0:4
-        CUDA.limit!(CUDA.LIMIT_DEV_RUNTIME_SYNC_DEPTH, depth)
-        @test check_sort!(Int, 100000, x -> rand(Int))
-    end
-
-    # using a `by` argument
-    @test check_sort(Float32, 100000; by=x->abs(x - 0.5))
-    @test check_sort!(Float32, (100000, 4); by=x->abs(x - 0.5), dims=1)
-    @test check_sort!(Float32, (4, 100000); by=x->abs(x - 0.5), dims=2)
-    @test check_sort!(Float64, 400000; by=x->8*x-round(8*x))
-    @test check_sort!(Float64, (100000, 4); by=x->8*x-round(8*x), dims=1)
-    @test check_sort!(Float64, (4, 100000); by=x->8*x-round(8*x), dims=2)
-    # target bubble sort by using sub-blocksize input:
-    @test check_sort!(Int, 200; by=x->x % 2)
-    @test check_sort!(Int, 200; by=x->x % 3)
-    @test check_sort!(Int, 200; by=x->x % 4)
+    @testset "quicksort" begin
+        # pre-sorted
+        @test check_sort!(Int, 1000000; alg=CUDA.QuickSort)
+        @test check_sort!(Int32, 1000000; alg=CUDA.QuickSort)
+        @test check_sort!(Float64, 1000000; alg=CUDA.QuickSort)
+        @test check_sort!(Float32, 1000000; alg=CUDA.QuickSort)
+        @test check_sort!(Int32, 1000000; rev=true)
+        @test check_sort!(Float32, 1000000; rev=true)
+
+        # reverse sorted
+        @test check_sort!(Int32, 1000000, x -> -x; alg=CUDA.QuickSort)
+        @test check_sort!(Float32, 1000000, x -> -x; alg=CUDA.QuickSort)
+        @test check_sort!(Int32, 1000000, x -> -x; rev=true, alg=CUDA.QuickSort)
+        @test check_sort!(Float32, 1000000, x -> -x; rev=true, alg=CUDA.QuickSort)
+
+        @test check_sort!(Int, 10000, x -> rand(Int); alg=CUDA.QuickSort)
+        @test check_sort!(Int32, 10000, x -> rand(Int32); alg=CUDA.QuickSort)
+        @test check_sort!(Int8, 10000, x -> rand(Int8); alg=CUDA.QuickSort)
+        @test check_sort!(Float64, 10000, x -> rand(Float64); alg=CUDA.QuickSort)
+        @test check_sort!(Float32, 10000, x -> rand(Float32); alg=CUDA.QuickSort)
+        @test check_sort!(Float16, 10000, x -> rand(Float16); alg=CUDA.QuickSort)
+
+        # non-uniform distributions
+        @test check_sort!(UInt8, 100000, x -> round(255 * rand() ^ 2); alg=CUDA.QuickSort)
+        @test check_sort!(UInt8, 100000, x -> round(255 * rand() ^ 3); alg=CUDA.QuickSort)
+
+        # more copies of each value than can fit in one block
+        @test check_sort!(Int8, 4000000, x -> rand(Int8); alg=CUDA.QuickSort)
+
+        # multiple dimensions
+        @test check_sort!(Int32, (4, 50000, 4); dims=2)
+        @test check_sort!(Int32, (4, 4, 50000); dims=3, rev=true)
+
+        # large sizes
+        @test check_sort!(Float32, 2^25; alg=CUDA.QuickSort)
+
+        # various sync depths
+        for depth in 0:4
+            CUDA.limit!(CUDA.LIMIT_DEV_RUNTIME_SYNC_DEPTH, depth)
+            @test check_sort!(Int, 100000, x -> rand(Int); alg=CUDA.QuickSort)
+        end
+
+        # using a `by` argument
+        @test check_sort(Float32, 100000; by=x->abs(x - 0.5), alg=CUDA.QuickSort)
+        @test check_sort!(Float32, (100000, 4); by=x->abs(x - 0.5), dims=1)
+        @test check_sort!(Float32, (4, 100000); by=x->abs(x - 0.5), dims=2)
+        @test check_sort!(Float64, 400000; by=x->8*x-round(8*x), alg=CUDA.QuickSort)
+        @test check_sort!(Float64, (100000, 4); by=x->8*x-round(8*x), dims=1)
+        @test check_sort!(Float64, (4, 100000); by=x->8*x-round(8*x), dims=2)
+        # target bubble sort by using sub-blocksize input:
+        @test check_sort!(Int, 200; by=x->x % 2, alg=CUDA.QuickSort)
+        @test check_sort!(Int, 200; by=x->x % 3, alg=CUDA.QuickSort)
+        @test check_sort!(Int, 200; by=x->x % 4, alg=CUDA.QuickSort)
+    end # end quicksort tests
+
+    @testset "bitonic sort" begin
+        # test various types
+        @test check_sort!(Int, 10000, x -> rand(Int); alg=CUDA.BitonicSort)
+        @test check_sort!(Int32, 10000, x -> rand(Int32); alg=CUDA.BitonicSort)
+        @test check_sort!(Int8, 10000, x -> rand(Int8); alg=CUDA.BitonicSort)
+        @test check_sort!(Float64, 10000, x -> rand(Float64); alg=CUDA.BitonicSort)
+        @test check_sort!(Float32, 10000, x -> rand(Float32); alg=CUDA.BitonicSort)
+        @test check_sort!(Float16, 10000, x -> rand(Float16); alg=CUDA.BitonicSort)
+
+        # test various sizes
+        @test check_sort!(Float32, 1, x -> rand(Float32); alg=CUDA.BitonicSort)
+        @test check_sort!(Float32, 2, x -> rand(Float32); alg=CUDA.BitonicSort)
+        @test check_sort!(Float32, 3, x -> rand(Float32); alg=CUDA.BitonicSort)
+        @test check_sort!(Float32, 4, x -> rand(Float32); alg=CUDA.BitonicSort)
+        @test check_sort!(Float32, 1 << 16 + 0, x -> rand(Float32); alg=CUDA.BitonicSort)
+        @test check_sort!(Float32, 1 << 16 + 1, x -> rand(Float32); alg=CUDA.BitonicSort)
+        @test check_sort!(Float32, 1 << 16 + 31, x -> rand(Float32); alg=CUDA.BitonicSort)
+        @test check_sort!(Float32, 1 << 16 + 32, x -> rand(Float32); alg=CUDA.BitonicSort)
+        @test check_sort!(Float32, 1 << 16 + 33, x -> rand(Float32); alg=CUDA.BitonicSort)
+        @test check_sort!(Float32, 1 << 16 + 127, x -> rand(Float32); alg=CUDA.BitonicSort)
+        @test check_sort!(Float32, 1 << 16 + 128, x -> rand(Float32); alg=CUDA.BitonicSort)
+        @test check_sort!(Float32, 1 << 16 + 129, x -> rand(Float32); alg=CUDA.BitonicSort)
+    end # end bitonic tests
+
+    @test_throws MethodError check_sort!(Int, (100, 100); alg=CUDA.BitonicSort, dims=1)
 
     #partial sort
     @test check_partialsort!(Int, 100000, 1)
@@ -329,6 +373,24 @@ end
     @test check_partialsort!(Float32, 100000, 50000; by=x->abs(x - 0.5))
     @test check_partialsort!(Float32, 100000, 10000:20000; by=x->abs(x - 0.5))
     @test check_partialsort!(Float32, 100000, 1:100000; by=x->abs(x - 0.5))
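+
+    # (illustrative) a quick consequence of stability: with all-equal keys, a
+    # stable sortperm must return the identity permutation, e.g.
+    #     Array(sortperm(CUDA.zeros(Float32, 4))) == [1, 2, 3, 4]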
+
+    # sortperm
+    # A set of 1e6 Float32s has ~9.4e5 unique values: stability is non-trivial
+    @test check_sortperm(Float32, 1000000)
+    @test check_sortperm(Float32, 1000000; rev=true)
+    @test check_sortperm(Float32, 1000000; by=x->abs(x-0.5f0))
+    @test check_sortperm(Float32, 1000000; rev=true, by=x->abs(x-0.5f0))
+    @test check_sortperm(Float64, 1000000)
+    @test check_sortperm(Float64, 1000000; rev=true)
+    @test check_sortperm(Float64, 1000000; by=x->abs(x-0.5))
+    @test check_sortperm(Float64, 1000000; rev=true, by=x->abs(x-0.5))
+    # check with Int32 indices
+    @test check_sortperm!(collect(Int32(1):Int32(1000000)), Float32, 1000000)
+    # `initialized` kwarg
+    @test check_sortperm!(collect(Int32(1):Int32(1000000)), Float32, 1000000; initialized=true)
+    @test check_sortperm!(collect(Int32(1):Int32(1000000)), Float32, 1000000; initialized=false)
+    # expected error case
+    @test_throws ArgumentError sortperm!(CuArray(1:3), CuArray(1:4))
 end
 end