Commit 1b24915
Merge pull request #470 from pxl-th/amdgpu-extension: Add AMDGPU extension
2 parents: b1226e8 + 44f7b3d

17 files changed: +422 additions, -0 deletions

.buildkite/pipeline.yml (+28)

@@ -55,6 +55,34 @@ steps:
     if: build.pull_request.labels includes "benchmark"
     timeout_in_minutes: 30

+  - label: "AMDGPU - Julia 1.9 - No Artifacts"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: 1.9-nightly
+      - JuliaCI/julia-test#v1:
+      - JuliaCI/julia-coverage#v1:
+          codecov: true
+          dirs:
+            - src
+            - ext
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "*"
+    command:
+      - julia -e """
+          using TOML;
+          conf = TOML.parse(read(\"Project.toml\", String));
+          push!(conf[\"targets\"][\"test\"], \"AMDGPU\");
+          open(io -> TOML.print(io, conf), \"Project.toml\", \"w\");
+        """
+    timeout_in_minutes: 30
+    env:
+      JULIA_AMDGPU_CORE_MUST_LOAD: "1"
+      JULIA_AMDGPU_HIP_MUST_LOAD: "1"
+      JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"
+      NNLIB_TEST_AMDGPU: true
+
  # - label: "GPU julia nightly"
  #   plugins:
  #     - JuliaCI/julia#v1:

Project.toml (+8)

@@ -11,13 +11,21 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

+[weakdeps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+
+[extensions]
+AMDGPUExt = "AMDGPU"
+
 [compat]
+AMDGPU = "0.4.7"
 Adapt = "2, 3.2"
 ChainRulesCore = "1.13"
 Requires = "0.5, 1.0"
 julia = "1.6"

 [extras]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
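
A minimal sketch of how the new weak dependency behaves, assuming Julia 1.9's package extensions and that AMDGPU.jl is installed: the AMDGPUExt module is compiled and loaded automatically once both NNlib and AMDGPU are loaded in the same session, and its presence can be checked with Base.get_extension.

using NNlib
using AMDGPU   # loading AMDGPU alongside NNlib triggers the AMDGPUExt extension

# Returns the extension module once it has loaded, `nothing` otherwise.
ext = Base.get_extension(NNlib, :AMDGPUExt)
@assert ext !== nothing "AMDGPUExt did not load; check Julia >= 1.9 and that AMDGPU is installed"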

docs/src/index.md (+1)

@@ -5,3 +5,4 @@
 For use with automatic differentiation, this package defines gradients using [ChainRules.jl](https://github.com/JuliaDiff/ChainRules.jl). These will be seen by various packages including [Zygote.jl](https://github.com/FluxML/Zygote.jl).

 To use these functions with [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) you will need [NNlibCUDA.jl](https://github.com/FluxML/NNlibCUDA.jl) as well.
+For [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) you will need to load it and NNlib in the same Julia session.
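
A minimal usage sketch of the note above, assuming an ROCm-capable GPU and that both packages are installed; the array sizes are illustrative only.

using AMDGPU   # loading AMDGPU next to NNlib activates the extension
using NNlib

A = ROCArray(rand(Float32, 3, 3, 2))
B = ROCArray(rand(Float32, 3, 3, 2))
C = NNlib.batched_mul(A, B)   # runs on the GPU via the rocBLAS-backed method added here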

ext/AMDGPUExt/AMDGPUExt.jl (new file, +61)

module AMDGPUExt

using Adapt
using AMDGPU
using AMDGPU.MIOpen
using ChainRulesCore
using NNlib
using NNlib: BatchedAdjoint, BatchedTranspose, BatchedAdjOrTrans
using NNlib: DenseConvDims, PoolDims

const MIOPENFloat = Union{Float16, Float32}

const ROCBatchedAdjoint{T} = BatchedAdjoint{T, <: ROCArray{T}}
const ROCBatchedTranspose{T} = BatchedTranspose{T, <: ROCArray{T}}
const ROCBatchedAdjOrTrans{T} = Union{ROCBatchedAdjoint{T}, ROCBatchedTranspose{T}}
const WrappedROCBatchedAdjOrTrans{T, N} = Adapt.WrappedArray{T, N, ROCBatchedAdjOrTrans{T}, ROCBatchedAdjOrTrans{T}}
const AnyROCBatchedAdjOrTrans = Union{ROCBatchedAdjOrTrans, WrappedROCBatchedAdjOrTrans}

function Base.convert(::Type{T}, b::AnyROCBatchedAdjOrTrans) where {T <: Array}
    Base.convert(T, adapt(Array, b))
end

function Base.Array{T, N}(b::AnyROCBatchedAdjOrTrans) where {T, N}
    Array{T, N}(adapt(Array, b))
end

Base.collect(b::AnyROCBatchedAdjOrTrans) = collect(adapt(Array, b))

function Base.show(
    io::IO, mime::MIME{Symbol("text/plain")}, x::AnyROCBatchedAdjOrTrans,
)
    show(io, mime, adapt(Array, x))
end

Base.show(io::IO, x::AnyROCBatchedAdjOrTrans) = show(io, adapt(Array, x))

Base.display(x::AnyROCBatchedAdjOrTrans) = display(adapt(Array, x))

function NNlib._batched_gemm!(
    ::Type{<: ROCArray}, transA::Char, transB::Char, α, A, B, β, C,
)
    AMDGPU.rocBLAS.gemm_batched!(transA, transB, α, A, B, β, C)
end

function nnlib_padding(dims)
    pd = NNlib.padding(dims)
    if !all(pd[1:2:end] .== pd[2:2:end])
        @warn """
        MIOpen does not support asymmetric padding, defaulting to symmetric choice:
        $pd -> $(pd[1:2:end]).
        """ maxlog=1
    end
    pd[1:2:end]
end

include("conv.jl")
include("pool.jl")
include("softmax.jl")
include("activations.jl")

end
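
To illustrate the convert/collect/show overloads above, a short hedged example (names and sizes are illustrative): wrapped batched adjoints and transposes of ROCArrays are moved to the CPU via Adapt before being printed or collected, which is intended to avoid slow scalar indexing on the GPU.

using AMDGPU, NNlib

x  = ROCArray(rand(Float32, 3, 4, 2))
bt = NNlib.batched_transpose(x)   # a BatchedTranspose wrapping a ROCArray

# The Base.collect overload above adapts the wrapper to a CPU Array first.
cpu = collect(bt)                 # an ordinary Array{Float32, 3}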

ext/AMDGPUExt/activations.jl (new file, +16)

for (f, op) in [
    NNlib.relu => MIOpen.relu,
    NNlib.relu6 => x -> MIOpen.clippedrelu(x, 6),
    NNlib.softplus => MIOpen.softrelu,
    NNlib.σ => MIOpen.sigmoid,
    Base.tanh => MIOpen.tanh,
    # TODO define for leakyrelu, elu, etc.?
]
    @eval function Base.materialize(
        bc::Broadcast.Broadcasted{<:Any,<:Any,typeof($f),<:Tuple{ROCArray{<:MIOPENFloat}}}
    )
        return $op(bc.args[1])
    end
end

Base.broadcasted(::typeof(identity), x::ROCArray{T}) where {T<:MIOPENFloat} = x
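
The loop above rewrites whole-array broadcasts whose only argument is a ROCArray into single MIOpen kernel calls. A hedged usage sketch, with illustrative sizes:

using AMDGPU, NNlib

x = ROCArray(randn(Float32, 16))

# These un-fused broadcasts are intercepted by the Base.materialize methods above
# and forwarded to MIOpen.relu, MIOpen.clippedrelu, and MIOpen.softrelu respectively.
y1 = NNlib.relu.(x)
y2 = NNlib.relu6.(x)
y3 = NNlib.softplus.(x)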

ext/AMDGPUExt/conv.jl (new file, +50)

function NNlib.conv!(
    y::ROCArray{T, N}, x::ROCArray{T, N}, w::ROCArray{T, N}, cdims::DenseConvDims,
) where {T <: MIOPENFloat, N}
    NNlib.flipkernel(cdims) || throw(ArgumentError(
        "MIOpen supports only cross-correlation as its convolution implementation."))

    nd = max(0, 4 - N)
    ncdims = NNlib.insert_singleton_spatial_dimension(cdims, nd)
    MIOpen.convolution!(
        NNlib.insert_singleton_spatial_dimension(y, nd),
        NNlib.insert_singleton_spatial_dimension(x, nd),
        NNlib.insert_singleton_spatial_dimension(w, nd);
        padding=nnlib_padding(ncdims), stride=NNlib.stride(ncdims),
        dilation=NNlib.dilation(ncdims), groups=NNlib.groupcount(ncdims))
    return y
end

function NNlib.∇conv_data!(
    dx::ROCArray{T, N}, dy::ROCArray{T, N}, w::ROCArray{T, N}, cdims::DenseConvDims,
) where {T <: MIOPENFloat, N}
    NNlib.flipkernel(cdims) || throw(ArgumentError(
        "MIOpen supports only cross-correlation as its convolution implementation."))

    nd = max(0, 4 - N)
    ncdims = NNlib.insert_singleton_spatial_dimension(cdims, nd)
    MIOpen.∇convolution_data!(
        NNlib.insert_singleton_spatial_dimension(dx, nd),
        NNlib.insert_singleton_spatial_dimension(dy, nd),
        NNlib.insert_singleton_spatial_dimension(w, nd);
        padding=nnlib_padding(ncdims), stride=NNlib.stride(ncdims),
        dilation=NNlib.dilation(ncdims), groups=NNlib.groupcount(ncdims))
    return dx
end

function NNlib.∇conv_filter!(
    dw::ROCArray{T, N}, x::ROCArray{T, N}, dy::ROCArray{T, N}, cdims::DenseConvDims,
) where {T <: MIOPENFloat, N}
    NNlib.flipkernel(cdims) || throw(ArgumentError(
        "MIOpen supports only cross-correlation as its convolution implementation."))

    nd = max(0, 4 - N)
    ncdims = NNlib.insert_singleton_spatial_dimension(cdims, nd)
    MIOpen.∇convolution_weight!(
        NNlib.insert_singleton_spatial_dimension(dw, nd),
        NNlib.insert_singleton_spatial_dimension(dy, nd),
        NNlib.insert_singleton_spatial_dimension(x, nd);
        padding=nnlib_padding(ncdims), stride=NNlib.stride(ncdims),
        dilation=NNlib.dilation(ncdims), groups=NNlib.groupcount(ncdims))
    return dw
end
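
Since MIOpen implements cross-correlation, the methods above require flipkernel=true. A hedged sketch of a 2-D forward pass, with illustrative sizes:

using AMDGPU, NNlib

x = ROCArray(rand(Float32, 8, 8, 3, 2))        # width × height × channels_in × batch
w = ROCArray(rand(Float32, 3, 3, 3, 4))        # kW × kH × channels_in × channels_out
cdims = DenseConvDims(x, w; flipkernel=true)   # required: MIOpen is cross-correlation

y = NNlib.conv(x, w, cdims)   # forwards to MIOpen.convolution! via NNlib.conv! above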

ext/AMDGPUExt/pool.jl (new file, +43)

for poolname in (:maxpool, :meanpool)
    @eval function NNlib.$(poolname)(
        x::ROCArray{T, N}, pdims::PoolDims,
    ) where {T <: MIOPENFloat, N}
        y = similar(x, NNlib.output_size(pdims)..., NNlib.channels_out(pdims), size(x, N))
        nd = max(0, 4 - N)
        npdims = NNlib.insert_singleton_spatial_dimension(pdims, nd)
        MIOpen.$(Symbol("$(poolname)!"))(
            NNlib.insert_singleton_spatial_dimension(y, nd),
            NNlib.insert_singleton_spatial_dimension(x, nd);
            dims=NNlib.kernel_size(npdims), padding=nnlib_padding(npdims),
            stride=NNlib.stride(npdims), do_backward=false)
        return y
    end

    @eval function ChainRulesCore.rrule(
        ::typeof(NNlib.$(poolname)), x::ROCArray{T, N}, pdims::PoolDims,
    ) where {T <: MIOPENFloat, N}
        y = similar(x, NNlib.output_size(pdims)..., NNlib.channels_out(pdims), size(x, N))
        nd = max(0, 4 - N)
        npdims = NNlib.insert_singleton_spatial_dimension(pdims, nd)

        # `workspace` is used in the pullback.
        _, workspace = MIOpen.$(Symbol("$(poolname)!"))(
            NNlib.insert_singleton_spatial_dimension(y, nd),
            NNlib.insert_singleton_spatial_dimension(x, nd);
            dims=NNlib.kernel_size(npdims), padding=nnlib_padding(npdims),
            stride=NNlib.stride(npdims))

        function _pooling_pullback(Δ)
            dx = similar(x)
            MIOpen.$(Symbol("∇$(poolname)!"))(
                NNlib.insert_singleton_spatial_dimension(dx, nd),
                NNlib.insert_singleton_spatial_dimension(unthunk(Δ), nd),
                NNlib.insert_singleton_spatial_dimension(y, nd),
                NNlib.insert_singleton_spatial_dimension(x, nd);
                dims=NNlib.kernel_size(npdims), padding=nnlib_padding(npdims),
                stride=NNlib.stride(npdims), workspace)
            return NoTangent(), dx, NoTangent()
        end
        y, _pooling_pullback
    end
end
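
A hedged sketch exercising the forward method and the rrule above directly through ChainRulesCore (sizes illustrative; in practice an AD package such as Zygote would call the rrule for you):

using AMDGPU, NNlib, ChainRulesCore

x = ROCArray(rand(Float32, 8, 8, 3, 2))
pdims = PoolDims(x, 2)

y = NNlib.maxpool(x, pdims)                                # MIOpen forward pass
y2, pullback = ChainRulesCore.rrule(NNlib.maxpool, x, pdims)
_, dx, _ = pullback(ROCArray(ones(Float32, size(y2))))     # gradient w.r.t. x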

ext/AMDGPUExt/softmax.jl (new file, +11)

for fname in (:softmax, :logsoftmax)
    @eval function NNlib.$(fname)(x::ROCArray{T}; dims = 1) where T <: MIOPENFloat
        MIOpen.$(fname)(x; dims)
    end

    @eval function NNlib.$(Symbol("∇$(fname)"))(
        dy::ROCArray{T, N}, x::ROCArray{T, N}, y::ROCArray{T, N}; dims = 1,
    ) where {T <: MIOPENFloat, N}
        MIOpen.$(Symbol("∇$(fname)!"))(dy, y; dims)
    end
end
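
A hedged sketch of the forward methods defined above (dims and sizes are illustrative):

using AMDGPU, NNlib

x  = ROCArray(rand(Float32, 10, 32))
p  = NNlib.softmax(x; dims=1)      # MIOpen-backed softmax over each column
lp = NNlib.logsoftmax(x; dims=1)   # MIOpen-backed log-softmax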

test/amd/activations.jl (new file, +10)

@testset "Compare CPU & GPU" begin
    for (T, atol) in ((Float16, 1f-2), (Float32, 1f-5))
        x = randn(T, 16)
        gputest(x -> NNlib.relu.(x), x; atol)
        gputest(x -> NNlib.relu6.(x), x; atol)
        gputest(x -> NNlib.softplus.(x), x; atol)
        gputest(x -> tanh.(x), x; atol)
        gputest(x -> identity.(x), x; atol)
    end
end

test/amd/batched_mul.jl (new file, +34)

@testset "batched_mul" begin
    A = rand(Float32, 3, 3, 2)
    B = rand(Float32, 3, 3, 2)
    dA, dB = ROCArray.((A, B))

    C = batched_mul(A, B)
    @test ROCArray(C) ≈ batched_mul(dA, dB)

    Ct = batched_mul(batched_transpose(A), B)
    @test ROCArray(Ct) ≈ batched_mul(batched_transpose(dA), dB)

    Ca = batched_mul(A, batched_adjoint(B))
    @test ROCArray(Ca) ≈ batched_mul(dA, batched_adjoint(dB))

    # 5-arg batched_mul!
    C .= pi
    batched_mul!(C, A, B, 2f0, 3f0)
    Cpi = ROCArray(similar(C)) .= pi
    @test ROCArray(C) ≈ batched_mul!(Cpi, dA, dB, 2f0, 3f0)

    # PermutedDimsArray
    @test ROCArray(Ct) ≈ batched_mul(PermutedDimsArray(dA, (2, 1, 3)), dB)

    # FIXME same but with (1, 3, 2) errors
    D = permutedims(B, (2, 1, 3))
    Cp = batched_mul(batched_adjoint(A), B)
    @test ROCArray(Cp) ≈ batched_mul(
        batched_adjoint(dA), PermutedDimsArray(ROCArray(D), (2, 1, 3)))

    # Methods which reshape
    M = randn(Float32, 3, 3)
    Cm = batched_mul(A, M)
    @test ROCArray(Cm) ≈ batched_mul(dA, ROCArray(M))
end

test/amd/batched_repr.jl (new file, +43)

function print_array_strs(x)
    str = sprint((io, x)->show(io, MIME"text/plain"(), x), x)
    return @view split(str, '\n')[2:end]
end

@testset "BatchedAdjOrTrans" begin
    x = rand(Float32, 3, 4, 2)
    y = ROCArray(x)

    bax = batched_adjoint(x)
    btx = batched_transpose(x)
    bay = batched_adjoint(y)
    bty = batched_transpose(y)

    @test sprint(show, bax) == sprint(show, bay)
    @test sprint(show, btx) == sprint(show, bty)

    @test print_array_strs(bax) == print_array_strs(bay)
    @test print_array_strs(btx) == print_array_strs(bty)

    @test Array(bax) == Array(bay)
    @test collect(bax) == collect(bay)
    @test Array(btx) == Array(bty)
    @test collect(btx) == collect(bty)

    for shape in (:, (12, 2))
        rbax = reshape(bax, shape)
        rbtx = reshape(btx, shape)
        rbay = reshape(bay, shape)
        rbty = reshape(bty, shape)

        @test sprint(show, rbax) == sprint(show, rbay)
        @test sprint(show, rbtx) == sprint(show, rbty)

        @test print_array_strs(rbax) == print_array_strs(rbay)
        @test print_array_strs(rbtx) == print_array_strs(rbty)

        @test Array(rbax) == Array(rbay)
        @test collect(rbax) == collect(rbay)
        @test Array(rbtx) == Array(rbty)
        @test collect(rbtx) == collect(rbty)
    end
end

test/amd/conv.jl (new file, +9)

@testset "Compare CPU & GPU" begin
    channels, batch = 3, 2
    for T in (Float16, Float32), nd in (1, 2, 3)
        x = rand(Float32, fill(4, nd)..., 3, 1)
        w = rand(Float32, fill(2, nd)..., channels, 4)
        cdims = DenseConvDims(x, w, flipkernel=true)
        gputest((x, w) -> NNlib.conv(x, w, cdims), x, w; atol=1e-4)
    end
end

test/amd/pool.jl (new file, +11)

@testset "Compare CPU & GPU" begin
    channels, batch = 3, 2
    for T in (Float16, Float32), nd in (1, 2, 3)
        x = rand(T, fill(8, nd)..., channels, batch)
        pdims = PoolDims(x, 2)
        # NOTE: Disable grad check for maxpool as *sometimes*
        # it does not *completely* agree with CPU :/
        gputest(x -> NNlib.maxpool(x, pdims), x; checkgrad=false)
        gputest(x -> NNlib.meanpool(x, pdims), x)
    end
end
