Hi,
I've been testing a simple MWE to see whether I can get my model training with the unified memory feature of the latest CUDA.jl release.
The problem does not appear to be within CUDA.jl itself, but rather with NNlib.jl defining conflicting broadcast rules when the model is in device memory and the data is in unified memory.
The first scenario is a model in device memory with data in a unified memory buffer:
using CUDA
using Flux

struct TestLayer
    chain
end

function TestLayer(input_size::Int)
    chain = Chain(Dense(input_size, input_size),
                  Dense(input_size, input_size),
                  Dense(input_size, input_size))
    TestLayer(chain)
end

Flux.@functor TestLayer

(tl::TestLayer)(x) = tl.chain(x)

X = cu(rand(Float32, 128, 128, 512), unified=true)

# First issue: model in device memory
m = TestLayer(128) |> gpu

# Second issue: model in unified memory
# m = cu(TestLayer(128), unified=true)

m(X)
This errors out with:
ERROR: conflicting broadcast rules defined
Broadcast.BroadcastStyle(::CUDA.CuArrayStyle{2, CUDA.Mem.UnifiedBuffer}, ::CUDA.CuArrayStyle{1, CUDA.Mem.DeviceBuffer}) = CUDA.CuArrayStyle{2, CUDA.Mem.UnifiedBuffer}()
Broadcast.BroadcastStyle(::CUDA.CuArrayStyle{1, CUDA.Mem.DeviceBuffer}, ::CUDA.CuArrayStyle{2, CUDA.Mem.UnifiedBuffer}) = CUDA.CuArrayStyle{2, CUDA.Mem.DeviceBuffer}()
One of these should be undefined (and thus return Broadcast.Unknown).
Stacktrace:
[1] error(s::String)
@ Base ./error.jl:35
[2] result_join(::CUDA.CuArrayStyle{…}, ::CUDA.CuArrayStyle{…}, ::CUDA.CuArrayStyle{…}, ::CUDA.CuArrayStyle{…})
@ Base.Broadcast ./broadcast.jl:501
[3] result_style(s1::CUDA.CuArrayStyle{2, CUDA.Mem.UnifiedBuffer}, s2::CUDA.CuArrayStyle{1, CUDA.Mem.DeviceBuffer})
@ Base.Broadcast ./broadcast.jl:485
[4] combine_styles(c1::CuArray{Float32, 2, CUDA.Mem.UnifiedBuffer}, c2::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
@ Base.Broadcast ./broadcast.jl:461
[5] broadcasted
@ ./broadcast.jl:1347 [inlined]
[6] broadcast(::typeof(+), ::CuArray{Float32, 2, CUDA.Mem.UnifiedBuffer}, ::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
@ Base.Broadcast ./broadcast.jl:841
[7] adjoint
@ ~/.julia/packages/Zygote/jxHJc/src/lib/broadcast.jl:82 [inlined]
[8] _pullback
@ ~/.julia/packages/ZygoteRules/M4xmc/src/adjoint.jl:67 [inlined]
[9] Dense
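For what it's worth, the broadcast-rule conflict seems reproducible without Flux at all: mixing a unified-memory array and a device-memory array in a single broadcast should hit the same result_style path. A minimal sketch based on frame [6] above, untested beyond my setup:

using CUDA

a = cu(rand(Float32, 4, 4), unified=true)  # matrix in unified memory
b = CUDA.rand(Float32, 4)                  # vector in device memory
a .+ b  # combines CuArrayStyle{2, UnifiedBuffer} with CuArrayStyle{1, DeviceBuffer}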
The second scenario is when both the model and the data are in unified memory, which leads to a different compilation error.
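Concretely, this corresponds to running the MWE above with the second option uncommented:

m = cu(TestLayer(128), unified=true)
m(X)

With that change, the forward pass fails with: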
ERROR: GPU compilation of MethodInstance for (::GPUArrays.var"#broadcast_kernel#38")(::CUDA.CuKernelContext, ::CuDeviceMatrix{…}, ::Base.Broadcast.Broadcasted{…}, ::Int64) failed
KernelError: passing and using non-bitstype argument
Argument 4 to your kernel function is of type Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{2, CUDA.Mem.UnifiedBuffer}, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}, typeof(identity), Tuple{Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{2, CUDA.Mem.UnifiedBuffer}, Nothing, typeof(+), Tuple{Base.Broadcast.Extruded{CuDeviceMatrix{Float32, 1}, Tuple{Bool, Bool}, Tuple{Int64, Int64}}, Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}}}}}}, which is not isbits:
.args is of type Tuple{Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{2, CUDA.Mem.UnifiedBuffer}, Nothing, typeof(+), Tuple{Base.Broadcast.Extruded{CuDeviceMatrix{Float32, 1}, Tuple{Bool, Bool}, Tuple{Int64, Int64}}, Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}}}}} which is not isbits.
.1 is of type Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{2, CUDA.Mem.UnifiedBuffer}, Nothing, typeof(+), Tuple{Base.Broadcast.Extruded{CuDeviceMatrix{Float32, 1}, Tuple{Bool, Bool}, Tuple{Int64, Int64}}, Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}}}} which is not isbits.
.args is of type Tuple{Base.Broadcast.Extruded{CuDeviceMatrix{Float32, 1}, Tuple{Bool, Bool}, Tuple{Int64, Int64}}, Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}}} which is not isbits.
.2 is of type Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}} which is not isbits.
.x is of type Vector{Float32} which is not isbits.
Stacktrace:
[1] check_invocation(job::GPUCompiler.CompilerJob)
@ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/validation.jl:92
[2] macro expansion
@ ~/.julia/packages/GPUCompiler/U36Ed/src/driver.jl:123 [inlined]
[3] macro expansion
@ ~/.julia/packages/TimerOutputs/RsWnF/src/TimerOutput.jl:253 [inlined]
[4]
@ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/driver.jl:121
[5] codegen
@ ~/.julia/packages/GPUCompiler/U36Ed/src/driver.jl:110 [inlined]
[6] compile(target::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool)
@ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/driver.jl:106
[7] compile
@ ~/.julia/packages/GPUCompiler/U36Ed/src/driver.jl:98 [inlined]
[8] #1072
@ ~/.julia/packages/CUDA/htRwP/src/compiler/compilation.jl:247 [inlined]
[9] JuliaContext(f::CUDA.var"#1072#1075"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}})
@ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/driver.jl:47
[10] compile(job::GPUCompiler.CompilerJob)
@ CUDA ~/.julia/packages/CUDA/htRwP/src/compiler/compilation.jl:246
[11] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/execution.jl:125
[12] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/execution.jl:103
[13] macro expansion
I suspect the two issues might be related.
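One workaround I am considering for the second issue is to convert each parameter array to unified memory explicitly via fmap, instead of calling cu on the whole struct (judging from the Vector{Float32} left in the kernel arguments above, the struct conversion does not reach the bias). This is only a sketch under that assumption, not a verified fix; to_unified is a hypothetical helper:

using CUDA, Flux

# Hypothetical helper: move array leaves to unified memory, leave other leaves alone.
to_unified(x::AbstractArray) = cu(x, unified=true)
to_unified(x) = x

# Continuing from the MWE above:
m = Flux.fmap(to_unified, TestLayer(128))
X = cu(rand(Float32, 128, 128, 512), unified=true)
m(X)  # both model and data should now live in UnifiedBuffer memory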