Skip to content

Commit 85d80d1

Browse files
Don't spawn if: only one iteration or no threads or threading disabled (#633)
* don't spawn if only one job * fix what appears to be a typo * revertme: temporary test * fix check * rm test * add NNlib.ALLOW_THREADING control * use ScopedValues * use `@with` to avoid new scope * rename do_work functions * add note * v0.9.29
1 parent ec337e6 commit 85d80d1

9 files changed

+200
-94
lines changed

Project.toml

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
name = "NNlib"
22
uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
3-
version = "0.9.28"
3+
version = "0.9.29"
44

55
[deps]
66
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -10,6 +10,7 @@ GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
1010
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
1111
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
1212
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
13+
ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63"
1314
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
1415

1516
[weakdeps]
@@ -43,6 +44,7 @@ GPUArraysCore = "0.1, 0.2"
4344
KernelAbstractions = "0.9.2"
4445
LinearAlgebra = "<0.0.1, 1"
4546
Random = "<0.0.1, 1"
47+
ScopedValues = "1.3.0"
4648
SpecialFunctions = "2"
4749
Statistics = "1"
4850
cuDNN = "1"

docs/src/index.md

+10-1
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,13 @@ for CUDA support, or
1212
```julia
1313
using NNlib, AMDGPU
1414
```
15-
for AMDGPU support.
15+
for AMDGPU support.
16+
17+
## Threading
18+
19+
Various `NNlib` functions utilize available Julia threads on divisible workloads. To disable this, use
20+
the `ScopedValue`-backed switch `NNlib.@disallow_spawns`
21+
i.e.
22+
```julia
23+
NNlib.@disallow_spawns function_that_uses_nnlib()
24+
```

src/NNlib.jl

+19-4
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,26 @@ using LinearAlgebra
1313
using LinearAlgebra.BLAS: @blasfunc, BlasInt
1414
using LinearAlgebra: AdjOrTransAbsMat, Adjoint, BlasFloat, Transpose
1515
using Random
16+
using ScopedValues
1617
using Statistics
1718
using Statistics: mean
1819

1920
const Numeric = Union{AbstractArray{<:T}, T} where {T<:Number}
2021

22+
# internal. TODO: change to an approach where amount of threading is controlled, not just on/off
23+
const ALLOW_SPAWNS = ScopedValue(true)
24+
should_use_spawn() = Threads.nthreads(:default) > 1 && ALLOW_SPAWNS[]
25+
"""
26+
@disallow_spawns ex
27+
28+
Disallow NNlib from using `@spawn` on divisible workloads, e.g. within `conv` etc.
29+
"""
30+
macro disallow_spawns(ex)
31+
quote
32+
@with ALLOW_SPAWNS => false $(esc(ex))
33+
end
34+
end
35+
2136
# Include APIs
2237
include("dim_helpers.jl")
2338
export ConvDims, DenseConvDims, PoolDims, DepthwiseConvDims
@@ -35,7 +50,7 @@ include("dropout.jl")
3550
export dropout, dropout!
3651

3752
include("softmax.jl")
38-
export softmax, softmax!, ∇softmax, ∇softmax!, logsoftmax,
53+
export softmax, softmax!, ∇softmax, ∇softmax!, logsoftmax,
3954
logsoftmax!, ∇logsoftmax, ∇logsoftmax!, logsumexp
4055

4156
include("batched/batchedadjtrans.jl")
@@ -47,9 +62,9 @@ include("gemm.jl")
4762
export grid_sample, ∇grid_sample
4863

4964
include("conv.jl")
50-
export conv, conv!, ∇conv_data, ∇conv_data!, ∇conv_filter,
51-
∇conv_filter!, depthwiseconv, depthwiseconv!,
52-
∇depthwiseconv_data, ∇depthwiseconv_data!,
65+
export conv, conv!, ∇conv_data, ∇conv_data!, ∇conv_filter,
66+
∇conv_filter!, depthwiseconv, depthwiseconv!,
67+
∇depthwiseconv_data, ∇depthwiseconv_data!,
5368
∇depthwiseconv_filter, ∇depthwiseconv_filter!
5469

5570
include("conv_bias_act.jl")

src/conv.jl

+40-10
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ for (front_name, backend, signature) in (
181181
)
182182
# We only define 3d conv primitives, we reshape lower down to get 1d and 2d convolution
183183
@eval begin
184-
184+
185185
function $(Symbol("$(front_name)!"))(
186186
out::AbstractArray{$(signature[1][1]), $(signature[1][2])},
187187
in1::AbstractArray{$(signature[2][1]), $(signature[1][2])},
@@ -202,11 +202,21 @@ for (front_name, backend, signature) in (
202202
C_in = channels_in(cdims) ÷ groupcount(cdims),
203203
C_out = channels_out(cdims) ÷ groupcount(cdims))
204204

205-
Threads.@sync for (xc, wc) in zip(x_cs, w_cs)
205+
function conv_group(xc, wc)
206206
x = @view in1[ntuple(i -> i == 4 ? xc : Colon(), 5)...]
207207
w = @view in2[ntuple(i -> i == 5 ? wc : Colon(), 5)...]
208208
y = @view out[ntuple(i -> i == 4 ? wc : Colon(), 5)...]
209-
Threads.@spawn $(Symbol("$(front_name)_$(backend)!"))(y, x, w, cdims2; kwargs...)
209+
$(Symbol("$(front_name)_$(backend)!"))(y, x, w, cdims2; kwargs...)
210+
end
211+
212+
if should_use_spawn() && length(x_cs) > 1
213+
Threads.@sync for (xc, wc) in zip(x_cs, w_cs)
214+
Threads.@spawn conv_group(xc, wc)
215+
end
216+
else
217+
for (xc, wc) in zip(x_cs, w_cs)
218+
conv_group(xc, wc)
219+
end
210220
end
211221

212222
return out
@@ -246,11 +256,21 @@ for (front_name, backend, signature) in (
246256
C_in = channels_in(cdims) ÷ groupcount(cdims),
247257
C_out = channels_out(cdims) ÷ groupcount(cdims))
248258

249-
Threads.@sync for (xc, yc, wc) in zip(dx_cs, dy_cs, w_cs)
259+
function ∇conv_data_group(xc, yc, wc)
250260
dxv = @view out[ntuple(i -> i == 4 ? xc : Colon(), 5)...]
251261
dyv = @view in1[ntuple(i -> i == 4 ? yc : Colon(), 5)...]
252262
wv = @view in2[ntuple(i -> i == 5 ? wc : Colon(), 5)...]
253-
Threads.@spawn $(Symbol("$(front_name)_$(backend)!"))(dxv, dyv, wv, cdims2; kwargs...)
263+
$(Symbol("$(front_name)_$(backend)!"))(dxv, dyv, wv, cdims2; kwargs...)
264+
end
265+
266+
if should_use_spawn() && length(dx_cs) > 1
267+
Threads.@sync for (xc, yc, wc) in zip(dx_cs, dy_cs, w_cs)
268+
Threads.@spawn ∇conv_data_group(xc, yc, wc)
269+
end
270+
else
271+
for (xc, yc, wc) in zip(dx_cs, dy_cs, w_cs)
272+
∇conv_data_group(xc, yc, wc)
273+
end
254274
end
255275

256276
return out
@@ -288,11 +308,21 @@ for (front_name, backend, signature) in (
288308
C_in = channels_in(cdims) ÷ groupcount(cdims),
289309
C_out = channels_out(cdims) ÷ groupcount(cdims))
290310

291-
Threads.@sync for (wc, xc, yc) in zip(dw_cs, x_cs, dy_cs)
311+
function ∇conv_filter_group(wc, xc, yc)
292312
x = @view in1[ntuple(i -> i == 4 ? xc : Colon(), 5)...]
293313
dy = @view in2[ntuple(i -> i == 4 ? yc : Colon(), 5)...]
294-
dw = @view out[ntuple(i -> i == 5 ? yc : Colon(), 5)...]
295-
Threads.@spawn $(Symbol("$(front_name)_$(backend)!"))(dw, x, dy, cdims2; kwargs...)
314+
dw = @view out[ntuple(i -> i == 5 ? wc : Colon(), 5)...]
315+
$(Symbol("$(front_name)_$(backend)!"))(dw, x, dy, cdims2; kwargs...)
316+
end
317+
318+
if should_use_spawn() && length(dw_cs) > 1
319+
Threads.@sync for (wc, xc, yc) in zip(dw_cs, x_cs, dy_cs)
320+
Threads.@spawn ∇conv_filter_group(wc, xc, yc)
321+
end
322+
else
323+
for (wc, xc, yc) in zip(dw_cs, x_cs, dy_cs)
324+
∇conv_filter_group(wc, xc, yc)
325+
end
296326
end
297327

298328
return out
@@ -306,10 +336,10 @@ for (front_name, backend, signature) in (
306336
# (frontend, backend, (out Array signature, in1 Array signature, in2 Array signature, (parametric Types)))
307337
(:depthwiseconv, :im2col, ((:T, 5), (:T, 5), (:T, 5), :C, (:(T <: G), :(C <: ConvDims)))),
308338
(:depthwiseconv, :direct, ((:yT, :N), (:T1, :N), (:T2, :N), :C, (:yT, :T1, :T2, :N, :(C <: ConvDims)))),
309-
339+
310340
(:∇depthwiseconv_data, :im2col, ((:T, 5), (:T, 5), (:T, 5), :C, (:(T <: G), :(C <: ConvDims)))),
311341
(:∇depthwiseconv_data, :direct, ((:yT, :N), (:T1, :N), (:T2, :N), :C, (:yT, :T1, :T2, :N, :(C <: ConvDims)))),
312-
342+
313343
(:∇depthwiseconv_filter, :im2col, ((:T, 5), (:T, 5), (:T, 5), :C, (:(T <: G), :(C <: ConvDims)))),
314344
(:∇depthwiseconv_filter, :direct, ((:yT, :N), (:T1, :N), (:T2, :N), :C, (:yT, :T1, :T2, :N, :(C <: ConvDims)))),
315345
)

src/gemm.jl

+22-10
Original file line numberDiff line numberDiff line change
@@ -104,22 +104,34 @@ for (gemm, elt) in gemm_datatype_mappings
104104

105105
old_threads = get_num_threads()
106106
set_num_threads(1)
107-
Threads.@sync for ks in Iterators.partition(1:size(C, 3), cld(size(C, 3), n_threads))
108-
Threads.@spawn for k in ks
107+
108+
parts = Iterators.partition(1:size(C, 3), cld(size(C, 3), n_threads))
109+
110+
function gemm!_part(ks)
111+
for k in ks
109112

110113
ptrAk = ptrA + (k-1) * strA * sizeof($elt)
111114
ptrBk = ptrB + (k-1) * strB * sizeof($elt)
112115
ptrCk = ptrC + (k-1) * strC * sizeof($elt)
113116

114117
ccall((@blasfunc($(gemm)), libblas), Nothing,
115-
(Ref{UInt8}, Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt},
116-
Ref{BlasInt}, Ref{$elt}, Ptr{$elt}, Ref{BlasInt},
117-
Ptr{$elt}, Ref{BlasInt}, Ref{$elt}, Ptr{$elt},
118-
Ref{BlasInt}),
119-
transA, transB, m, n,
120-
ka, alpha, ptrAk, max(1,Base.stride(A,2)),
121-
ptrBk, max(1,Base.stride(B,2)), beta, ptrCk,
122-
max(1,Base.stride(C,2)))
118+
(Ref{UInt8}, Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt},
119+
Ref{BlasInt}, Ref{$elt}, Ptr{$elt}, Ref{BlasInt},
120+
Ptr{$elt}, Ref{BlasInt}, Ref{$elt}, Ptr{$elt},
121+
Ref{BlasInt}),
122+
transA, transB, m, n,
123+
ka, alpha, ptrAk, max(1,Base.stride(A,2)),
124+
ptrBk, max(1,Base.stride(B,2)), beta, ptrCk,
125+
max(1,Base.stride(C,2)))
126+
end
127+
end
128+
if should_use_spawn() && length(parts) > 1
129+
Threads.@sync for ks in parts
130+
Threads.@spawn gemm!_part(ks)
131+
end
132+
else
133+
for ks in parts
134+
gemm!_part(ks)
123135
end
124136
end
125137
set_num_threads(old_threads)

src/impl/conv_im2col.jl

+37-22
Original file line numberDiff line numberDiff line change
@@ -47,20 +47,28 @@ function conv_im2col!(
4747

4848
parts = Iterators.partition(axes(x, 5), ceil(Int, size(x, 5) / ntasks))
4949

50-
@sync for (task_n, part) in enumerate(parts)
51-
Threads.@spawn begin
52-
col_slice = col_slice = view(col, :, :, task_n) # col_slice is a task-local workspace
53-
for batch_idx in part
54-
im2col!(col_slice, view(x, :, :, :, :, batch_idx), cdims)
55-
GC.@preserve col_slice w y begin
56-
col_ptr = pointer(col_slice)
57-
w_ptr = pointer(w)
58-
y_ptr = pointer(y, (batch_idx - 1)*M*N + 1)
59-
gemm!(Val(false), Val(false), M, N, K, alpha, col_ptr, w_ptr, beta, y_ptr)
60-
end
50+
function conv_part(task_n, part)
51+
col_slice = col_slice = view(col, :, :, task_n) # col_slice is a task-local workspace
52+
for batch_idx in part
53+
im2col!(col_slice, view(x, :, :, :, :, batch_idx), cdims)
54+
GC.@preserve col_slice w y begin
55+
col_ptr = pointer(col_slice)
56+
w_ptr = pointer(w)
57+
y_ptr = pointer(y, (batch_idx - 1)*M*N + 1)
58+
gemm!(Val(false), Val(false), M, N, K, alpha, col_ptr, w_ptr, beta, y_ptr)
6159
end
6260
end
6361
end
62+
63+
if should_use_spawn() && length(parts) > 1
64+
@sync for (task_n, part) in enumerate(parts)
65+
Threads.@spawn conv_part(task_n, part)
66+
end
67+
else
68+
for (task_n, part) in enumerate(parts)
69+
conv_part(task_n, part)
70+
end
71+
end
6472
return y
6573
end
6674

@@ -152,18 +160,25 @@ function ∇conv_data_im2col!(
152160

153161
parts = Iterators.partition(axes(dx, 5), ceil(Int, size(dx, 5) / ntasks))
154162

155-
@sync for (task_n, part) in enumerate(parts)
156-
Threads.@spawn begin
157-
col_slice = col_slice = view(col, :, :, task_n) # col_slice is a task-local workspace
158-
for batch_idx in part
159-
GC.@preserve col_slice w dy begin
160-
dy_ptr = pointer(dy, (batch_idx - 1)*M*K + 1)
161-
w_ptr = pointer(w)
162-
col_ptr = pointer(col_slice)
163-
gemm!(Val(false), Val(true), M, N, K, alpha, dy_ptr, w_ptr, T(0), col_ptr)
164-
end
165-
col2im!(view(dx, :, :, :, :, batch_idx), col_slice, cdims, beta)
163+
function ∇conv_data_part(task_n, part)
164+
col_slice = col_slice = view(col, :, :, task_n) # col_slice is a task-local workspace
165+
for batch_idx in part
166+
GC.@preserve col_slice w dy begin
167+
dy_ptr = pointer(dy, (batch_idx - 1)*M*K + 1)
168+
w_ptr = pointer(w)
169+
col_ptr = pointer(col_slice)
170+
gemm!(Val(false), Val(true), M, N, K, alpha, dy_ptr, w_ptr, T(0), col_ptr)
166171
end
172+
col2im!(view(dx, :, :, :, :, batch_idx), col_slice, cdims, beta)
173+
end
174+
end
175+
if should_use_spawn() && length(parts) > 1
176+
@sync for (task_n, part) in enumerate(parts)
177+
Threads.@spawn ∇conv_data_part(task_n, part)
178+
end
179+
else
180+
for (task_n, part) in enumerate(parts)
181+
∇conv_data_part(task_n, part)
167182
end
168183
end
169184
return dx

src/impl/depthwiseconv_im2col.jl

+43-29
Original file line numberDiff line numberDiff line change
@@ -30,25 +30,32 @@ function depthwiseconv_im2col!(
3030

3131
dcdims = DenseConvDims(cdims)
3232

33-
@sync for (task_n, part) in enumerate(parts)
34-
Threads.@spawn begin
35-
col_slice = col_slice = view(col, :, :, task_n) # col_slice is a task-local workspace
36-
for batch_idx in part
37-
im2col!(col_slice, view(x, :, :, :, :, batch_idx), dcdims)
38-
39-
# We do a separate convolution for each channel in x, as we must
40-
for c_in in 1:channels_in(cdims)
41-
# Walk each pointer forward as we process each input channel
42-
GC.@preserve col_slice w y begin
43-
col_ptr = pointer(col_slice, (c_in-1)*M*K+1)
44-
w_ptr = pointer(w, (c_in-1)*K*N+1)
45-
y_ptr = pointer(y, ((batch_idx - 1)*channels_in(cdims) + c_in - 1)*M*N + 1)
46-
gemm!(Val(false), Val(false), M, N, K, alpha, col_ptr, w_ptr, beta, y_ptr)
47-
end
33+
function depthwiseconv_part(task_n, part)
34+
col_slice = col_slice = view(col, :, :, task_n) # col_slice is a task-local workspace
35+
for batch_idx in part
36+
im2col!(col_slice, view(x, :, :, :, :, batch_idx), dcdims)
37+
38+
# We do a separate convolution for each channel in x, as we must
39+
for c_in in 1:channels_in(cdims)
40+
# Walk each pointer forward as we process each input channel
41+
GC.@preserve col_slice w y begin
42+
col_ptr = pointer(col_slice, (c_in-1)*M*K+1)
43+
w_ptr = pointer(w, (c_in-1)*K*N+1)
44+
y_ptr = pointer(y, ((batch_idx - 1)*channels_in(cdims) + c_in - 1)*M*N + 1)
45+
gemm!(Val(false), Val(false), M, N, K, alpha, col_ptr, w_ptr, beta, y_ptr)
4846
end
4947
end
5048
end
5149
end
50+
if should_use_spawn() && length(parts) > 1
51+
@sync for (task_n, part) in enumerate(parts)
52+
Threads.@spawn depthwiseconv_part(task_n, part)
53+
end
54+
else
55+
for (task_n, part) in enumerate(parts)
56+
depthwiseconv_part(task_n, part)
57+
end
58+
end
5259
return y
5360
end
5461

@@ -117,22 +124,29 @@ function ∇depthwiseconv_data_im2col!(
117124

118125
parts = Iterators.partition(axes(dx)[end], ceil(Int, size(dx, 5) / ntasks))
119126

120-
@sync for (task_n, part) in enumerate(parts)
121-
Threads.@spawn begin
122-
col_slice = col_slice = view(col, :, :, task_n) # col_slice is a task-local workspace
123-
for batch_idx in part
124-
# We do a separate convolution for each channel in x, as we must
125-
for cidx in 1:channels_in(cdims)
126-
GC.@preserve col_slice w dy begin
127-
# Walk each pointer forward as we process each input channel
128-
dy_ptr = pointer(dy, (batch_idx - 1)*M*K*channels_in(cdims)+(cidx - 1)*K*M + 1)
129-
w_ptr = pointer(w, (cidx - 1)*K*N + 1)
130-
col_ptr = pointer(col_slice, (cidx - 1)*M*N + 1)
131-
gemm!(Val(false), Val(true), M, N, K, alpha, dy_ptr, w_ptr, T(0), col_ptr)
132-
end
127+
function ∇depthwiseconv_data_part(task_n, part)
128+
col_slice = col_slice = view(col, :, :, task_n) # col_slice is a task-local workspace
129+
for batch_idx in part
130+
# We do a separate convolution for each channel in x, as we must
131+
for cidx in 1:channels_in(cdims)
132+
GC.@preserve col_slice w dy begin
133+
# Walk each pointer forward as we process each input channel
134+
dy_ptr = pointer(dy, (batch_idx - 1)*M*K*channels_in(cdims)+(cidx - 1)*K*M + 1)
135+
w_ptr = pointer(w, (cidx - 1)*K*N + 1)
136+
col_ptr = pointer(col_slice, (cidx - 1)*M*N + 1)
137+
gemm!(Val(false), Val(true), M, N, K, alpha, dy_ptr, w_ptr, T(0), col_ptr)
133138
end
134-
col2im!(view(dx, :, :, :, :, batch_idx), col_slice, cdims, beta)
135139
end
140+
col2im!(view(dx, :, :, :, :, batch_idx), col_slice, cdims, beta)
141+
end
142+
end
143+
if should_use_spawn() && length(parts) > 1
144+
@sync for (task_n, part) in enumerate(parts)
145+
Threads.@spawn ∇depthwiseconv_data_part(task_n, part)
146+
end
147+
else
148+
for (task_n, part) in enumerate(parts)
149+
∇depthwiseconv_data_part(task_n, part)
136150
end
137151
end
138152
return dx

0 commit comments

Comments
 (0)