Skip to content

Commit

Permalink
Use Threads.nthreads() instead of num_threads (#163)
Browse files Browse the repository at this point in the history
* Use Threads.nthreads() instead of num_threads

* a few updates and fixes
  • Loading branch information
chriselrod authored Jan 4, 2023
1 parent 430701e commit 5c30e4a
Show file tree
Hide file tree
Showing 7 changed files with 630 additions and 274 deletions.
6 changes: 3 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Octavian"
uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
authors = ["Chris Elrod", "Dilum Aluthge", "Mason Protter", "contributors"]
version = "0.3.18"
version = "0.3.19"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
Expand All @@ -17,11 +17,11 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"

[compat]
ArrayInterface = "3.1.14, 5.0.1, 6"
CPUSummary = "0.1.26"
CPUSummary = "0.1.26, 0.2.1"
IfElse = "0.1"
LoopVectorization = "0.12.86"
ManualMemory = "0.1.1"
PolyesterWeave = "0.1.1"
PolyesterWeave = "0.1.1, 0.2"
Requires = "1"
Static = "0.7.5, 0.8"
ThreadingUtilities = "0.5"
Expand Down
52 changes: 36 additions & 16 deletions src/Octavian.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,41 @@ using Requires: @require

using VectorizationBase, ArrayInterface, LoopVectorization

using VectorizationBase: align, AbstractStridedPointer, zstridedpointer, vsub_nsw, assume,
static_sizeof, StridedPointer, gesp, pause, pick_vector_width, has_feature
using CPUSummary: cache_size, num_cores, num_threads, cache_inclusive, cache_linesize
using VectorizationBase:
align,
AbstractStridedPointer,
zstridedpointer,
vsub_nsw,
assume,
static_sizeof,
StridedPointer,
gesp,
pause,
pick_vector_width,
has_feature
using CPUSummary: cache_size, num_cores, cache_inclusive, cache_linesize
using LoopVectorization: preserve_buffer, CloseOpen, UpperBoundedInteger
using ArrayInterface: size, strides, offsets, indices, axes, StrideIndex
using IfElse: ifelse
using PolyesterWeave
using Static: StaticInt, Zero, One, StaticBool, True, False, gt, eq, StaticFloat64,
roundtostaticint, floortostaticint
using Static:
StaticInt,
Zero,
One,
StaticBool,
True,
False,
gt,
eq,
StaticFloat64,
roundtostaticint,
floortostaticint
using ManualMemory: MemoryBuffer, load, store!

using ThreadingUtilities: _atomic_add!, _atomic_load, _atomic_store!, launch, wait, SPIN

if !(StaticInt <: Base.Integer)
const Integer = Union{Base.Integer, StaticInt}
const Integer = Union{Base.Integer,StaticInt}
end

export StaticInt
Expand All @@ -45,16 +65,16 @@ include("init.jl") # `Octavian.__init__()` is defined in this file
@static if VERSION >= v"1.8.0-beta1"
let
__init__()
A64 = rand(100,100)
matmul(A64,A64)
matmul(A64',A64)
matmul(A64,A64')
matmul(A64',A64')
A32 = rand(Float32,100,100)
matmul(A32,A32)
matmul(A32',A32)
matmul(A32,A32')
matmul(A32',A32')
A64 = rand(100, 100)
matmul(A64, A64)
matmul(A64', A64)
matmul(A64, A64')
matmul(A64', A64')
A32 = rand(Float32, 100, 100)
matmul(A32, A32)
matmul(A32', A32)
matmul(A32, A32')
matmul(A32', A32')
end
end

Expand Down
90 changes: 47 additions & 43 deletions src/block_sizes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,17 @@ function block_sizes(::Val{T}, _α, _β, R₁, R₂) where {T}
block_sizes(Val(T), W, α, β, L₁ₑ, L₂ₑ)
end
function block_sizes(::Val{T}, W, α, β, L₁ₑ, L₂ₑ) where {T}
mᵣnᵣ = matmul_params(Val(T))
mᵣ = getfield(mᵣnᵣ, 1)
nᵣ = getfield(mᵣnᵣ, 2)
mᵣ, nᵣ = matmul_params(Val(T))
MᵣW = mᵣ * W
Mc = floortostaticint((L₁ₑ)*(L₁ₑ*β + L₂ₑ*α)/(L₂ₑ) / StaticFloat64(MᵣW)) * MᵣW
Kc = roundtostaticint((L₁ₑ)*(L₂ₑ)/(L₁ₑ*β + L₂ₑ*α))
Nc = floortostaticint((L₂ₑ)*(L₁ₑ*β + L₂ₑ*α)/(L₁ₑ) / StaticFloat64(nᵣ)) * nᵣ

Mc = floortostaticint((L₁ₑ) * (L₁ₑ * β + L₂ₑ * α) / (L₂ₑ) / StaticFloat64(MᵣW)) * MᵣW
Kc = roundtostaticint((L₁ₑ) * (L₂ₑ) / (L₁ₑ * β + L₂ₑ * α))
Nc = floortostaticint((L₂ₑ) * (L₁ₑ * β + L₂ₑ * α) / (L₁ₑ) / StaticFloat64(nᵣ)) * nᵣ

Mc, Kc, Nc
end
function block_sizes(::Val{T}) where {T}
block_sizes(Val(T), W₁Default(), W₂Default(), R₁Default(), R₂Default())
block_sizes(Val(T), W₁Default(), W₂Default(), R₁Default(), R₂Default())
end

"""
Expand All @@ -48,12 +46,12 @@ This is meant to specify roughly the requested amount of blocks, and return rela
This method is used fairly generally.
"""
@inline function split_m(M, _Mblocks, W)
Miters = cld_fast(M, W)
Mblocks = min(_Mblocks, Miters)
Miter_per_block, Mrem = divrem_fast(Miters, Mblocks)
Mbsize = Miter_per_block * W
Mremfinal = M - Mbsize*(Mblocks-1) - Mrem * W
Mbsize, Mrem, Mremfinal, Mblocks
Miters = cld_fast(M, W)
Mblocks = min(_Mblocks, Miters)
Miter_per_block, Mrem = divrem_fast(Miters, Mblocks)
Mbsize = Miter_per_block * W
Mremfinal = M - Mbsize * (Mblocks - 1) - Mrem * W
Mbsize, Mrem, Mremfinal, Mblocks
end

"""
Expand Down Expand Up @@ -162,33 +160,36 @@ Note that for synchronization on `B`, all threads must have the same values for
independently of `M`, this algorithm guarantees all threads are on the same page.
"""
@inline function solve_block_sizes(::Val{T}, M, K, N, _α, _β, R₂, R₃, Wfactor) where {T}
W = pick_vector_width(T)
α =* W
β =* W
L₁ₑ = first_cache_size(Val(T)) * R₂
L₂ₑ = second_cache_size(Val(T)) * R₃
W = pick_vector_width(T)
α =* W
β =* W
L₁ₑ = first_cache_size(Val(T)) * R₂
L₂ₑ = second_cache_size(Val(T)) * R₃

# Nc_init = round(Int, √(L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ)/√(L₁ₑ))
Nc_init⁻¹ = (L₁ₑ) / ((L₂ₑ)*√* L₂ₑ + β * L₁ₑ))

Niter = cldapproxi(N, Nc_init⁻¹) # approximate `ceil`
Nblock, Nrem = divrem_fast(N, Niter)
Nblock_Nrem = Nblock + One()#(Nrem > 0)
# Nc_init = round(Int, √(L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ)/√(L₁ₑ))
Nc_init⁻¹ = (L₁ₑ) / ((L₂ₑ) * * L₂ₑ + β * L₁ₑ))

((Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter)) = solve_McKc(Val(T), M, K, Nblock_Nrem, _α, _β, R₂, R₃, Wfactor)

(Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter), promote(Nblock, Nblock_Nrem, Nrem, Niter)
Niter = cldapproxi(N, Nc_init⁻¹) # approximate `ceil`
Nblock, Nrem = divrem_fast(N, Niter)
Nblock_Nrem = Nblock + One()#(Nrem > 0)

((Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter)) =
solve_McKc(Val(T), M, K, Nblock_Nrem, _α, _β, R₂, R₃, Wfactor)

(Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter),
(Kblock, Kblock_Krem, Krem, Kiter),
promote(Nblock, Nblock_Nrem, Nrem, Niter)
end
# Takes Nc, calcs Mc and Kc
@inline function solve_McKc(::Val{T}, M, K, Nc, _α, _β, R₂, R₃, Wfactor) where {T}
W = pick_vector_width(T)
Wfloat = StaticFloat64(W)
α =* Wfloat
β =* Wfloat
L₁ₑ = first_cache_size(Val(T)) * R₂
# β = _β * Wfloat
L₁ₑ = first_cache_size(Val(T)) * R₂
L₂ₑ = second_cache_size(Val(T)) * R₃

Kc_init⁻¹ = Base.FastMath.max_fast(/L₁ₑ), Nc*inv(L₂ₑ))
Kc_init⁻¹ = Base.FastMath.max_fast( / L₁ₑ), Nc * inv(L₂ₑ))
Kiter = cldapproxi(K, Kc_init⁻¹) # approximate `ceil`
Kblock, Krem = divrem_fast(K, Kiter)
Kblock_Krem = Kblock + One()
Expand All @@ -202,7 +203,7 @@ end
Mblocks, Mblocks_rem = divrem_fast(M, Mᵣ)
Miter, Mrem = divrem_fast(Mblocks, Mc_init_base)
if Miter == 0
return (0, 0, Int(M)::Int, 0, 1), Kblock_summary
return (0, 0, Int(M)::Int, 0, 1), Kblock_summary
elseif Miter > Mrem
Mblock_Mrem = Mbsize + Mᵣ
Mremfinal = Mbsize + Mblocks_rem
Expand All @@ -221,7 +222,10 @@ end
end
end

@inline cldapproxi(n, d⁻¹) = Base.fptosi(Int, Base.FastMath.add_fast(Base.FastMath.mul_fast(n, d⁻¹), 0.9999999999999432)) # approximate `ceil`
@inline cldapproxi(n, d⁻¹) = Base.fptosi(
Int,
Base.FastMath.add_fast(Base.FastMath.mul_fast(n, d⁻¹), 0.9999999999999432),
) # approximate `ceil`
# @inline divapproxi(n, d⁻¹) = Base.fptosi(Int, Base.FastMath.mul_fast(n, d⁻¹)) # approximate `div`

"""
Expand All @@ -231,14 +235,14 @@ Finds first combination of `Miter` and `Niter` that doesn't make `M` too small w
This would be awkward if there are computers with prime numbers of cores. I should probably consider that possibility at some point.
"""
@inline function find_first_acceptable(::Val{T}, M, W) where {T}
Mᵣ, Nᵣ = matmul_params(Val(T))
factors = calc_factors()
for (miter, niter) factors
if miter * (StaticInt(2) * Mᵣ * W) M + (W + W)
return miter, niter
end
Mᵣ, _ = matmul_params(Val(T))
factors = calc_factors()
for (miter, niter) factors
if miter * (StaticInt(2) * Mᵣ * W) M + (W + W)
return miter, niter
end
last(factors)
end
last(factors)
end
"""
divide_blocks(M, Ntotal, _nspawn, W)
Expand All @@ -247,8 +251,8 @@ Splits both `M` and `N` into blocks when trying to spawn a large number of threa
"""
@inline function divide_blocks(::Val{T}, M, Ntotal, _nspawn, W) where {T}
_nspawn == num_cores() && return find_first_acceptable(Val(T), M, W)
mᵣ, nᵣ = matmul_params(Val(T))
Miter = clamp(div_fast(M, W*mᵣ * MᵣW_mul_factor()), 1, _nspawn)
mᵣ, _ = matmul_params(Val(T))
Miter = clamp(div_fast(M, W * mᵣ * MᵣW_mul_factor()), 1, _nspawn)
nspawn = div_fast(_nspawn, Miter)
if (nspawn 1) & (Miter < _nspawn)
# rebalance Miter
Expand Down
30 changes: 17 additions & 13 deletions src/global_constants.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,23 @@ const OCTAVIAN_NUM_TASKS = Ref(1)
_nthreads() = OCTAVIAN_NUM_TASKS[]

@generated function calc_factors(::Union{Val{nc},StaticInt{nc}} = num_cores()) where {nc}
t = Expr(:tuple)
for i nc:-1:1
d, r = divrem(nc, i)
iszero(r) && push!(t.args, (i, d))
end
t
t = Expr(:tuple)
for i nc:-1:1
d, r = divrem(nc, i)
iszero(r) && push!(t.args, (i, d))
end
t
end
# const CORE_FACTORS = calc_factors()

MᵣW_mul_factor(::True) = StaticInt{4}()
MᵣW_mul_factor(::False) = StaticInt{9}()
MᵣW_mul_factor() = MᵣW_mul_factor(has_feature(Val(:x86_64_avx512f)))

W₁Default(::True) = StaticFloat64{0.0009898277594117685}()
W₂Default(::True) = StaticFloat64{0.9865020832559304}()
R₁Default(::True) = StaticFloat64{0.5820044063603483}()
R₂Default(::True) = StaticFloat64{0.7580885846640107}()
W₁Default(::True) = StaticFloat64{0.0007423708195588264}()
W₂Default(::True) = StaticFloat64{0.7757548987718677}()
R₁Default(::True) = StaticFloat64{0.7936663315339363}()
R₂Default(::True) = StaticFloat64{0.7144577794375783}()

W₁Default_arch(::Val{:znver1}) = StaticFloat64{0.053918949422353986}()
W₂Default_arch(::Val{:znver1}) = StaticFloat64{0.3013238122374886}()
Expand Down Expand Up @@ -55,16 +55,20 @@ end

second_cache() = first_cache() + One()

_first_cache_size(fcs::StaticInt) = ifelse(eq(first_cache(), StaticInt(2)) & cache_inclusive(StaticInt(2)), fcs - cache_size(One()), fcs)
_first_cache_size(fcs::StaticInt) = ifelse(
eq(first_cache(), StaticInt(2)) & cache_inclusive(StaticInt(2)),
fcs - cache_size(One()),
fcs,
)
_first_cache_size(::Nothing) = StaticInt(262144)
first_cache_size() = _first_cache_size(cache_size(first_cache()))

_second_cache_size(scs::StaticInt, ::True) = scs - cache_size(first_cache())
_second_cache_size(scs::StaticInt, ::False) = scs
_second_cache_size(::StaticInt{0}, ::Nothing) = StaticInt(3145728)
function second_cache_size()
sc = second_cache()
_second_cache_size(cache_size(sc), cache_inclusive(sc)) * min(num_cores(), num_threads())
sc = second_cache()
_second_cache_size(cache_size(sc), cache_inclusive(sc))
end

first_cache_size(::Val{T}) where {T} = first_cache_size() ÷ static_sizeof(T)
Expand Down
12 changes: 10 additions & 2 deletions src/init.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,23 @@ end
function init_bcache()
if bcache_count() Zero()
if BCACHEPTR[] == C_NULL
BCACHEPTR[] = VectorizationBase.valloc(second_cache_size() * bcache_count(), Cvoid, ccall(:jl_getpagesize, Int, ()))
BCACHEPTR[] = VectorizationBase.valloc(
Threads.nthreads() * second_cache_size() * bcache_count(),
Cvoid,
ccall(:jl_getpagesize, Int, ()),
)
end
end
nothing
end

function init_acache()
if ACACHEPTR[] == C_NULL
ACACHEPTR[] = VectorizationBase.valloc(first_cache_size() * init_num_tasks(), Cvoid, ccall(:jl_getpagesize, Int, ()))
ACACHEPTR[] = VectorizationBase.valloc(
first_cache_size() * init_num_tasks(),
Cvoid,
ccall(:jl_getpagesize, Int, ()),
)
end
nothing
end
Expand Down
Loading

2 comments on commit 5c30e4a

@chriselrod
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/75050

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.3.19 -m "<description of version>" 5c30e4af5d0f63bb7a340a3ea15e526c4077cba9
git push origin v0.3.19

Please sign in to comment.