Skip to content

Commit

Permalink
Use Threads.nthreads() instead of num_threads (#163)
Browse files Browse the repository at this point in the history
* Use Threads.nthreads() instead of num_threads

* a few updates and fixes
  • Loading branch information
chriselrod authored Jan 4, 2023
1 parent 430701e commit 5c30e4a
Show file tree
Hide file tree
Showing 7 changed files with 630 additions and 274 deletions.
6 changes: 3 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Octavian"
uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
authors = ["Chris Elrod", "Dilum Aluthge", "Mason Protter", "contributors"]
version = "0.3.18"
version = "0.3.19"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
Expand All @@ -17,11 +17,11 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"

[compat]
ArrayInterface = "3.1.14, 5.0.1, 6"
CPUSummary = "0.1.26"
CPUSummary = "0.1.26, 0.2.1"
IfElse = "0.1"
LoopVectorization = "0.12.86"
ManualMemory = "0.1.1"
PolyesterWeave = "0.1.1"
PolyesterWeave = "0.1.1, 0.2"
Requires = "1"
Static = "0.7.5, 0.8"
ThreadingUtilities = "0.5"
Expand Down
52 changes: 36 additions & 16 deletions src/Octavian.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,41 @@ using Requires: @require

using VectorizationBase, ArrayInterface, LoopVectorization

using VectorizationBase: align, AbstractStridedPointer, zstridedpointer, vsub_nsw, assume,
static_sizeof, StridedPointer, gesp, pause, pick_vector_width, has_feature
using CPUSummary: cache_size, num_cores, num_threads, cache_inclusive, cache_linesize
using VectorizationBase:
align,
AbstractStridedPointer,
zstridedpointer,
vsub_nsw,
assume,
static_sizeof,
StridedPointer,
gesp,
pause,
pick_vector_width,
has_feature
using CPUSummary: cache_size, num_cores, cache_inclusive, cache_linesize
using LoopVectorization: preserve_buffer, CloseOpen, UpperBoundedInteger
using ArrayInterface: size, strides, offsets, indices, axes, StrideIndex
using IfElse: ifelse
using PolyesterWeave
using Static: StaticInt, Zero, One, StaticBool, True, False, gt, eq, StaticFloat64,
roundtostaticint, floortostaticint
using Static:
StaticInt,
Zero,
One,
StaticBool,
True,
False,
gt,
eq,
StaticFloat64,
roundtostaticint,
floortostaticint
using ManualMemory: MemoryBuffer, load, store!

using ThreadingUtilities: _atomic_add!, _atomic_load, _atomic_store!, launch, wait, SPIN

if !(StaticInt <: Base.Integer)
const Integer = Union{Base.Integer, StaticInt}
const Integer = Union{Base.Integer,StaticInt}
end

export StaticInt
Expand All @@ -45,16 +65,16 @@ include("init.jl") # `Octavian.__init__()` is defined in this file
@static if VERSION >= v"1.8.0-beta1"
let
__init__()
A64 = rand(100,100)
matmul(A64,A64)
matmul(A64',A64)
matmul(A64,A64')
matmul(A64',A64')
A32 = rand(Float32,100,100)
matmul(A32,A32)
matmul(A32',A32)
matmul(A32,A32')
matmul(A32',A32')
A64 = rand(100, 100)
matmul(A64, A64)
matmul(A64', A64)
matmul(A64, A64')
matmul(A64', A64')
A32 = rand(Float32, 100, 100)
matmul(A32, A32)
matmul(A32', A32)
matmul(A32, A32')
matmul(A32', A32')
end
end

Expand Down
90 changes: 47 additions & 43 deletions src/block_sizes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,17 @@ function block_sizes(::Val{T}, _α, _β, R₁, R₂) where {T}
block_sizes(Val(T), W, α, β, L₁ₑ, L₂ₑ)
end
function block_sizes(::Val{T}, W, α, β, L₁ₑ, L₂ₑ) where {T}
mᵣnᵣ = matmul_params(Val(T))
mᵣ = getfield(mᵣnᵣ, 1)
nᵣ = getfield(mᵣnᵣ, 2)
mᵣ, nᵣ = matmul_params(Val(T))
MᵣW = mᵣ * W
Mc = floortostaticint((L₁ₑ)*(L₁ₑ*β + L₂ₑ*α)/(L₂ₑ) / StaticFloat64(MᵣW)) * MᵣW
Kc = roundtostaticint((L₁ₑ)*(L₂ₑ)/(L₁ₑ*β + L₂ₑ*α))
Nc = floortostaticint((L₂ₑ)*(L₁ₑ*β + L₂ₑ*α)/(L₁ₑ) / StaticFloat64(nᵣ)) * nᵣ

Mc = floortostaticint((L₁ₑ) * (L₁ₑ * β + L₂ₑ * α) / (L₂ₑ) / StaticFloat64(MᵣW)) * MᵣW
Kc = roundtostaticint((L₁ₑ) * (L₂ₑ) / (L₁ₑ * β + L₂ₑ * α))
Nc = floortostaticint((L₂ₑ) * (L₁ₑ * β + L₂ₑ * α) / (L₁ₑ) / StaticFloat64(nᵣ)) * nᵣ

Mc, Kc, Nc
end
function block_sizes(::Val{T}) where {T}
block_sizes(Val(T), W₁Default(), W₂Default(), R₁Default(), R₂Default())
block_sizes(Val(T), W₁Default(), W₂Default(), R₁Default(), R₂Default())
end

"""
Expand All @@ -48,12 +46,12 @@ This is meant to specify roughly the requested amount of blocks, and return rela
This method is used fairly generally.
"""
@inline function split_m(M, _Mblocks, W)
Miters = cld_fast(M, W)
Mblocks = min(_Mblocks, Miters)
Miter_per_block, Mrem = divrem_fast(Miters, Mblocks)
Mbsize = Miter_per_block * W
Mremfinal = M - Mbsize*(Mblocks-1) - Mrem * W
Mbsize, Mrem, Mremfinal, Mblocks
Miters = cld_fast(M, W)
Mblocks = min(_Mblocks, Miters)
Miter_per_block, Mrem = divrem_fast(Miters, Mblocks)
Mbsize = Miter_per_block * W
Mremfinal = M - Mbsize * (Mblocks - 1) - Mrem * W
Mbsize, Mrem, Mremfinal, Mblocks
end

"""
Expand Down Expand Up @@ -162,33 +160,36 @@ Note that for synchronization on `B`, all threads must have the same values for
independently of `M`, this algorithm guarantees all threads are on the same page.
"""
@inline function solve_block_sizes(::Val{T}, M, K, N, _α, _β, R₂, R₃, Wfactor) where {T}
W = pick_vector_width(T)
α =* W
β =* W
L₁ₑ = first_cache_size(Val(T)) * R₂
L₂ₑ = second_cache_size(Val(T)) * R₃
W = pick_vector_width(T)
α =* W
β =* W
L₁ₑ = first_cache_size(Val(T)) * R₂
L₂ₑ = second_cache_size(Val(T)) * R₃

# Nc_init = round(Int, √(L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ)/√(L₁ₑ))
Nc_init⁻¹ = (L₁ₑ) / ((L₂ₑ)*√* L₂ₑ + β * L₁ₑ))

Niter = cldapproxi(N, Nc_init⁻¹) # approximate `ceil`
Nblock, Nrem = divrem_fast(N, Niter)
Nblock_Nrem = Nblock + One()#(Nrem > 0)
# Nc_init = round(Int, √(L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ)/√(L₁ₑ))
Nc_init⁻¹ = (L₁ₑ) / ((L₂ₑ) * * L₂ₑ + β * L₁ₑ))

((Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter)) = solve_McKc(Val(T), M, K, Nblock_Nrem, _α, _β, R₂, R₃, Wfactor)

(Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter), promote(Nblock, Nblock_Nrem, Nrem, Niter)
Niter = cldapproxi(N, Nc_init⁻¹) # approximate `ceil`
Nblock, Nrem = divrem_fast(N, Niter)
Nblock_Nrem = Nblock + One()#(Nrem > 0)

((Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter)) =
solve_McKc(Val(T), M, K, Nblock_Nrem, _α, _β, R₂, R₃, Wfactor)

(Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter),
(Kblock, Kblock_Krem, Krem, Kiter),
promote(Nblock, Nblock_Nrem, Nrem, Niter)
end
# Takes Nc, calcs Mc and Kc
@inline function solve_McKc(::Val{T}, M, K, Nc, _α, _β, R₂, R₃, Wfactor) where {T}
W = pick_vector_width(T)
Wfloat = StaticFloat64(W)
α =* Wfloat
β =* Wfloat
L₁ₑ = first_cache_size(Val(T)) * R₂
# β = _β * Wfloat
L₁ₑ = first_cache_size(Val(T)) * R₂
L₂ₑ = second_cache_size(Val(T)) * R₃

Kc_init⁻¹ = Base.FastMath.max_fast(/L₁ₑ), Nc*inv(L₂ₑ))
Kc_init⁻¹ = Base.FastMath.max_fast( / L₁ₑ), Nc * inv(L₂ₑ))
Kiter = cldapproxi(K, Kc_init⁻¹) # approximate `ceil`
Kblock, Krem = divrem_fast(K, Kiter)
Kblock_Krem = Kblock + One()
Expand All @@ -202,7 +203,7 @@ end
Mblocks, Mblocks_rem = divrem_fast(M, Mᵣ)
Miter, Mrem = divrem_fast(Mblocks, Mc_init_base)
if Miter == 0
return (0, 0, Int(M)::Int, 0, 1), Kblock_summary
return (0, 0, Int(M)::Int, 0, 1), Kblock_summary
elseif Miter > Mrem
Mblock_Mrem = Mbsize + Mᵣ
Mremfinal = Mbsize + Mblocks_rem
Expand All @@ -221,7 +222,10 @@ end
end
end

@inline cldapproxi(n, d⁻¹) = Base.fptosi(Int, Base.FastMath.add_fast(Base.FastMath.mul_fast(n, d⁻¹), 0.9999999999999432)) # approximate `ceil`
@inline cldapproxi(n, d⁻¹) = Base.fptosi(
Int,
Base.FastMath.add_fast(Base.FastMath.mul_fast(n, d⁻¹), 0.9999999999999432),
) # approximate `ceil`
# @inline divapproxi(n, d⁻¹) = Base.fptosi(Int, Base.FastMath.mul_fast(n, d⁻¹)) # approximate `div`

"""
Expand All @@ -231,14 +235,14 @@ Finds first combination of `Miter` and `Niter` that doesn't make `M` too small w
This would be awkward if there are computers with prime numbers of cores. I should probably consider that possibility at some point.
"""
@inline function find_first_acceptable(::Val{T}, M, W) where {T}
Mᵣ, Nᵣ = matmul_params(Val(T))
factors = calc_factors()
for (miter, niter) factors
if miter * (StaticInt(2) * Mᵣ * W) M + (W + W)
return miter, niter
end
Mᵣ, _ = matmul_params(Val(T))
factors = calc_factors()
for (miter, niter) factors
if miter * (StaticInt(2) * Mᵣ * W) M + (W + W)
return miter, niter
end
last(factors)
end
last(factors)
end
"""
divide_blocks(M, Ntotal, _nspawn, W)
Expand All @@ -247,8 +251,8 @@ Splits both `M` and `N` into blocks when trying to spawn a large number of threa
"""
@inline function divide_blocks(::Val{T}, M, Ntotal, _nspawn, W) where {T}
_nspawn == num_cores() && return find_first_acceptable(Val(T), M, W)
mᵣ, nᵣ = matmul_params(Val(T))
Miter = clamp(div_fast(M, W*mᵣ * MᵣW_mul_factor()), 1, _nspawn)
mᵣ, _ = matmul_params(Val(T))
Miter = clamp(div_fast(M, W * mᵣ * MᵣW_mul_factor()), 1, _nspawn)
nspawn = div_fast(_nspawn, Miter)
if (nspawn 1) & (Miter < _nspawn)
# rebalance Miter
Expand Down
30 changes: 17 additions & 13 deletions src/global_constants.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,23 @@ const OCTAVIAN_NUM_TASKS = Ref(1)
_nthreads() = OCTAVIAN_NUM_TASKS[]

@generated function calc_factors(::Union{Val{nc},StaticInt{nc}} = num_cores()) where {nc}
t = Expr(:tuple)
for i nc:-1:1
d, r = divrem(nc, i)
iszero(r) && push!(t.args, (i, d))
end
t
t = Expr(:tuple)
for i nc:-1:1
d, r = divrem(nc, i)
iszero(r) && push!(t.args, (i, d))
end
t
end
# const CORE_FACTORS = calc_factors()

MᵣW_mul_factor(::True) = StaticInt{4}()
MᵣW_mul_factor(::False) = StaticInt{9}()
MᵣW_mul_factor() = MᵣW_mul_factor(has_feature(Val(:x86_64_avx512f)))

W₁Default(::True) = StaticFloat64{0.0009898277594117685}()
W₂Default(::True) = StaticFloat64{0.9865020832559304}()
R₁Default(::True) = StaticFloat64{0.5820044063603483}()
R₂Default(::True) = StaticFloat64{0.7580885846640107}()
W₁Default(::True) = StaticFloat64{0.0007423708195588264}()
W₂Default(::True) = StaticFloat64{0.7757548987718677}()
R₁Default(::True) = StaticFloat64{0.7936663315339363}()
R₂Default(::True) = StaticFloat64{0.7144577794375783}()

W₁Default_arch(::Val{:znver1}) = StaticFloat64{0.053918949422353986}()
W₂Default_arch(::Val{:znver1}) = StaticFloat64{0.3013238122374886}()
Expand Down Expand Up @@ -55,16 +55,20 @@ end

second_cache() = first_cache() + One()

_first_cache_size(fcs::StaticInt) = ifelse(eq(first_cache(), StaticInt(2)) & cache_inclusive(StaticInt(2)), fcs - cache_size(One()), fcs)
_first_cache_size(fcs::StaticInt) = ifelse(
eq(first_cache(), StaticInt(2)) & cache_inclusive(StaticInt(2)),
fcs - cache_size(One()),
fcs,
)
_first_cache_size(::Nothing) = StaticInt(262144)
first_cache_size() = _first_cache_size(cache_size(first_cache()))

_second_cache_size(scs::StaticInt, ::True) = scs - cache_size(first_cache())
_second_cache_size(scs::StaticInt, ::False) = scs
_second_cache_size(::StaticInt{0}, ::Nothing) = StaticInt(3145728)
function second_cache_size()
sc = second_cache()
_second_cache_size(cache_size(sc), cache_inclusive(sc)) * min(num_cores(), num_threads())
sc = second_cache()
_second_cache_size(cache_size(sc), cache_inclusive(sc))
end

first_cache_size(::Val{T}) where {T} = first_cache_size() ÷ static_sizeof(T)
Expand Down
12 changes: 10 additions & 2 deletions src/init.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,23 @@ end
function init_bcache()
if bcache_count() Zero()
if BCACHEPTR[] == C_NULL
BCACHEPTR[] = VectorizationBase.valloc(second_cache_size() * bcache_count(), Cvoid, ccall(:jl_getpagesize, Int, ()))
BCACHEPTR[] = VectorizationBase.valloc(
Threads.nthreads() * second_cache_size() * bcache_count(),
Cvoid,
ccall(:jl_getpagesize, Int, ()),
)
end
end
nothing
end

function init_acache()
if ACACHEPTR[] == C_NULL
ACACHEPTR[] = VectorizationBase.valloc(first_cache_size() * init_num_tasks(), Cvoid, ccall(:jl_getpagesize, Int, ()))
ACACHEPTR[] = VectorizationBase.valloc(
first_cache_size() * init_num_tasks(),
Cvoid,
ccall(:jl_getpagesize, Int, ()),
)
end
nothing
end
Expand Down
Loading

2 comments on commit 5c30e4a

@chriselrod
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/75050

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.3.19 -m "<description of version>" 5c30e4af5d0f63bb7a340a3ea15e526c4077cba9
git push origin v0.3.19

Please sign in to comment.