Skip to content

Commit 5c30e4a

Browse files
authored
Use Threads.nthreads() instead of num_threads (#163)
* Use Threads.nthreads() instead of num_threads * a few updates and fixes
1 parent 430701e commit 5c30e4a

File tree

7 files changed

+630
-274
lines changed

7 files changed

+630
-274
lines changed

Project.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "Octavian"
22
uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
33
authors = ["Chris Elrod", "Dilum Aluthge", "Mason Protter", "contributors"]
4-
version = "0.3.18"
4+
version = "0.3.19"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -17,11 +17,11 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
1717

1818
[compat]
1919
ArrayInterface = "3.1.14, 5.0.1, 6"
20-
CPUSummary = "0.1.26"
20+
CPUSummary = "0.1.26, 0.2.1"
2121
IfElse = "0.1"
2222
LoopVectorization = "0.12.86"
2323
ManualMemory = "0.1.1"
24-
PolyesterWeave = "0.1.1"
24+
PolyesterWeave = "0.1.1, 0.2"
2525
Requires = "1"
2626
Static = "0.7.5, 0.8"
2727
ThreadingUtilities = "0.5"

src/Octavian.jl

Lines changed: 36 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,21 +4,41 @@ using Requires: @require
44

55
using VectorizationBase, ArrayInterface, LoopVectorization
66

7-
using VectorizationBase: align, AbstractStridedPointer, zstridedpointer, vsub_nsw, assume,
8-
static_sizeof, StridedPointer, gesp, pause, pick_vector_width, has_feature
9-
using CPUSummary: cache_size, num_cores, num_threads, cache_inclusive, cache_linesize
7+
using VectorizationBase:
8+
align,
9+
AbstractStridedPointer,
10+
zstridedpointer,
11+
vsub_nsw,
12+
assume,
13+
static_sizeof,
14+
StridedPointer,
15+
gesp,
16+
pause,
17+
pick_vector_width,
18+
has_feature
19+
using CPUSummary: cache_size, num_cores, cache_inclusive, cache_linesize
1020
using LoopVectorization: preserve_buffer, CloseOpen, UpperBoundedInteger
1121
using ArrayInterface: size, strides, offsets, indices, axes, StrideIndex
1222
using IfElse: ifelse
1323
using PolyesterWeave
14-
using Static: StaticInt, Zero, One, StaticBool, True, False, gt, eq, StaticFloat64,
15-
roundtostaticint, floortostaticint
24+
using Static:
25+
StaticInt,
26+
Zero,
27+
One,
28+
StaticBool,
29+
True,
30+
False,
31+
gt,
32+
eq,
33+
StaticFloat64,
34+
roundtostaticint,
35+
floortostaticint
1636
using ManualMemory: MemoryBuffer, load, store!
1737

1838
using ThreadingUtilities: _atomic_add!, _atomic_load, _atomic_store!, launch, wait, SPIN
1939

2040
if !(StaticInt <: Base.Integer)
21-
const Integer = Union{Base.Integer, StaticInt}
41+
const Integer = Union{Base.Integer,StaticInt}
2242
end
2343

2444
export StaticInt
@@ -45,16 +65,16 @@ include("init.jl") # `Octavian.__init__()` is defined in this file
4565
@static if VERSION >= v"1.8.0-beta1"
4666
let
4767
__init__()
48-
A64 = rand(100,100)
49-
matmul(A64,A64)
50-
matmul(A64',A64)
51-
matmul(A64,A64')
52-
matmul(A64',A64')
53-
A32 = rand(Float32,100,100)
54-
matmul(A32,A32)
55-
matmul(A32',A32)
56-
matmul(A32,A32')
57-
matmul(A32',A32')
68+
A64 = rand(100, 100)
69+
matmul(A64, A64)
70+
matmul(A64', A64)
71+
matmul(A64, A64')
72+
matmul(A64', A64')
73+
A32 = rand(Float32, 100, 100)
74+
matmul(A32, A32)
75+
matmul(A32', A32)
76+
matmul(A32, A32')
77+
matmul(A32', A32')
5878
end
5979
end
6080

src/block_sizes.jl

Lines changed: 47 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,17 @@ function block_sizes(::Val{T}, _α, _β, R₁, R₂) where {T}
1111
block_sizes(Val(T), W, α, β, L₁ₑ, L₂ₑ)
1212
end
1313
function block_sizes(::Val{T}, W, α, β, L₁ₑ, L₂ₑ) where {T}
14-
mᵣnᵣ = matmul_params(Val(T))
15-
mᵣ = getfield(mᵣnᵣ, 1)
16-
nᵣ = getfield(mᵣnᵣ, 2)
14+
mᵣ, nᵣ = matmul_params(Val(T))
1715
MᵣW = mᵣ * W
18-
19-
Mc = floortostaticint(√(L₁ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₂ₑ) / StaticFloat64(MᵣW)) * MᵣW
20-
Kc = roundtostaticint(√(L₁ₑ)*√(L₂ₑ)/√(L₁ₑ*β + L₂ₑ*α))
21-
Nc = floortostaticint(√(L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ) / StaticFloat64(nᵣ)) * nᵣ
22-
16+
17+
Mc = floortostaticint(√(L₁ₑ) * √(L₁ₑ * β + L₂ₑ * α) / √(L₂ₑ) / StaticFloat64(MᵣW)) * MᵣW
18+
Kc = roundtostaticint(√(L₁ₑ) * √(L₂ₑ) / √(L₁ₑ * β + L₂ₑ * α))
19+
Nc = floortostaticint(√(L₂ₑ) * √(L₁ₑ * β + L₂ₑ * α) / √(L₁ₑ) / StaticFloat64(nᵣ)) * nᵣ
20+
2321
Mc, Kc, Nc
2422
end
2523
function block_sizes(::Val{T}) where {T}
26-
block_sizes(Val(T), W₁Default(), W₂Default(), R₁Default(), R₂Default())
24+
block_sizes(Val(T), W₁Default(), W₂Default(), R₁Default(), R₂Default())
2725
end
2826

2927
"""
@@ -48,12 +46,12 @@ This is meant to specify roughly the requested amount of blocks, and return rela
4846
This method is used fairly generally.
4947
"""
5048
@inline function split_m(M, _Mblocks, W)
51-
Miters = cld_fast(M, W)
52-
Mblocks = min(_Mblocks, Miters)
53-
Miter_per_block, Mrem = divrem_fast(Miters, Mblocks)
54-
Mbsize = Miter_per_block * W
55-
Mremfinal = M - Mbsize*(Mblocks-1) - Mrem * W
56-
Mbsize, Mrem, Mremfinal, Mblocks
49+
Miters = cld_fast(M, W)
50+
Mblocks = min(_Mblocks, Miters)
51+
Miter_per_block, Mrem = divrem_fast(Miters, Mblocks)
52+
Mbsize = Miter_per_block * W
53+
Mremfinal = M - Mbsize * (Mblocks - 1) - Mrem * W
54+
Mbsize, Mrem, Mremfinal, Mblocks
5755
end
5856

5957
"""
@@ -162,33 +160,36 @@ Note that for synchronization on `B`, all threads must have the same values for
162160
independently of `M`, this algorithm guarantees all threads are on the same page.
163161
"""
164162
@inline function solve_block_sizes(::Val{T}, M, K, N, _α, _β, R₂, R₃, Wfactor) where {T}
165-
W = pick_vector_width(T)
166-
α =* W
167-
β =* W
168-
L₁ₑ = first_cache_size(Val(T)) * R₂
169-
L₂ₑ = second_cache_size(Val(T)) * R₃
163+
W = pick_vector_width(T)
164+
α = _α * W
165+
β = _β * W
166+
L₁ₑ = first_cache_size(Val(T)) * R₂
167+
L₂ₑ = second_cache_size(Val(T)) * R₃
170168

171-
# Nc_init = round(Int, √(L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ)/√(L₁ₑ))
172-
Nc_init⁻¹ = √(L₁ₑ) / (√(L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ))
173-
174-
Niter = cldapproxi(N, Nc_init⁻¹) # approximate `ceil`
175-
Nblock, Nrem = divrem_fast(N, Niter)
176-
Nblock_Nrem = Nblock + One()#(Nrem > 0)
169+
# Nc_init = round(Int, √(L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ)/√(L₁ₑ))
170+
Nc_init⁻¹ = √(L₁ₑ) / (√(L₂ₑ) * √(α * L₂ₑ + β * L₁ₑ))
177171

178-
((Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter)) = solve_McKc(Val(T), M, K, Nblock_Nrem, _α, _β, R₂, R₃, Wfactor)
179-
180-
(Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter), promote(Nblock, Nblock_Nrem, Nrem, Niter)
172+
Niter = cldapproxi(N, Nc_init⁻¹) # approximate `ceil`
173+
Nblock, Nrem = divrem_fast(N, Niter)
174+
Nblock_Nrem = Nblock + One()#(Nrem > 0)
175+
176+
((Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter)) =
177+
solve_McKc(Val(T), M, K, Nblock_Nrem, _α, _β, R₂, R₃, Wfactor)
178+
179+
(Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter),
180+
(Kblock, Kblock_Krem, Krem, Kiter),
181+
promote(Nblock, Nblock_Nrem, Nrem, Niter)
181182
end
182183
# Takes Nc, calcs Mc and Kc
183184
@inline function solve_McKc(::Val{T}, M, K, Nc, _α, _β, R₂, R₃, Wfactor) where {T}
184185
W = pick_vector_width(T)
185186
Wfloat = StaticFloat64(W)
186187
α = _α * Wfloat
187-
β = _β * Wfloat
188-
L₁ₑ = first_cache_size(Val(T)) * R₂
188+
# β = _β * Wfloat
189+
L₁ₑ = first_cache_size(Val(T)) * R₂
189190
L₂ₑ = second_cache_size(Val(T)) * R₃
190191

191-
Kc_init⁻¹ = Base.FastMath.max_fast(√(α/L₁ₑ), Nc*inv(L₂ₑ))
192+
Kc_init⁻¹ = Base.FastMath.max_fast(√(α / L₁ₑ), Nc * inv(L₂ₑ))
192193
Kiter = cldapproxi(K, Kc_init⁻¹) # approximate `ceil`
193194
Kblock, Krem = divrem_fast(K, Kiter)
194195
Kblock_Krem = Kblock + One()
@@ -202,7 +203,7 @@ end
202203
Mblocks, Mblocks_rem = divrem_fast(M, Mᵣ)
203204
Miter, Mrem = divrem_fast(Mblocks, Mc_init_base)
204205
if Miter == 0
205-
return (0, 0, Int(M)::Int, 0, 1), Kblock_summary
206+
return (0, 0, Int(M)::Int, 0, 1), Kblock_summary
206207
elseif Miter > Mrem
207208
Mblock_Mrem = Mbsize + Mᵣ
208209
Mremfinal = Mbsize + Mblocks_rem
@@ -221,7 +222,10 @@ end
221222
end
222223
end
223224

224-
@inline cldapproxi(n, d⁻¹) = Base.fptosi(Int, Base.FastMath.add_fast(Base.FastMath.mul_fast(n, d⁻¹), 0.9999999999999432)) # approximate `ceil`
225+
@inline cldapproxi(n, d⁻¹) = Base.fptosi(
226+
Int,
227+
Base.FastMath.add_fast(Base.FastMath.mul_fast(n, d⁻¹), 0.9999999999999432),
228+
) # approximate `ceil`
225229
# @inline divapproxi(n, d⁻¹) = Base.fptosi(Int, Base.FastMath.mul_fast(n, d⁻¹)) # approximate `div`
226230

227231
"""
@@ -231,14 +235,14 @@ Finds first combination of `Miter` and `Niter` that doesn't make `M` too small w
231235
This would be awkward if there are computers with prime numbers of cores. I should probably consider that possibility at some point.
232236
"""
233237
@inline function find_first_acceptable(::Val{T}, M, W) where {T}
234-
Mᵣ, Nᵣ = matmul_params(Val(T))
235-
factors = calc_factors()
236-
for (miter, niter) ∈ factors
237-
if miter * (StaticInt(2) * Mᵣ * W) ≤ M + (W + W)
238-
return miter, niter
239-
end
238+
Mᵣ, _ = matmul_params(Val(T))
239+
factors = calc_factors()
240+
for (miter, niter) ∈ factors
241+
if miter * (StaticInt(2) * Mᵣ * W) ≤ M + (W + W)
242+
return miter, niter
240243
end
241-
last(factors)
244+
end
245+
last(factors)
242246
end
243247
"""
244248
divide_blocks(M, Ntotal, _nspawn, W)
@@ -247,8 +251,8 @@ Splits both `M` and `N` into blocks when trying to spawn a large number of threa
247251
"""
248252
@inline function divide_blocks(::Val{T}, M, Ntotal, _nspawn, W) where {T}
249253
_nspawn == num_cores() && return find_first_acceptable(Val(T), M, W)
250-
mᵣ, nᵣ = matmul_params(Val(T))
251-
Miter = clamp(div_fast(M, W*mᵣ * MᵣW_mul_factor()), 1, _nspawn)
254+
mᵣ, _ = matmul_params(Val(T))
255+
Miter = clamp(div_fast(M, W * mᵣ * MᵣW_mul_factor()), 1, _nspawn)
252256
nspawn = div_fast(_nspawn, Miter)
253257
if (nspawn ≤ 1) & (Miter < _nspawn)
254258
# rebalance Miter

src/global_constants.jl

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,23 @@ const OCTAVIAN_NUM_TASKS = Ref(1)
33
_nthreads() = OCTAVIAN_NUM_TASKS[]
44

55
@generated function calc_factors(::Union{Val{nc},StaticInt{nc}} = num_cores()) where {nc}
6-
t = Expr(:tuple)
7-
for i ∈ nc:-1:1
8-
d, r = divrem(nc, i)
9-
iszero(r) && push!(t.args, (i, d))
10-
end
11-
t
6+
t = Expr(:tuple)
7+
for i ∈ nc:-1:1
8+
d, r = divrem(nc, i)
9+
iszero(r) && push!(t.args, (i, d))
10+
end
11+
t
1212
end
1313
# const CORE_FACTORS = calc_factors()
1414

1515
MᵣW_mul_factor(::True) = StaticInt{4}()
1616
MᵣW_mul_factor(::False) = StaticInt{9}()
1717
MᵣW_mul_factor() = MᵣW_mul_factor(has_feature(Val(:x86_64_avx512f)))
1818

19-
W₁Default(::True) = StaticFloat64{0.0009898277594117685}()
20-
W₂Default(::True) = StaticFloat64{0.9865020832559304}()
21-
R₁Default(::True) = StaticFloat64{0.5820044063603483}()
22-
R₂Default(::True) = StaticFloat64{0.7580885846640107}()
19+
W₁Default(::True) = StaticFloat64{0.0007423708195588264}()
20+
W₂Default(::True) = StaticFloat64{0.7757548987718677}()
21+
R₁Default(::True) = StaticFloat64{0.7936663315339363}()
22+
R₂Default(::True) = StaticFloat64{0.7144577794375783}()
2323

2424
W₁Default_arch(::Val{:znver1}) = StaticFloat64{0.053918949422353986}()
2525
W₂Default_arch(::Val{:znver1}) = StaticFloat64{0.3013238122374886}()
@@ -55,16 +55,20 @@ end
5555

5656
second_cache() = first_cache() + One()
5757

58-
_first_cache_size(fcs::StaticInt) = ifelse(eq(first_cache(), StaticInt(2)) & cache_inclusive(StaticInt(2)), fcs - cache_size(One()), fcs)
58+
_first_cache_size(fcs::StaticInt) = ifelse(
59+
eq(first_cache(), StaticInt(2)) & cache_inclusive(StaticInt(2)),
60+
fcs - cache_size(One()),
61+
fcs,
62+
)
5963
_first_cache_size(::Nothing) = StaticInt(262144)
6064
first_cache_size() = _first_cache_size(cache_size(first_cache()))
6165

6266
_second_cache_size(scs::StaticInt, ::True) = scs - cache_size(first_cache())
6367
_second_cache_size(scs::StaticInt, ::False) = scs
6468
_second_cache_size(::StaticInt{0}, ::Nothing) = StaticInt(3145728)
6569
function second_cache_size()
66-
sc = second_cache()
67-
_second_cache_size(cache_size(sc), cache_inclusive(sc)) * min(num_cores(), num_threads())
70+
sc = second_cache()
71+
_second_cache_size(cache_size(sc), cache_inclusive(sc))
6872
end
6973

7074
first_cache_size(::Val{T}) where {T} = first_cache_size() ÷ static_sizeof(T)

src/init.jl

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,23 @@ end
1818
function init_bcache()
1919
if bcache_count() ≢ Zero()
2020
if BCACHEPTR[] == C_NULL
21-
BCACHEPTR[] = VectorizationBase.valloc(second_cache_size() * bcache_count(), Cvoid, ccall(:jl_getpagesize, Int, ()))
21+
BCACHEPTR[] = VectorizationBase.valloc(
22+
Threads.nthreads() * second_cache_size() * bcache_count(),
23+
Cvoid,
24+
ccall(:jl_getpagesize, Int, ()),
25+
)
2226
end
2327
end
2428
nothing
2529
end
2630

2731
function init_acache()
2832
if ACACHEPTR[] == C_NULL
29-
ACACHEPTR[] = VectorizationBase.valloc(first_cache_size() * init_num_tasks(), Cvoid, ccall(:jl_getpagesize, Int, ()))
33+
ACACHEPTR[] = VectorizationBase.valloc(
34+
first_cache_size() * init_num_tasks(),
35+
Cvoid,
36+
ccall(:jl_getpagesize, Int, ()),
37+
)
3038
end
3139
nothing
3240
end

0 commit comments

Comments
 (0)