Skip to content

Commit 082c694

Browse files
authored
Use Polyester to allocate threads. Fixes #105 (#106)
1 parent 1f9ce87 commit 082c694

File tree

9 files changed

+289
-272
lines changed

9 files changed

+289
-272
lines changed

Project.toml

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,26 @@
11
name = "Octavian"
22
uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
33
authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"]
4-
version = "0.2.20"
4+
version = "0.3.0"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
88
IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
99
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
10+
ManualMemory = "d125e4d3-2237-4719-b19c-fa641b8a4667"
11+
Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
1012
Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
11-
StrideArraysCore = "7792a7ef-975c-4747-a70f-980b88e8d1da"
1213
ThreadingUtilities = "8290d209-cae3-49c0-8002-c8c24d57dab5"
1314
VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
1415

1516
[compat]
1617
ArrayInterface = "3.1.14"
1718
IfElse = "0.1"
1819
LoopVectorization = "0.12.34"
20+
ManualMemory = "0.1.1"
21+
Polyester = "0.3.5"
1922
Static = "0.2"
20-
StrideArraysCore = "0.1.11"
21-
ThreadingUtilities = "0.4"
23+
ThreadingUtilities = "0.4.6"
2224
VectorizationBase = "0.20.16"
2325
julia = "1.6"
2426

src/Octavian.jl

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,12 @@ using VectorizationBase: align, AbstractStridedPointer, zstridedpointer, vsub_ns
88
using LoopVectorization: preserve_buffer, CloseOpen, UpperBoundedInteger
99
using ArrayInterface: size, strides, offsets, indices, axes
1010
using IfElse: ifelse
11-
11+
using Polyester
1212
using Static: StaticInt, Zero, One, StaticBool, True, False, gt, eq, StaticFloat64,
1313
roundtostaticint, floortostaticint
14-
using StrideArraysCore: MemoryBuffer
14+
using ManualMemory: MemoryBuffer, load, store!
1515

16-
using ThreadingUtilities:
17-
_atomic_add!, _atomic_load, _atomic_store!,
18-
launch, wait, load, store!
16+
using ThreadingUtilities: _atomic_add!, _atomic_load, _atomic_store!, launch, wait
1917

2018
export StaticInt
2119
export matmul!

src/block_sizes.jl

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -220,15 +220,15 @@ end
220220
Splits both `M` and `N` into blocks when trying to spawn a large number of threads relative to the size of the matrices.
221221
"""
222222
@inline function divide_blocks(::Val{T}, M, Ntotal, _nspawn, W) where {T}
223-
_nspawn == num_cores() && return find_first_acceptable(Val(T), M, W)
224-
mᵣ, nᵣ = matmul_params(Val(T))
225-
Miter = clamp(div_fast(M, W*mᵣ * MᵣW_mul_factor()), 1, _nspawn)
223+
_nspawn == num_cores() && return find_first_acceptable(Val(T), M, W)
224+
mᵣ, nᵣ = matmul_params(Val(T))
225+
Miter = clamp(div_fast(M, W*mᵣ * MᵣW_mul_factor()), 1, _nspawn)
226+
nspawn = div_fast(_nspawn, Miter)
227+
if (nspawn ≤ 1) & (Miter < _nspawn)
228+
# rebalance Miter
229+
Miter = cld_fast(_nspawn, cld_fast(_nspawn, Miter))
226230
nspawn = div_fast(_nspawn, Miter)
227-
if (nspawn ≤ 1) & (Miter < _nspawn)
228-
# rebalance Miter
229-
Miter = cld_fast(_nspawn, cld_fast(_nspawn, Miter))
230-
nspawn = div_fast(_nspawn, Miter)
231-
end
232-
Miter, cld_fast(Ntotal, max(2, cld_fast(Ntotal, nspawn)))
231+
end
232+
Miter, cld_fast(Ntotal, max(2, cld_fast(Ntotal, nspawn)))
233233
end
234234

src/funcptrs.jl

Lines changed: 73 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -1,103 +1,101 @@
11

2-
32
struct LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd} <: Function end
43
function (::LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd})(p::Ptr{UInt}) where {P,TC,TA,TB,Α,Β,Md,Kd,Nd}
5-
offset, C = load(p, TC, 2*sizeof(UInt))
6-
offset, A = load(p, TA, offset)
7-
offset, B = load(p, TB, offset)
8-
offset, α = load(p, Α, offset)
9-
offset, β = load(p, Β, offset)
10-
offset, M = load(p, Md, offset)
11-
offset, K = load(p, Kd, offset)
12-
offset, N = load(p, Nd, offset)
13-
_call_loopmul!(C, A, B, α, β, M, K, N, Val{P}())
14-
nothing
4+
offset, C = load(p, TC, 2*sizeof(UInt))
5+
offset, A = load(p, TA, offset)
6+
offset, B = load(p, TB, offset)
7+
offset, α = load(p, Α, offset)
8+
offset, β = load(p, Β, offset)
9+
offset, M = load(p, Md, offset)
10+
offset, K = load(p, Kd, offset)
11+
offset, N = load(p, Nd, offset)
12+
_call_loopmul!(C, A, B, α, β, M, K, N, Val{P}())
13+
nothing
1514
end
1615
@inline _call_loopmul!(C, A, B, α, β, M, K, N, ::Val{false}) = loopmul!(C, A, B, α, β, M, K, N)
1716
@inline function _call_loopmul!(C::StridedPointer{T}, A, B, α, β, M, K, N, ::Val{true}) where {T}
18-
if M*K < first_cache_size(Val(T)) * R₂Default()
19-
packaloopmul!(C, A, B, α, β, M, K, N)
20-
return
21-
else
22-
matmul_st_only_pack_A!(C, A, B, α, β, M, K, N, W₁Default(), W₂Default(), R₁Default(), R₂Default())
23-
return
24-
end
17+
if M*K < first_cache_size(Val(T)) * R₂Default()
18+
packaloopmul!(C, A, B, α, β, M, K, N)
19+
return
20+
else
21+
matmul_st_only_pack_A!(C, A, B, α, β, M, K, N, W₁Default(), W₂Default(), R₁Default(), R₂Default())
22+
return
23+
end
2524
end
2625
call_loopmul!(C, A, B, α, β, M, K, N, ::Val{P}) where {P} = _call_loopmul!(C, A, B, α, β, M, K, N, Val{P}())
2726

2827
struct SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂} <: Function end
2928
function (::SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂})(p::Ptr{UInt}) where {TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂}
30-
offset, C = load(p, TC, 2*sizeof(UInt))
31-
offset, A = load(p, TA, offset)
32-
offset, B = load(p, TB, offset)
33-
offset, α = load(p, Α, offset)
34-
offset, β = load(p, Β, offset)
35-
offset, M = load(p, Md, offset)
36-
offset, K = load(p, Kd, offset)
37-
offset, N = load(p, Nd, offset)
38-
offset, atomicp = load(p, Ptr{UInt32}, offset)
39-
offset, bcachep = load(p, BCP, offset)
40-
offset, id = load(p, ID, offset)
41-
offset, total_ids = load(p, TT, offset)
42-
sync_mul!(C, A, B, α, β, M, K, N, atomicp, bcachep, id, total_ids, StaticFloat64{W₁}(), StaticFloat64{W₂}(), StaticFloat64{R₁}(), StaticFloat64{R₂}())
43-
nothing
29+
offset, C = load(p, TC, 2*sizeof(UInt))
30+
offset, A = load(p, TA, offset)
31+
offset, B = load(p, TB, offset)
32+
offset, α = load(p, Α, offset)
33+
offset, β = load(p, Β, offset)
34+
offset, M = load(p, Md, offset)
35+
offset, K = load(p, Kd, offset)
36+
offset, N = load(p, Nd, offset)
37+
offset, atomicp = load(p, Ptr{UInt32}, offset)
38+
offset, bcachep = load(p, BCP, offset)
39+
offset, id = load(p, ID, offset)
40+
offset, total_ids = load(p, TT, offset)
41+
sync_mul!(C, A, B, α, β, M, K, N, atomicp, bcachep, id, total_ids, StaticFloat64{W₁}(), StaticFloat64{W₂}(), StaticFloat64{R₁}(), StaticFloat64{R₂}())
42+
nothing
4443
end
4544

4645
@generated function cfuncpointer(::T) where {T}
47-
precompile(T(), (Ptr{UInt},))
48-
quote
49-
$(Expr(:meta,:inline))
50-
@cfunction($(T()), Cvoid, (Ptr{UInt},))
51-
end
46+
precompile(T(), (Ptr{UInt},))
47+
quote
48+
$(Expr(:meta,:inline))
49+
@cfunction($(T()), Cvoid, (Ptr{UInt},))
50+
end
5251
end
5352

5453
@inline function setup_matmul!(p::Ptr{UInt}, C::TC, A::TA, B::TB, α::Α, β::Β, M::Md, K::Kd, N::Nd, ::Val{P}) where {P,TC,TA,TB,Α,Β,Md,Kd,Nd}
55-
offset = store!(p, cfuncpointer(LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd}()), sizeof(UInt))
56-
offset = store!(p, C, offset)
57-
offset = store!(p, A, offset)
58-
offset = store!(p, B, offset)
59-
offset = store!(p, α, offset)
60-
offset = store!(p, β, offset)
61-
offset = store!(p, M, offset)
62-
offset = store!(p, K, offset)
63-
offset = store!(p, N, offset)
64-
nothing
54+
offset = store!(p, cfuncpointer(LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd}()), sizeof(UInt))
55+
offset = store!(p, C, offset)
56+
offset = store!(p, A, offset)
57+
offset = store!(p, B, offset)
58+
offset = store!(p, α, offset)
59+
offset = store!(p, β, offset)
60+
offset = store!(p, M, offset)
61+
offset = store!(p, K, offset)
62+
offset = store!(p, N, offset)
63+
nothing
6564
end
6665

6766
@inline function setup_syncmul!(
68-
p::Ptr{UInt}, C::TC, A::TA, B::TB, α::Α, β::Β, M::Md, K::Kd, N::Nd,
69-
ap::Ptr{UInt32},bcp::BCP,id::ID,tt::TT,::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂}
67+
p::Ptr{UInt}, C::TC, A::TA, B::TB, α::Α, β::Β, M::Md, K::Kd, N::Nd,
68+
ap::Ptr{UInt32},bcp::BCP,id::ID,tt::TT,::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂}
7069
) where {TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂}
71-
offset = store!(p, cfuncpointer(SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂}()), sizeof(UInt))
72-
offset = store!(p, C, offset)
73-
offset = store!(p, A, offset)
74-
offset = store!(p, B, offset)
75-
offset = store!(p, α, offset)
76-
offset = store!(p, β, offset)
77-
offset = store!(p, M, offset)
78-
offset = store!(p, K, offset)
79-
offset = store!(p, N, offset)
80-
offset = store!(p, ap, offset)
81-
offset = store!(p, bcp, offset)
82-
offset = store!(p, id, offset)
83-
offset = store!(p, tt, offset)
84-
nothing
70+
offset = store!(p, cfuncpointer(SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂}()), sizeof(UInt))
71+
offset = store!(p, C, offset)
72+
offset = store!(p, A, offset)
73+
offset = store!(p, B, offset)
74+
offset = store!(p, α, offset)
75+
offset = store!(p, β, offset)
76+
offset = store!(p, M, offset)
77+
offset = store!(p, K, offset)
78+
offset = store!(p, N, offset)
79+
offset = store!(p, ap, offset)
80+
offset = store!(p, bcp, offset)
81+
offset = store!(p, id, offset)
82+
offset = store!(p, tt, offset)
83+
nothing
8584
end
8685

87-
function launch_thread_mul!(C, A, B, α, β, M, K, N, tid::Int, ::Val{P}) where {P}
88-
launch(tid, C, A, B, α, β, M, K, N, Val{P}()) do p, C, A, B, α, β, M, K, N, VP
89-
setup_matmul!(p, C, A, B, α, β, M, K, N, VP)
90-
end
86+
@inline function launch_thread_mul!(C, A, B, α, β, M, K, N, tid::UInt32, ::Val{P}) where {P}
87+
launch(setup_matmul!, tid, C, A, B, α, β, M, K, N, Val{P}())
9188
end
92-
function launch_thread_mul!(
93-
C, A, B, α, β, M, K, N, ap, bcp, tid, tt,::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂}
89+
@inline function launch_thread_mul!(
90+
C, A, B, α, β, M, K, N, ap, bcp, tid, id, tt, ::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂}
9491
) where {W₁,W₂,R₁,R₂}
95-
launch(tid+one(tid), C, A, B, α, β, M, K, N, ap, bcp, tid, tt) do p, C, A, B, α, β, M, K, N, ap, bcp, tid, tt
96-
setup_syncmul!(
97-
p, C, A, B, α, β, M, K, N, ap, bcp, tid, tt,
98-
StaticFloat64{W₁}(),StaticFloat64{W₂}(),StaticFloat64{R₁}(),StaticFloat64{R₂}()
99-
)
100-
end
92+
launch(tid, C, A, B, α, β, M, K, N, ap, bcp, id, tt) do p, C, A, B, α, β, M, K, N, ap, bcp, id, tt
93+
Base.@_inline_meta
94+
setup_syncmul!(
95+
p, C, A, B, α, β, M, K, N, ap, bcp, id, tt,
96+
StaticFloat64{W₁}(),StaticFloat64{W₂}(),StaticFloat64{R₁}(),StaticFloat64{R₂}()
97+
)
98+
end
10199
end
102100

103101

src/global_constants.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ bcache_count() = VectorizationBase.num_cache(second_cache())
7373
const BCACHEPTR = Ref{Ptr{Cvoid}}(C_NULL)
7474
const BCACHE_LOCK = Threads.Atomic{UInt}(zero(UInt))
7575

76-
if Sys.WORD_SIZE ≤ 32
77-
const ACACHEPTR = Ref{Ptr{Cvoid}}(C_NULL)
76+
@static if Sys.WORD_SIZE ≤ 32
77+
const ACACHEPTR = Ref{Ptr{Cvoid}}(C_NULL)
7878
end
7979

src/init.jl

Lines changed: 31 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,45 @@
11
function __init__()
2-
init_acache()
3-
init_bcache()
4-
nt = init_num_tasks()
5-
if nt < num_cores() && ("OCTAVIAN_WARNING" ∈ keys(ENV))
6-
msg = string(
7-
"Your system has $(num_cores()) physical cores, but `Octavian.jl` only has ",
8-
"$(nt > 1 ? "$(nt) threads" : "$(nt) thread") available. ",
9-
"For the best performance, you should start Julia with at least $(num_cores()) threads.",
10-
)
11-
@warn msg
12-
end
13-
reseet_bcache_lock!()
2+
init_acache()
3+
init_bcache()
4+
nt = init_num_tasks()
5+
if nt < num_cores() && ("OCTAVIAN_WARNING" ∈ keys(ENV))
6+
msg = string(
7+
"Your system has $(num_cores()) physical cores, but `Octavian.jl` only has ",
8+
"$(nt > 1 ? "$(nt) threads" : "$(nt) thread") available. ",
9+
"For the best performance, you should start Julia with at least $(num_cores()) threads.",
10+
)
11+
@warn msg
12+
end
13+
reseet_bcache_lock!()
1414
end
1515

1616
function init_bcache()
17-
if bcache_count() ≢ Zero()
18-
BCACHEPTR[] = VectorizationBase.valloc(second_cache_size() * bcache_count(), Cvoid, ccall(:jl_getpagesize, Int, ()))
19-
end
20-
nothing
17+
if bcache_count() ≢ Zero()
18+
BCACHEPTR[] = VectorizationBase.valloc(second_cache_size() * bcache_count(), Cvoid, ccall(:jl_getpagesize, Int, ()))
19+
end
20+
nothing
2121
end
2222

23-
if Sys.WORD_SIZE ≤ 32
24-
function init_acache()
25-
ACACHEPTR[] = VectorizationBase.valloc(first_cache_size() * init_num_tasks(), Cvoid, ccall(:jl_getpagesize, Int, ()))
26-
nothing
27-
end
23+
@static if Sys.WORD_SIZE ≤ 32
24+
function init_acache()
25+
ACACHEPTR[] = VectorizationBase.valloc(first_cache_size() * init_num_tasks(), Cvoid, ccall(:jl_getpagesize, Int, ()))
26+
nothing
27+
end
2828
else
29-
init_acache() = nothing
29+
init_acache() = nothing
3030
end
3131

3232
function init_num_tasks()
33-
num_tasks = _read_environment_num_tasks()::Int
34-
OCTAVIAN_NUM_TASKS[] = num_tasks
33+
num_tasks = _read_environment_num_tasks()::Int
34+
OCTAVIAN_NUM_TASKS[] = num_tasks
3535
end
3636

3737
function _read_environment_num_tasks()
38-
environment_variable = get(ENV, "OCTAVIAN_NUM_TASKS", "")::String
39-
nt = min(Threads.nthreads(), VectorizationBase.num_cores())::Int
40-
if isempty(environment_variable)
41-
return nt
42-
else
43-
return min(parse(Int, environment_variable)::Int, nt)
44-
end
38+
environment_variable = get(ENV, "OCTAVIAN_NUM_TASKS", "")::String
39+
nt = min(Threads.nthreads(), VectorizationBase.num_cores())::Int
40+
if isempty(environment_variable)
41+
return nt
42+
else
43+
return min(parse(Int, environment_variable)::Int, nt)
44+
end
4545
end

0 commit comments

Comments
 (0)