Skip to content

Commit 083c6ee

Browse files
Update for VectorizationBase 0.16/LoopVec 0.10 (#60)
* Update for VectorizationBase 0.16/LoopVec 0.10 (replace constants with functions)
* Use approximate equality in matmul tests.
* Octavian.StaticInt; tests pass locally
* Fix single threaded
* Docs: don't use Pkg server

Co-authored-by: Dilum Aluthge <[email protected]>
1 parent c968586 commit 083c6ee

19 files changed

+380
-237
lines changed

.github/workflows/ci.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,10 +109,13 @@ jobs:
109109
using Pkg
110110
Pkg.develop(PackageSpec(path=pwd()))
111111
Pkg.instantiate()'
112+
env:
113+
JULIA_PKG_SERVER: ""
112114
- run: julia --project=docs docs/make.jl
113115
env:
114116
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
115117
DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
118+
JULIA_PKG_SERVER: ""
116119
doctests:
117120
name: Doctests
118121
runs-on: ubuntu-latest
@@ -126,8 +129,12 @@ jobs:
126129
using Pkg
127130
Pkg.develop(PackageSpec(path=pwd()))
128131
Pkg.instantiate()'
132+
env:
133+
JULIA_PKG_SERVER: ""
129134
- run: |
130135
julia --project=docs -e '
131136
using Documenter: doctest
132137
using Octavian
133138
doctest(Octavian)'
139+
env:
140+
JULIA_PKG_SERVER: ""

Project.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "Octavian"
22
uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
33
authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"]
4-
version = "0.2.5"
4+
version = "0.2.6"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -11,9 +11,9 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
1111

1212
[compat]
1313
ArrayInterface = "2.14"
14-
LoopVectorization = "0.9.18"
14+
LoopVectorization = "0.10"
1515
ThreadingUtilities = "0.2"
16-
VectorizationBase = "0.15.2"
16+
VectorizationBase = "0.16"
1717
julia = "1.5"
1818

1919
[extras]

benchmark/staticarraybench.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,14 +63,14 @@ rename!(df, matmulmethodnames);
6363
df.Size = sizerange
6464

6565
function pick_suffix(desc = "")
66-
suffix = if Octavian.VectorizationBase.AVX512F
66+
suffix = if Octavian.VectorizationBase.has_feature("x86_64_avx512f")
6767
"AVX512"
68-
elseif Octavian.VectorizationBase.AVX2
68+
elseif Octavian.VectorizationBase.has_feature("x86_64_avx2")
6969
"AVX2"
70-
elseif Octavian.VectorizationBase.REGISTER_SIZE == 32
70+
elseif Octavian.VectorizationBase.has_feature("x86_64_avx")
7171
"AVX"
7272
else
73-
"REGSUZE$(Octavian.VectorizationBase.REGISTER_SIZE)"
73+
"REGSIZE$(Octavian.VectorizationBase.register_size())"
7474
end
7575
if desc != ""
7676
suffix *= '_' * desc

benchmark/tilesearch.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ using Octavian: StaticFloat
55
function matmul_pack_ab!(C, A, B, ::Val{W₁}, ::Val{W₂}, ::Val{R₁}, ::Val{R₂}) where {W₁, W₂, R₁, R₂}
66
M, N = size(C); K = size(B,1)
77
zc, za, zb = Octavian.zstridedpointer.((C,A,B))
8-
nspawn = min(Threads.nthreads(), VectorizationBase.NUM_CORES)
8+
nspawn = min(Threads.nthreads(), VectorizationBase.num_cores())
99
@elapsed(
1010
Octavian.matmul_pack_A_and_B!(
1111
zc, za, zb, StaticInt{1}(), StaticInt{0}(), M, K, N, nspawn,
@@ -60,8 +60,8 @@ end
6060

6161

6262
T = Float64
63-
min_size = round(Int, sqrt(0.65 * Octavian.VectorizationBase.CACHE_SIZE[3] / sizeof(T)))
64-
max_size = round(Int, sqrt( 32 * Octavian.VectorizationBase.CACHE_SIZE[3] / sizeof(T)))
63+
min_size = round(Int, sqrt(0.65 * Octavian.VectorizationBase.cache_size(Val(3)) / sizeof(T)))
64+
max_size = round(Int, sqrt( 32 * Octavian.VectorizationBase.cache_size(Val(3)) / sizeof(T)))
6565

6666
SR = size_range(max_size, min_size, 100);
6767
const CsConst, AsConst, BsConst = matrix_range(SR, T);
@@ -77,7 +77,7 @@ end
7777

7878
using Optim
7979
hours = 60.0*60.0; days = 24hours;
80-
init = [Octavian.W₁Default, Octavian.W₂Default, Octavian.R₁Default, Octavian.R₂Default]
80+
init = Float64[Octavian.W₁Default(), Octavian.W₂Default(), Octavian.R₁Default(), Octavian.R₂Default()]
8181

8282
opt = Optim.optimize(
8383
matmul_objective, init, ParticleSwarm(lower = [0.001, 0.01, 0.3, 0.4], upper = [0.2, 2.0, 0.9, 0.99]),

src/Octavian.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@ module Octavian
22

33
using VectorizationBase, ArrayInterface, LoopVectorization
44

5-
using VectorizationBase: align, gep, AbstractStridedPointer, AbstractSIMDVector, vnoaliasstore!, staticm1,
6-
static_sizeof, lazymul, vmul_fast, StridedPointer, gesp, zero_offsets, pause,
7-
CACHE_COUNT, NUM_CORES, CACHE_INCLUSIVITY, zstridedpointer
8-
using LoopVectorization: maybestaticsize, mᵣ, nᵣ, preserve_buffer, CloseOpen
5+
using VectorizationBase: align, AbstractStridedPointer, zstridedpointer,
6+
static_sizeof, lazymul, StridedPointer, gesp, pause, pick_vector_width_val,
7+
snum_cache_levels, scache_size, snum_cores, num_cores, cache_inclusivity, scacheline_size
8+
using LoopVectorization: maybestaticsize, matmul_params, preserve_buffer, CloseOpen
99
using ArrayInterface: StaticInt, Zero, One, OptionallyStaticUnitRange, size, strides, offsets, indices,
1010
static_length, static_first, static_last, axes,
1111
dense_dims, DenseDims, stride_rank, StrideRank

src/block_sizes.jl

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,25 @@
11

2-
first_effective_cache(::Type{T}) where {T} = StaticInt{FIRST__CACHE_SIZE}() ÷ static_sizeof(T)
3-
second_effective_cache(::Type{T}) where {T} = StaticInt{SECOND_CACHE_SIZE}() ÷ static_sizeof(T)
42

53
function block_sizes(::Type{T}, _α, _β, R₁, R₂) where {T}
6-
W = VectorizationBase.pick_vector_width_val(T)
4+
W = pick_vector_width_val(T)
75
α = _α * W
86
β = _β * W
9-
L₁ₑ = first_effective_cache(T) * R₁
10-
L₂ₑ = second_effective_cache(T) * R₂
7+
L₁ₑ = first_cache_size(T) * R₁
8+
L₂ₑ = second_cache_size(T) * R₂
119
block_sizes(W, α, β, L₁ₑ, L₂ₑ)
1210
end
1311
function block_sizes(W, α, β, L₁ₑ, L₂ₑ)
14-
MᵣW = StaticInt{mᵣ}() * W
12+
mᵣ, nᵣ = matmul_params()
13+
MᵣW = mᵣ * W
1514

1615
Mc = floortostaticint((L₁ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₂ₑ) / MᵣW) * MᵣW
1716
Kc = roundtostaticint((L₁ₑ)*√(L₂ₑ)/√(L₁ₑ*β + L₂ₑ*α))
18-
Nc = floortostaticint((L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ) / StaticInt{nᵣ}()) * StaticInt{nᵣ}()
17+
Nc = floortostaticint((L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ) / nᵣ) * nᵣ
1918

2019
Mc, Kc, Nc
2120
end
2221
function block_sizes(::Type{T}) where {T}
23-
block_sizes(T, StaticFloat{W₁Default}(), StaticFloat{W₂Default}(), StaticFloat{R₁Default}(), StaticFloat{R₂Default}())
22+
block_sizes(T, W₁Default(), W₂Default(), R₁Default(), R₂Default())
2423
end
2524

2625
"""
@@ -159,11 +158,11 @@ Note that for synchronization on `B`, all threads must have the same values for
159158
independently of `M`, this algorithm guarantees all threads are on the same page.
160159
"""
161160
@inline function solve_block_sizes(::Type{T}, M, K, N, _α, _β, R₂, R₃, Wfactor) where {T}
162-
W = VectorizationBase.pick_vector_width_val(T)
161+
W = pick_vector_width_val(T)
163162
α = _α * W
164163
β = _β * W
165-
L₁ₑ = first_effective_cache(T) * R₂
166-
L₂ₑ = second_effective_cache(T) * R₃
164+
L₁ₑ = first_cache_size(T) * R₂
165+
L₂ₑ = second_cache_size(T) * R₃
167166

168167
# Nc_init = round(Int, √(L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ)/√(L₁ₑ))
169168
Nc_init⁻¹ = (L₁ₑ) / ((L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ))
@@ -178,11 +177,11 @@ independently of `M`, this algorithm guarantees all threads are on the same page
178177
end
179178
# Takes Nc, calcs Mc and Kc
180179
@inline function solve_McKc(::Type{T}, M, K, Nc, _α, _β, R₂, R₃, Wfactor) where {T}
181-
W = VectorizationBase.pick_vector_width_val(T)
180+
W = pick_vector_width_val(T)
182181
α = _α * W
183182
β = _β * W
184-
L₁ₑ = first_effective_cache(T) * R₂
185-
L₂ₑ = second_effective_cache(T) * R₃
183+
L₁ₑ = first_cache_size(T) * R₂
184+
L₂ₑ = second_cache_size(T) * R₃
186185

187186
Kc_init⁻¹ = Base.FastMath.max_fast(√(α/L₁ₑ), Nc*inv(L₂ₑ))
188187
Kiter = cldapproxi(K, Kc_init⁻¹) # approximate `ceil`
@@ -201,27 +200,28 @@ end
201200
"""
202201
find_first_acceptable(M, W)
203202
204-
Finds first combination of `Miter` and `Niter` that doesn't make `M` too small while producing `Miter * Niter = NUM_CORES`.
203+
Finds first combination of `Miter` and `Niter` that doesn't make `M` too small while producing `Miter * Niter = num_cores()`.
205204
This would be awkard if there are computers with prime numbers of cores. I should probably consider that possibility at some point.
206205
"""
207206
@inline function find_first_acceptable(M, W)
208-
Mᵣ = StaticInt{mᵣ}() * W
209-
for (miter,niter) ∈ CORE_FACTORS
210-
if miter * ((MᵣW_mul_factor - One()) * Mᵣ) ≤ M + (W + W)
207+
Mᵣ, Nᵣ = matmul_params()
208+
factors = calc_factors()
209+
for (miter, niter) ∈ factors
210+
if miter * ((MᵣW_mul_factor() - One()) * Mᵣ) ≤ M + (W + W)
211211
return miter, niter
212212
end
213213
end
214-
last(CORE_FACTORS)
214+
last(factors)
215215
end
216216
"""
217217
divide_blocks(M, Ntotal, _nspawn, W)
218218
219219
Splits both `M` and `N` into blocks when trying to spawn a large number of threads relative to the size of the matrices.
220220
"""
221221
@inline function divide_blocks(M, Ntotal, _nspawn, W)
222-
_nspawn == NUM_CORES && return find_first_acceptable(M, W)
223-
224-
Miter = clamp(div_fast(M, W*StaticInt{mᵣ}() * MᵣW_mul_factor), 1, _nspawn)
222+
_nspawn == num_cores() && return find_first_acceptable(M, W)
223+
mᵣ, nᵣ = matmul_params()
224+
Miter = clamp(div_fast(M, W*mᵣ * MᵣW_mul_factor()), 1, _nspawn)
225225
nspawn = div_fast(_nspawn, Miter)
226226
if (nspawn ≤ 1) & (Miter < _nspawn)
227227
# rebalance Miter

src/funcptrs.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,11 @@ function (::LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd})(p::Ptr{UInt}) where {P,TC,TA
1515
end
1616
@inline _call_loopmul!(C, A, B, α, β, M, K, N, ::Val{false}) = loopmul!(C, A, B, α, β, M, K, N)
1717
@inline function _call_loopmul!(C::StridedPointer{T}, A, B, α, β, M, K, N, ::Val{true}) where {T}
18-
if M*K < first_effective_cache(T) * R₂Default
18+
if M*K < first_cache_size(T) * R₂Default()
1919
packaloopmul!(C, A, B, α, β, M, K, N)
2020
return
2121
else
22-
matmul_st_only_pack_A!(C, A, B, α, β, M, K, N, StaticFloat{W₁Default}(), StaticFloat{W₂Default}(), StaticFloat{R₁Default}(), StaticFloat{R₂Default}())
22+
matmul_st_only_pack_A!(C, A, B, α, β, M, K, N, W₁Default(), W₂Default(), R₁Default(), R₂Default())
2323
return
2424
end
2525
end

src/global_constants.jl

Lines changed: 87 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,47 +1,103 @@
1-
const BCACHE = UInt8[]
21

32
const OCTAVIAN_NUM_TASKS = Ref(1)
43
_nthreads() = OCTAVIAN_NUM_TASKS[]
54

6-
@generated function calc_factors(::Val{nc} = Val{NUM_CORES}()) where {nc}
5+
@generated function calc_factors(::Union{Val{nc},StaticInt{nc}} = snum_cores()) where {nc}
76
t = Expr(:tuple)
87
for i ∈ nc:-1:1
98
d, r = divrem(nc, i)
109
iszero(r) && push!(t.args, (i, d))
1110
end
1211
t
1312
end
14-
const CORE_FACTORS = calc_factors()
15-
16-
const MᵣW_mul_factor = VectorizationBase.REGISTER_SIZE === 64 ? StaticInt{4}() : StaticInt{9}()
17-
18-
if VectorizationBase.AVX512F
19-
const W₁Default = 0.006089395198610773
20-
const W₂Default = 0.7979822724696168
21-
const R₁Default = 0.5900561503730485
22-
const R₂Default = 0.762152930709678
23-
else
24-
const W₁Default = 0.1 # TODO: relax bounds; this was the upper bound set for the optimizer.
25-
const W₂Default = 0.15989396641218157
26-
const R₁Default = 0.4203583148344484
27-
const R₂Default = 0.6344856142604789
28-
end
29-
30-
const FIRST__CACHE = 1 + (VectorizationBase.CACHE_SIZE[3] !== nothing)
31-
const SECOND_CACHE = 2 + (VectorizationBase.CACHE_SIZE[3] !== nothing)
32-
const FIRST__CACHE_SIZE = VectorizationBase.CACHE_SIZE[FIRST__CACHE] === nothing ? 262144 :
33-
(((FIRST__CACHE == 2) & CACHE_INCLUSIVITY[2]) ? (VectorizationBase.CACHE_SIZE[2] - VectorizationBase.CACHE_SIZE[1]) :
34-
VectorizationBase.CACHE_SIZE[FIRST__CACHE])
35-
const SECOND_CACHE_SIZE = (VectorizationBase.CACHE_SIZE[SECOND_CACHE] === nothing ? 3145728 :
36-
(CACHE_INCLUSIVITY[SECOND_CACHE] ? (VectorizationBase.CACHE_SIZE[SECOND_CACHE] - VectorizationBase.CACHE_SIZE[FIRST__CACHE]) :
37-
VectorizationBase.CACHE_SIZE[SECOND_CACHE])) * something(VectorizationBase.CACHE_COUNT[SECOND_CACHE], 1)
38-
39-
const CACHELINESIZE = something(VectorizationBase.L₁CACHE.linesize, 64) % UInt
40-
const BCACHE_COUNT = something(VectorizationBase.CACHE_COUNT[3], 1);
13+
# const CORE_FACTORS = calc_factors()
14+
15+
@generated function MᵣW_mul_factor()
16+
f = VectorizationBase.has_feature("x86_64_avx512f") ? 4 : 9
17+
Expr(:call, Expr(:curly, :StaticInt, f))
18+
end
19+
20+
@generated function W₁Default()
21+
w = if VectorizationBase.has_feature("x86_64_avx512f")
22+
0.006089395198610773
23+
elseif (Sys.CPU_NAME === "znver2") || (Sys.CPU_NAME === "znver3") # these are znver2 values, I'm assuming they're better for znver3 than generic
24+
0.1
25+
elseif Sys.CPU_NAME === "znver1"
26+
0.053918949422353986
27+
else
28+
0.1
29+
end
30+
Expr(:call, Expr(:curly, :StaticFloat, w))
31+
end
32+
@generated function W₂Default()
33+
w = if VectorizationBase.has_feature("x86_64_avx512f")
34+
0.7979822724696168
35+
elseif (Sys.CPU_NAME === "znver2") || (Sys.CPU_NAME === "znver3") # these are znver2 values, I'm assuming they're better for znver3 than generic
36+
0.993489411720157
37+
elseif Sys.CPU_NAME === "znver1"
38+
0.3013238122374886
39+
else
40+
0.15989396641218157
41+
end
42+
Expr(:call, Expr(:curly, :StaticFloat, w))
43+
end
44+
@generated function R₁Default()
45+
w = if VectorizationBase.has_feature("x86_64_avx512f")
46+
0.5900561503730485
47+
elseif (Sys.CPU_NAME === "znver2") || (Sys.CPU_NAME === "znver3") # these are znver2 values, I'm assuming they're better for znver3 than generic
48+
0.6052218809954467
49+
elseif Sys.CPU_NAME === "znver1"
50+
0.6077103834481342
51+
else
52+
0.4203583148344484
53+
end
54+
Expr(:call, Expr(:curly, :StaticFloat, w))
55+
end
56+
@generated function R₂Default()
57+
w = if VectorizationBase.has_feature("x86_64_avx512f")
58+
0.762152930709678
59+
elseif (Sys.CPU_NAME === "znver2") || (Sys.CPU_NAME === "znver3") # these are znver2 values, I'm assuming they're better for znver3 than generic
60+
0.7594052633561165
61+
elseif Sys.CPU_NAME === "znver1"
62+
0.8775382433240162
63+
else
64+
0.6344856142604789
65+
end
66+
Expr(:call, Expr(:curly, :StaticFloat, w))
67+
end
68+
69+
first_cache() = StaticInt{1}() + (snum_cache_levels() > StaticInt{2}() ? One() : Zero())
70+
second_cache() = StaticInt{2}() + (snum_cache_levels() > StaticInt{2}() ? One() : Zero())
71+
72+
function first_cache_size()
73+
fcs = scache_size(first_cache())
74+
if fcs === Zero()
75+
return StaticInt(262144)
76+
elseif (first_cache() === StaticInt(2)) && cache_inclusivity()[2]
77+
return fcs - scache_size(One())
78+
else
79+
return fcs
80+
end
81+
end
82+
function second_cache_size()
83+
scs = scache_size(second_cache())
84+
if scs === Zero()
85+
return StaticInt(3145728)
86+
elseif cache_inclusivity()[second_cache()]
87+
return scs - scache_size(first_cache())
88+
else
89+
return scs
90+
end
91+
end
92+
first_cache_size(::Type{T}) where {T} = first_cache_size() ÷ static_sizeof(T)
93+
second_cache_size(::Type{T}) where {T} = second_cache_size() ÷ static_sizeof(T)
94+
95+
bcache_count() = VectorizationBase.scache_count(second_cache())
96+
97+
const BCACHEPTR = Ref{Ptr{Cvoid}}(C_NULL)
4198
const BCACHE_LOCK = Threads.Atomic{UInt}(zero(UInt))
4299

43100
if Sys.WORD_SIZE ≤ 32
44-
const ACACHE = UInt8[]
45-
const ACACHEPTR = Ref{Ptr{UInt8}}(C_NULL)
101+
const ACACHEPTR = Ref{Ptr{Cvoid}}(C_NULL)
46102
end
47103

0 commit comments

Comments
 (0)