JuliaLinearAlgebra
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
Lines changed: 7 additions & 0 deletions
diff --git a/Project.toml b/Project.toml
Lines changed: 3 additions & 3 deletions
diff --git a/benchmark/staticarraybench.jl b/benchmark/staticarraybench.jl
Lines changed: 4 additions & 4 deletions
diff --git a/benchmark/tilesearch.jl b/benchmark/tilesearch.jl
Lines changed: 4 additions & 4 deletions
diff --git a/src/Octavian.jl b/src/Octavian.jl
Lines changed: 4 additions & 4 deletions
diff --git a/src/block_sizes.jl b/src/block_sizes.jl
Lines changed: 22 additions & 22 deletions
diff --git a/src/funcptrs.jl b/src/funcptrs.jl
Lines changed: 2 additions & 2 deletions
diff --git a/src/global_constants.jl b/src/global_constants.jl
Lines changed: 87 additions & 31 deletions
@@ -109,10 +109,13 @@ jobs:
             using Pkg
             Pkg.develop(PackageSpec(path=pwd()))
             Pkg.instantiate()'
+        env:
+          JULIA_PKG_SERVER: ""
       - run: julia --project=docs docs/make.jl
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
+          JULIA_PKG_SERVER: ""
   doctests:
     name: Doctests
     runs-on: ubuntu-latest
@@ -126,8 +129,12 @@ jobs:
             using Pkg
             Pkg.develop(PackageSpec(path=pwd()))
             Pkg.instantiate()'
+        env:
+          JULIA_PKG_SERVER: ""
       - run: |
           julia --project=docs -e '
             using Documenter: doctest
             using Octavian
             doctest(Octavian)'
+        env:
+          JULIA_PKG_SERVER: ""
@@ -1,7 +1,7 @@
 name = "Octavian"
 uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
 authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"]
-version = "0.2.5"
+version = "0.2.6"
 
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -11,9 +11,9 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 
 [compat]
 ArrayInterface = "2.14"
-LoopVectorization = "0.9.18"
+LoopVectorization = "0.10"
 ThreadingUtilities = "0.2"
-VectorizationBase = "0.15.2"
+VectorizationBase = "0.16"
 julia = "1.5"
 
 [extras]
 
@@ -63,14 +63,14 @@ rename!(df, matmulmethodnames);
 df.Size = sizerange
 
 function pick_suffix(desc = "")
-    suffix = if Octavian.VectorizationBase.AVX512F
+    suffix = if Octavian.VectorizationBase.has_feature("x86_64_avx512f")
         "AVX512"
-    elseif Octavian.VectorizationBase.AVX2
+    elseif Octavian.VectorizationBase.has_feature("x86_64_avx2")
         "AVX2"
-    elseif Octavian.VectorizationBase.REGISTER_SIZE == 32
+    elseif Octavian.VectorizationBase.has_feature("x86_64_avx")
         "AVX"
     else
-        "REGSUZE$(Octavian.VectorizationBase.REGISTER_SIZE)"
+        "REGSIZE$(Octavian.VectorizationBase.register_size())"
     end
     if desc != ""
         suffix *= '_' * desc
 
@@ -5,7 +5,7 @@ using Octavian: StaticFloat
 function matmul_pack_ab!(C, A, B, ::Val{W₁}, ::Val{W₂}, ::Val{R₁}, ::Val{R₂}) where {W₁, W₂, R₁, R₂}
     M, N = size(C); K = size(B,1)
     zc, za, zb = Octavian.zstridedpointer.((C,A,B))
-    nspawn = min(Threads.nthreads(), VectorizationBase.NUM_CORES)
+    nspawn = min(Threads.nthreads(), VectorizationBase.num_cores())
     @elapsed(
         Octavian.matmul_pack_A_and_B!(
             zc, za, zb, StaticInt{1}(), StaticInt{0}(), M, K, N, nspawn,
@@ -60,8 +60,8 @@ end
 
 
 T = Float64
-min_size = round(Int, sqrt(0.65 * Octavian.VectorizationBase.CACHE_SIZE[3] / sizeof(T)))
-max_size = round(Int, sqrt( 32  * Octavian.VectorizationBase.CACHE_SIZE[3] / sizeof(T)))
+min_size = round(Int, sqrt(0.65 * Octavian.VectorizationBase.cache_size(Val(3)) / sizeof(T)))
+max_size = round(Int, sqrt( 32  * Octavian.VectorizationBase.cache_size(Val(3)) / sizeof(T)))
 
 SR = size_range(max_size, min_size, 100);
 const CsConst, AsConst, BsConst = matrix_range(SR, T);
@@ -77,7 +77,7 @@ end
 
 using Optim
 hours = 60.0*60.0; days = 24hours;
-init = [Octavian.W₁Default, Octavian.W₂Default, Octavian.R₁Default, Octavian.R₂Default]
+init = Float64[Octavian.W₁Default(), Octavian.W₂Default(), Octavian.R₁Default(), Octavian.R₂Default()]
 
 opt = Optim.optimize(
     matmul_objective, init, ParticleSwarm(lower = [0.001, 0.01, 0.3, 0.4], upper = [0.2, 2.0, 0.9, 0.99]),
 
@@ -2,10 +2,10 @@ module Octavian
 
 using VectorizationBase, ArrayInterface, LoopVectorization
 
-using VectorizationBase: align, gep, AbstractStridedPointer, AbstractSIMDVector, vnoaliasstore!, staticm1,
-    static_sizeof, lazymul, vmul_fast, StridedPointer, gesp, zero_offsets, pause,
-    CACHE_COUNT, NUM_CORES, CACHE_INCLUSIVITY, zstridedpointer
-using LoopVectorization: maybestaticsize, mᵣ, nᵣ, preserve_buffer, CloseOpen
+using VectorizationBase: align, AbstractStridedPointer, zstridedpointer,
+    static_sizeof, lazymul, StridedPointer, gesp, pause, pick_vector_width_val,
+    snum_cache_levels, scache_size, snum_cores, num_cores, cache_inclusivity, scacheline_size
+using LoopVectorization: maybestaticsize, matmul_params, preserve_buffer, CloseOpen
 using ArrayInterface: StaticInt, Zero, One, OptionallyStaticUnitRange, size, strides, offsets, indices,
     static_length, static_first, static_last, axes,
     dense_dims, DenseDims, stride_rank, StrideRank
 
@@ -1,26 +1,25 @@
 
-first_effective_cache(::Type{T}) where {T} = StaticInt{FIRST__CACHE_SIZE}() ÷ static_sizeof(T)
-second_effective_cache(::Type{T}) where {T} = StaticInt{SECOND_CACHE_SIZE}() ÷ static_sizeof(T)
 
 function block_sizes(::Type{T}, _α, _β, R₁, R₂) where {T}
-    W = VectorizationBase.pick_vector_width_val(T)
+    W = pick_vector_width_val(T) # imported by name from VectorizationBase, so no module prefix is needed
     α = _α * W
     β = _β * W
-    L₁ₑ = first_effective_cache(T) * R₁
-    L₂ₑ = second_effective_cache(T) * R₂
+    L₁ₑ = first_cache_size(T) * R₁ # first_cache_size(T) is first_cache_size() ÷ static_sizeof(T); scaled here by R₁
+    L₂ₑ = second_cache_size(T) * R₂ # second_cache_size(T) is second_cache_size() ÷ static_sizeof(T); scaled here by R₂
     block_sizes(W, α, β, L₁ₑ, L₂ₑ)
 end
 function block_sizes(W, α, β, L₁ₑ, L₂ₑ)
-    MᵣW = StaticInt{mᵣ}() * W
+    mᵣ, nᵣ = matmul_params() # obtained per call; replaces the former imported mᵣ/nᵣ constants
+    MᵣW = mᵣ * W # Mc below is formed as floortostaticint(… / MᵣW) * MᵣW
 
     Mc = floortostaticint(√(L₁ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₂ₑ) / MᵣW) * MᵣW
     Kc = roundtostaticint(√(L₁ₑ)*√(L₂ₑ)/√(L₁ₑ*β + L₂ₑ*α))
-    Nc = floortostaticint(√(L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ) / StaticInt{nᵣ}()) * StaticInt{nᵣ}()
+    Nc = floortostaticint(√(L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ) / nᵣ) * nᵣ # divided by nᵣ, converted, then multiplied back by nᵣ
 
     Mc, Kc, Nc
 end
 function block_sizes(::Type{T}) where {T}
-    block_sizes(T, StaticFloat{W₁Default}(), StaticFloat{W₂Default}(), StaticFloat{R₁Default}(), StaticFloat{R₂Default}())
+    block_sizes(T, W₁Default(), W₂Default(), R₁Default(), R₂Default()) # the *Default() functions already produce StaticFloat{…}() values
 end
 
 """
@@ -159,11 +158,11 @@ Note that for synchronization on `B`, all threads must have the same values for
 independently of `M`, this algorithm guarantees all threads are on the same page.
 """
 @inline function solve_block_sizes(::Type{T}, M, K, N, _α, _β, R₂, R₃, Wfactor) where {T}
-    W = VectorizationBase.pick_vector_width_val(T)
+    W = pick_vector_width_val(T)
     α = _α * W
     β = _β * W
-    L₁ₑ =  first_effective_cache(T) * R₂
-    L₂ₑ = second_effective_cache(T) * R₃
+    L₁ₑ =  first_cache_size(T) * R₂
+    L₂ₑ = second_cache_size(T) * R₃
 
     # Nc_init = round(Int, √(L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ)/√(L₁ₑ))
     Nc_init⁻¹ = √(L₁ₑ) / (√(L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ))
@@ -178,11 +177,11 @@ independently of `M`, this algorithm guarantees all threads are on the same page
 end
 # Takes Nc, calcs Mc and Kc
 @inline function solve_McKc(::Type{T}, M, K, Nc, _α, _β, R₂, R₃, Wfactor) where {T}
-    W = VectorizationBase.pick_vector_width_val(T)
+    W = pick_vector_width_val(T)
     α = _α * W
     β = _β * W
-    L₁ₑ =  first_effective_cache(T) * R₂
-    L₂ₑ = second_effective_cache(T) * R₃
+    L₁ₑ =  first_cache_size(T) * R₂
+    L₂ₑ = second_cache_size(T) * R₃
 
     Kc_init⁻¹ = Base.FastMath.max_fast(√(α/L₁ₑ), Nc*inv(L₂ₑ))
     Kiter = cldapproxi(K, Kc_init⁻¹) # approximate `ceil`
@@ -201,27 +200,28 @@ end
 """
   find_first_acceptable(M, W)
 
-Finds first combination of `Miter` and `Niter` that doesn't make `M` too small while producing `Miter * Niter = NUM_CORES`.
+Finds first combination of `Miter` and `Niter` that doesn't make `M` too small while producing `Miter * Niter = num_cores()`.
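-This would be awkard if there are computers with prime numbers of cores. I should probably consider that possibility at some point.
+This would be awkward if there are computers with prime numbers of cores. I should probably consider that possibility at some point.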
 """
 @inline function find_first_acceptable(M, W)
-    Mᵣ = StaticInt{mᵣ}() * W
-    for (miter,niter) ∈ CORE_FACTORS
-        if miter * ((MᵣW_mul_factor - One()) * Mᵣ) ≤ M + (W + W)
+    mᵣ, _ = matmul_params() # unscaled mᵣ, as used by block_sizes and divide_blocks; the second value is not needed here
+    factors = calc_factors() # (miter, niter) pairs whose product is the core count, largest miter first
+    for (miter, niter) ∈ factors
+        if miter * ((MᵣW_mul_factor() - One()) * (mᵣ * W)) ≤ M + (W + W) # `mᵣ * W` keeps the scaling of the removed `StaticInt{mᵣ}() * W`
             return miter, niter
         end
     end
-    last(CORE_FACTORS)
+    last(factors) # no pair passed the size test: fall back to the last pair (smallest miter)
 end
 """
   divide_blocks(M, Ntotal, _nspawn, W)
 
 Splits both `M` and `N` into blocks when trying to spawn a large number of threads relative to the size of the matrices.
 """
 @inline function divide_blocks(M, Ntotal, _nspawn, W)
-    _nspawn == NUM_CORES && return find_first_acceptable(M, W)
-    
-    Miter = clamp(div_fast(M, W*StaticInt{mᵣ}() * MᵣW_mul_factor), 1, _nspawn)
+    _nspawn == num_cores() && return find_first_acceptable(M, W)
+    mᵣ, nᵣ = matmul_params()
+    Miter = clamp(div_fast(M, W*mᵣ * MᵣW_mul_factor()), 1, _nspawn)
     nspawn = div_fast(_nspawn, Miter)
     if (nspawn ≤ 1) & (Miter < _nspawn)
         # rebalance Miter
 
@@ -15,11 +15,11 @@ function (::LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd})(p::Ptr{UInt}) where {P,TC,TA
 end
 @inline _call_loopmul!(C, A, B, α, β, M, K, N, ::Val{false}) = loopmul!(C, A, B, α, β, M, K, N)
 @inline function _call_loopmul!(C::StridedPointer{T}, A, B, α, β, M, K, N, ::Val{true}) where {T}
-    if M*K < first_effective_cache(T) * R₂Default
+    if M*K < first_cache_size(T) * R₂Default() # threshold: first_cache_size() ÷ static_sizeof(T), scaled by R₂Default()
         packaloopmul!(C, A, B, α, β, M, K, N)
         return
     else
-        matmul_st_only_pack_A!(C, A, B, α, β, M, K, N, StaticFloat{W₁Default}(), StaticFloat{W₂Default}(), StaticFloat{R₁Default}(), StaticFloat{R₂Default}())
+        matmul_st_only_pack_A!(C, A, B, α, β, M, K, N, W₁Default(), W₂Default(), R₁Default(), R₂Default()) # the *Default() calls already produce StaticFloat{…}() values
         return
     end
 end
 
@@ -1,47 +1,103 @@
-const BCACHE = UInt8[]
 
 const OCTAVIAN_NUM_TASKS = Ref(1)
 _nthreads() = OCTAVIAN_NUM_TASKS[]
 
-@generated function calc_factors(::Val{nc} = Val{NUM_CORES}()) where {nc}
+@generated function calc_factors(::Union{Val{nc},StaticInt{nc}} = snum_cores()) where {nc} # emits a tuple of (i, nc ÷ i) for every divisor i of nc, from i = nc down to 1
     t = Expr(:tuple)
     for i ∈ nc:-1:1
         d, r = divrem(nc, i)
         iszero(r) && push!(t.args, (i, d))
     end
     t
 end
-const CORE_FACTORS = calc_factors()
-
-const MᵣW_mul_factor = VectorizationBase.REGISTER_SIZE === 64 ? StaticInt{4}() : StaticInt{9}()
-
-if VectorizationBase.AVX512F
-    const W₁Default = 0.006089395198610773
-    const W₂Default = 0.7979822724696168
-    const R₁Default = 0.5900561503730485
-    const R₂Default = 0.762152930709678
-else
-    const W₁Default = 0.1 # TODO: relax bounds; this was the upper bound set for the optimizer.
-    const W₂Default = 0.15989396641218157
-    const R₁Default = 0.4203583148344484
-    const R₂Default = 0.6344856142604789
-end
-
-const FIRST__CACHE = 1 + (VectorizationBase.CACHE_SIZE[3] !== nothing)
-const SECOND_CACHE = 2 + (VectorizationBase.CACHE_SIZE[3] !== nothing)
-const FIRST__CACHE_SIZE = VectorizationBase.CACHE_SIZE[FIRST__CACHE] === nothing ? 262144 :
-    (((FIRST__CACHE == 2) & CACHE_INCLUSIVITY[2]) ? (VectorizationBase.CACHE_SIZE[2] - VectorizationBase.CACHE_SIZE[1]) :
-    VectorizationBase.CACHE_SIZE[FIRST__CACHE])
-const SECOND_CACHE_SIZE = (VectorizationBase.CACHE_SIZE[SECOND_CACHE] === nothing ? 3145728 :
-    (CACHE_INCLUSIVITY[SECOND_CACHE] ? (VectorizationBase.CACHE_SIZE[SECOND_CACHE] - VectorizationBase.CACHE_SIZE[FIRST__CACHE]) :
-    VectorizationBase.CACHE_SIZE[SECOND_CACHE])) * something(VectorizationBase.CACHE_COUNT[SECOND_CACHE], 1)
-
-const CACHELINESIZE = something(VectorizationBase.L₁CACHE.linesize, 64) % UInt
-const BCACHE_COUNT = something(VectorizationBase.CACHE_COUNT[3], 1);
+# CORE_FACTORS is no longer a module constant; find_first_acceptable calls calc_factors() directly.
+
+@generated function MᵣW_mul_factor() # generates `StaticInt{4}()` when has_feature reports x86_64_avx512f, otherwise `StaticInt{9}()`
+    f = VectorizationBase.has_feature("x86_64_avx512f") ? 4 : 9
+    Expr(:call, Expr(:curly, :StaticInt, f))
+end
+
+@generated function W₁Default() # generates `StaticFloat{w}()`, with w selected by the CPU checks below
+    w = if VectorizationBase.has_feature("x86_64_avx512f")
+        0.006089395198610773
+    elseif (Sys.CPU_NAME === "znver2") || (Sys.CPU_NAME === "znver3") # these are znver2 values, I'm assuming they're better for znver3 than generic
+        0.1 # same value as the generic fallback in the final branch
+    elseif Sys.CPU_NAME === "znver1"
+        0.053918949422353986
+    else
+        0.1
+    end
+    Expr(:call, Expr(:curly, :StaticFloat, w))
+end
+@generated function W₂Default() # generates `StaticFloat{w}()`, with w selected by the CPU checks below
+    w = if VectorizationBase.has_feature("x86_64_avx512f")
+        0.7979822724696168
+    elseif (Sys.CPU_NAME === "znver2") || (Sys.CPU_NAME === "znver3") # these are znver2 values, I'm assuming they're better for znver3 than generic
+        0.993489411720157
+    elseif Sys.CPU_NAME === "znver1"
+        0.3013238122374886
+    else
+        0.15989396641218157 # value used when none of the checks above match
+    end
+    Expr(:call, Expr(:curly, :StaticFloat, w))
+end
+@generated function R₁Default() # generates `StaticFloat{w}()`, with w selected by the CPU checks below
+    w = if VectorizationBase.has_feature("x86_64_avx512f")
+        0.5900561503730485
+    elseif (Sys.CPU_NAME === "znver2") || (Sys.CPU_NAME === "znver3") # these are znver2 values, I'm assuming they're better for znver3 than generic
+        0.6052218809954467
+    elseif Sys.CPU_NAME === "znver1"
+        0.6077103834481342
+    else
+        0.4203583148344484 # value used when none of the checks above match
+    end
+    Expr(:call, Expr(:curly, :StaticFloat, w))
+end
+@generated function R₂Default() # generates `StaticFloat{w}()`, with w selected by the CPU checks below
+    w = if VectorizationBase.has_feature("x86_64_avx512f")
+        0.762152930709678
+    elseif (Sys.CPU_NAME === "znver2") || (Sys.CPU_NAME === "znver3") # these are znver2 values, I'm assuming they're better for znver3 than generic
+        0.7594052633561165
+    elseif Sys.CPU_NAME === "znver1"
+        0.8775382433240162
+    else
+        0.6344856142604789 # value used when none of the checks above match
+    end
+    Expr(:call, Expr(:curly, :StaticFloat, w))
+end
+
+first_cache() = StaticInt{1}() + (snum_cache_levels() > StaticInt{2}() ? One() : Zero()) # 1, plus 1 when snum_cache_levels() exceeds 2
+second_cache() = StaticInt{2}() + (snum_cache_levels() > StaticInt{2}() ? One() : Zero()) # 2, plus 1 when snum_cache_levels() exceeds 2
+
+function first_cache_size() # size reported for cache level first_cache(), with a fallback and an inclusivity adjustment
+    fcs = scache_size(first_cache())
+    if fcs === Zero() # presumably means the size is unknown (TODO confirm); use the hard-coded 262144 instead
+        return StaticInt(262144)
+    elseif (first_cache() === StaticInt(2)) && cache_inclusivity()[2] # level 2 is in use and flagged in cache_inclusivity(): subtract the level-1 size
+        return fcs - scache_size(One())
+    else
+        return fcs
+    end
+end
+function second_cache_size() # size reported for cache level second_cache(), with a fallback and an inclusivity adjustment
+    scs = scache_size(second_cache())
+    if scs === Zero() # presumably means the size is unknown (TODO confirm); use the hard-coded 3145728 instead
+        return StaticInt(3145728)
+    elseif cache_inclusivity()[second_cache()] # this level is flagged in cache_inclusivity(): subtract the first_cache() level's size
+        return scs - scache_size(first_cache())
+    else
+        return scs
+    end
+end
+first_cache_size(::Type{T}) where {T} = first_cache_size() ÷ static_sizeof(T) # first_cache_size() integer-divided by static_sizeof(T)
+second_cache_size(::Type{T}) where {T} = second_cache_size() ÷ static_sizeof(T) # second_cache_size() integer-divided by static_sizeof(T)
+
+bcache_count() = VectorizationBase.scache_count(second_cache()) # scache_count for level second_cache(); the removed BCACHE_COUNT always indexed level 3
+
+const BCACHEPTR = Ref{Ptr{Cvoid}}(C_NULL)
 const BCACHE_LOCK = Threads.Atomic{UInt}(zero(UInt))
 
 if Sys.WORD_SIZE ≤ 32
-    const ACACHE = UInt8[]
-    const ACACHEPTR = Ref{Ptr{UInt8}}(C_NULL)
+    const ACACHEPTR = Ref{Ptr{Cvoid}}(C_NULL)
 end