|
| 1 | + |
| 2 | + |
| 3 | +using Octavian, VectorizationBase, ProgressMeter |
| 4 | +using Octavian: StaticFloat |
"""
    matmul_pack_ab!(C, A, B, Val(W₁), Val(W₂), Val(R₁), Val(R₂))

Time a single call to `Octavian.matmul_pack_A_and_B!` on `C`, `A` and `B`.

The four `Val` type parameters are forwarded to Octavian as `StaticFloat`
instances. The problem sizes are read from `size(C)` and `size(B, 1)`, and the
task count passed on is the smaller of `Threads.nthreads()` and
`VectorizationBase.NUM_CORES`.

Returns the elapsed time in seconds as reported by `@elapsed`.
"""
function matmul_pack_ab!(C, A, B, ::Val{W₁}, ::Val{W₂}, ::Val{R₁}, ::Val{R₂}) where {W₁, W₂, R₁, R₂}
    M, N = size(C)
    K = size(B, 1)
    zc = Octavian.zstridedpointer(C)
    za = Octavian.zstridedpointer(A)
    zb = Octavian.zstridedpointer(B)
    nspawn = min(Threads.nthreads(), VectorizationBase.NUM_CORES)
    seconds = @elapsed begin
        Octavian.matmul_pack_A_and_B!(
            zc, za, zb, StaticInt{1}(), StaticInt{0}(), M, K, N, nspawn,
            StaticFloat{W₁}(), StaticFloat{W₂}(), StaticFloat{R₁}(), StaticFloat{R₂}()
        )
    end
    return seconds
end
| 16 | + |
"""
    bench_size(Cs, As, Bs, Val(W₁), Val(W₂), Val(R₁), Val(R₂))

Benchmark `matmul_pack_ab!` over the matrix triples obtained by zipping `Cs`,
`As` and `Bs`, and return the mean throughput in GFLOPS.

Each triple contributes `2e-9 * M * K * N / t`, where `M, K, N` come from
`Octavian.matmul_sizes` and `t` is the time returned by `matmul_pack_ab!`.
The sum is divided by `length(As)`.
"""
function bench_size(Cs, As, Bs, ::Val{W₁}, ::Val{W₂}, ::Val{R₁}, ::Val{R₂}) where {W₁, W₂, R₁, R₂}
    tuning = (Val{W₁}(), Val{W₂}(), Val{R₁}(), Val{R₂}())

    # One untimed call on whichever end of the collection holds the smaller `C`
    # (presumably a warm-up so that compilation is not included in the timings).
    pick = length(first(Cs)) < length(last(Cs)) ? first : last
    matmul_pack_ab!(pick(Cs), pick(As), pick(Bs), tuning...)

    total = 0.0
    for (C, A, B) in zip(Cs, As, Bs)
        M, K, N = Octavian.matmul_sizes(C, A, B)
        elapsed = matmul_pack_ab!(C, A, B, tuning...)
        total += 2e-9M * K * N / elapsed
    end
    return total / length(As)
end
"""
    matrix_sizes(s::Int)
    matrix_sizes(MKN::NTuple{3,Int})

Return an `(M, K, N)` size triple. A single integer `s` is expanded to
`(s, s, s)`; a triple is returned unchanged.
"""
function matrix_sizes(s::Int)
    return (s, s, s)
end
function matrix_sizes(MKN::NTuple{3,Int})
    return MKN
end
"""
    size_range(l, u, len)

Return `len` integers running from `l` to `u` that are evenly spaced on a
logarithmic scale, each rounded to the nearest `Int`.
"""
function size_range(l, u, len)
    logs = range(log(l), stop = log(u), length = len)
    return map(x -> round(Int, exp(x)), logs)
end
"""
    matrix_range(l, u, len, T = Float64)

Build the size list with `size_range(l, u, len)` and forward it, together with
the element type `T`, to `matrix_range(S, T)`.
"""
matrix_range(l, u, len, ::Type{T} = Float64) where {T} =
    matrix_range(size_range(l, u, len), T)
"""
    matrix_range(S, T = Float64)

Create one `(C, A, B)` matrix triple for every entry of `S` and return the
three collections as `(Cs, As, Bs)`.

Each entry of `S` is converted to `(M, K, N)` by `matrix_sizes`. All `A`
matrices are `M × K` reshaped views into the leading elements of one shared
random buffer, and likewise `B` (`K × N`) and `C` (`M × N`) each share their own
buffer. The buffers are sized for the largest matrix of their kind, so matrices
of the same kind alias each other's memory.
"""
function matrix_range(S, ::Type{T} = Float64) where {T}
    dims = [matrix_sizes(s) for s in S]

    # Each backing buffer must be large enough for the biggest matrix it hosts.
    Abuf = rand(T, mapreduce(d -> d[1] * d[2], max, dims; init = 0))
    Bbuf = rand(T, mapreduce(d -> d[2] * d[3], max, dims; init = 0))
    Cbuf = rand(T, mapreduce(d -> d[1] * d[3], max, dims; init = 0))

    # Concrete type of a 2-d reshape of a `OneTo` view into a `Vector{T}`.
    MatT = Base.ReshapedArray{T, 2, SubArray{T, 1, Vector{T}, Tuple{Base.OneTo{Int}}, true}, Tuple{}}
    window(buf, rows, cols) = reshape(view(buf, Base.OneTo(rows * cols)), (rows, cols))

    As = MatT[window(Abuf, M, K) for (M, K, _) in dims]
    Bs = MatT[window(Bbuf, K, N) for (_, K, N) in dims]
    Cs = MatT[window(Cbuf, M, N) for (M, _, N) in dims]
    return Cs, As, Bs
end
| 60 | + |
| 61 | + |
# Element type of every benchmark matrix.
T = Float64

# Square side lengths whose element count occupies 0.65x and 32x the value of
# `CACHE_SIZE[3]`, measured in elements of type `T`.
min_size, max_size = let cache = Octavian.VectorizationBase.CACHE_SIZE[3], bytes = sizeof(T)
    side(scale) = round(Int, sqrt(scale * cache / bytes))
    (side(0.65), side(32))
end

# 100 log-spaced sizes, ordered from the largest down to the smallest.
SR = size_range(max_size, min_size, 100);
const CsConst, AsConst, BsConst = matrix_range(SR, T);
| 68 | + |
"""
    matmul_objective(params)

Objective function for the optimiser. Unpacks `params` as `W₁, W₂, R₁, R₂`,
benchmarks them with `bench_size` on the pre-allocated `CsConst`, `AsConst` and
`BsConst`, prints the parameters and the resulting GFLOPS, and returns the
negated GFLOPS so that minimising the objective maximises throughput.
"""
function matmul_objective(params)
    print("Params: ", params, "; ")
    W₁, W₂, R₁, R₂ = params
    # The runtime values are lifted into the type domain via `Val`.
    score = bench_size(CsConst, AsConst, BsConst, Val(W₁), Val(W₂), Val(R₁), Val(R₂))
    println(score)
    return -score
end
| 77 | + |
| 78 | +using Optim |
# Time units expressed in seconds.
hours = 3600.0
days = 24 * hours

# Starting point: Octavian's default values, in the order W₁, W₂, R₁, R₂.
init = [Octavian.W₁Default, Octavian.W₂Default, Octavian.R₁Default, Octavian.R₂Default]

# Particle-swarm search inside a box, capped at 10^6 iterations or 8 hours.
opt = let lower = [0.001, 0.01, 0.3, 0.4], upper = [0.1, 2.0, 0.9, 0.99]
    Optim.optimize(
        matmul_objective,
        init,
        ParticleSwarm(lower = lower, upper = upper),
        Optim.Options(iterations = 10^6, time_limit = 8 * hours),
    )
end;
| 86 | + |
| 87 | + |
| 88 | + |
| 89 | + |
| 90 | + |
0 commit comments