Skip to content

Commit 2dd77ea

Browse files
authored
M1 performance improvements (#82)
* Adjust threading threshold for better performance on M1
* Fix cache sizes for M1
* Bump version.
1 parent 9457cdb commit 2dd77ea

File tree

3 files changed

+26
-13
lines changed

3 files changed

+26
-13
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "Octavian"
22
uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
33
authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"]
4-
version = "0.2.13"
4+
version = "0.2.14"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"

src/global_constants.jl

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,17 +48,22 @@ R₂Default() = R₂Default(has_feature(Val(:x86_64_avx512f)))
4848

4949

5050

51-
52-
first_cache() = ifelse(gt(num_cache_levels(), StaticInt{2}()), StaticInt{2}(), StaticInt{1}())
51+
_first_cache(::StaticInt{1}) = StaticInt{1}()
52+
_first_cache(::StaticInt) = StaticInt{2}()
53+
first_cache() = _first_cache(VectorizationBase.num_l2cache())
5354
second_cache() = first_cache() + One()
5455

5556
_first_cache_size(fcs::StaticInt) = ifelse(eq(first_cache(), StaticInt(2)) & cache_inclusive(StaticInt(2)), fcs - cache_size(One()), fcs)
5657
_first_cache_size(::Nothing) = StaticInt(262144)
5758
first_cache_size() = _first_cache_size(cache_size(first_cache()))
5859

59-
_second_cache_size(scs::StaticInt) = ifelse(cache_inclusive(second_cache()), scs - cache_size(first_cache()), scs)
60-
_second_cache_size(::Nothing) = StaticInt(3145728)
61-
second_cache_size() = _second_cache_size(cache_size(second_cache()))
60+
_second_cache_size(scs::StaticInt, ::True) = scs - cache_size(first_cache())
61+
_second_cache_size(scs::StaticInt, ::False) = scs
62+
_second_cache_size(::StaticInt{0}, ::Nothing) = StaticInt(3145728)
63+
function second_cache_size()
64+
sc = second_cache()
65+
_second_cache_size(cache_size(sc), cache_inclusive(sc))
66+
end
6267

6368
first_cache_size(::Type{T}) where {T} = first_cache_size() ÷ static_sizeof(T)
6469
second_cache_size(::Type{T}) where {T} = second_cache_size() ÷ static_sizeof(T)

src/matmul.jl

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -260,12 +260,18 @@ end
260260
if maybeinline(M, N, T, ArrayInterface.is_column_major(A)) # check MUST be compile-time resolvable
261261
inlineloopmul!(pC, pA, pB, One(), Zero(), M, K, N)
262262
return
263-
elseif (nᵣ ≥ N) || (M*K*N < (StaticInt{4096}() * W))
264-
loopmul!(pC, pA, pB, α, β, M, K, N)
265-
return
266263
else
264+
(nᵣ ≥ N) && @goto LOOPMUL
265+
if (Sys.ARCH === :x86_64) || (Sys.ARCH === :i686)
266+
(M*K*N < (StaticInt{4_096}() * W)) && @goto LOOPMUL
267+
else
268+
(M*K*N < (StaticInt{32_000}() * W)) && @goto LOOPMUL
269+
end
267270
__matmul!(pC, pA, pB, α, β, M, K, N, nthread)
268271
return
272+
@label LOOPMUL
273+
loopmul!(pC, pA, pB, α, β, M, K, N)
274+
return
269275
end
270276
end
271277
end
@@ -326,11 +332,13 @@ function __matmul!(
326332
return
327333
end
328334
# We are threading, but how many threads?
329-
L = StaticInt{128}() * W
330-
# L = StaticInt{64}() * W
331-
nspawn = clamp(div_fast(M * N, L), 1, _nthread)
332-
335+
nspawn = if (Sys.ARCH === :x86_64) || (Sys.ARCH === :i686)
336+
clamp(div_fast(M * N, StaticInt{128}() * W), 1, _nthread)
337+
else
338+
clamp(div_fast(M * N, StaticInt{256}() * W), 1, _nthread)
339+
end
333340
# nkern = cld_fast(M * N, MᵣW * Nᵣ)
341+
334342
# Approach:
335343
# Check if we don't want to pack A,
336344
# if not, aggressively subdivide

0 commit comments

Comments
 (0)