M1 performance improvements (#82)

chriselrod · web-flow · commit 2dd77ea29601 · 2021-05-06T02:41:08.000Z
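* Adjust threading thresholds for better performance on M1 (non-x86 architectures now use higher cutoffs for both enabling threading and choosing the number of threads)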

* Fix cache sizes for M1

* Bump version to 0.2.14
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "Octavian"
 uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
 authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"]
-version = "0.2.13"
+version = "0.2.14"
 
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
diff --git a/src/global_constants.jl b/src/global_constants.jl
@@ -48,17 +48,22 @@ R₂Default() = R₂Default(has_feature(Val(:x86_64_avx512f)))
 
 
 
-
-first_cache() = ifelse(gt(num_cache_levels(), StaticInt{2}()), StaticInt{2}(), StaticInt{1}())
+_first_cache(::StaticInt{1}) = StaticInt{1}()
+_first_cache(::StaticInt) = StaticInt{2}()
+first_cache() = _first_cache(VectorizationBase.num_l2cache())
 second_cache() = first_cache() + One()
 
 _first_cache_size(fcs::StaticInt) = ifelse(eq(first_cache(), StaticInt(2)) & cache_inclusive(StaticInt(2)), fcs - cache_size(One()), fcs)
 _first_cache_size(::Nothing) = StaticInt(262144)
 first_cache_size() = _first_cache_size(cache_size(first_cache()))
 
-_second_cache_size(scs::StaticInt) = ifelse(cache_inclusive(second_cache()), scs - cache_size(first_cache()), scs)
-_second_cache_size(::Nothing) = StaticInt(3145728)
-second_cache_size() = _second_cache_size(cache_size(second_cache()))
+_second_cache_size(scs::StaticInt, ::True) = scs - cache_size(first_cache())
+_second_cache_size(scs::StaticInt, ::False) = scs
+_second_cache_size(::StaticInt{0}, ::Nothing) = StaticInt(3145728)
+function second_cache_size()
+    sc = second_cache()
+    _second_cache_size(cache_size(sc), cache_inclusive(sc))
+end
 
 first_cache_size(::Type{T}) where {T} = first_cache_size() ÷ static_sizeof(T)
 second_cache_size(::Type{T}) where {T} = second_cache_size() ÷ static_sizeof(T)
diff --git a/src/matmul.jl b/src/matmul.jl
@@ -260,12 +260,18 @@ end
         if maybeinline(M, N, T, ArrayInterface.is_column_major(A)) # check MUST be compile-time resolvable
             inlineloopmul!(pC, pA, pB, One(), Zero(), M, K, N)
             return
-        elseif (nᵣ ≥ N) || (M*K*N < (StaticInt{4096}() * W))
-            loopmul!(pC, pA, pB, α, β, M, K, N)
-            return
         else
+            (nᵣ ≥ N) && @goto LOOPMUL
+            if (Sys.ARCH === :x86_64) || (Sys.ARCH === :i686)
+                (M*K*N < (StaticInt{4_096}() * W)) && @goto LOOPMUL
+            else
+                (M*K*N < (StaticInt{32_000}() * W)) && @goto LOOPMUL
+            end
             __matmul!(pC, pA, pB, α, β, M, K, N, nthread)
             return
+            @label LOOPMUL
+            loopmul!(pC, pA, pB, α, β, M, K, N)
+            return
         end
     end
 end
@@ -326,11 +332,13 @@ function __matmul!(
         return
     end
     # We are threading, but how many threads?
-    L = StaticInt{128}() * W
-    # L = StaticInt{64}() * W
-    nspawn = clamp(div_fast(M * N, L), 1, _nthread)
-
+    nspawn = if (Sys.ARCH === :x86_64) || (Sys.ARCH === :i686)
+        clamp(div_fast(M * N, StaticInt{128}() * W), 1, _nthread)
+    else
+        clamp(div_fast(M * N, StaticInt{256}() * W), 1, _nthread)
+    end
     # nkern = cld_fast(M * N,  MᵣW * Nᵣ)
+    
     # Approach:
     # Check if we don't want to pack A,
     #    if not, aggressively subdivide