1 change: 1 addition & 0 deletions .buildkite/pipeline.yml
@@ -138,6 +138,7 @@ steps:
 
     println("--- :julia: Instantiating project")
     Pkg.develop([PackageSpec(path=pwd())])
+    Pkg.add(url="https://github.com/JuliaGPU/AcceleratedKernels.jl", rev="main")
 
     println("+++ :julia: Benchmarking")
     include("perf/runbenchmarks.jl")'
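For local reproduction, the same setup can be run from a Metal.jl checkout; a rough sketch (the `perf` project layout and the activation step are assumptions, not taken from the pipeline):

    # Hypothetical local equivalent of the CI benchmark step above
    using Pkg
    Pkg.activate("perf")                  # assumed: benchmarks live in their own environment
    Pkg.develop(PackageSpec(path=pwd()))  # this Metal.jl checkout
    Pkg.add(url="https://github.com/JuliaGPU/AcceleratedKernels.jl", rev="main")
    include("perf/runbenchmarks.jl")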
2 changes: 2 additions & 0 deletions Project.toml
@@ -3,6 +3,7 @@ uuid = "dde4c033-4e86-420c-a63e-0dd931031962"
 version = "1.6.2"
 
 [deps]
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
 CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
@@ -32,6 +33,7 @@ SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 SpecialFunctionsExt = "SpecialFunctions"
 
 [compat]
+AcceleratedKernels = "0.4"
 Adapt = "4"
 BFloat16s = "0.5"
 CEnum = "0.4, 0.5"
38 changes: 31 additions & 7 deletions perf/array.jl
@@ -10,7 +10,7 @@ for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shar
     gpu_vec = reshape(gpu_mat, length(gpu_mat))
     gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
     gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
-    gpu_mat_ints = MtlMatrix{Int,S}(rand(rng, Int, m, n))
+    gpu_mat_ints = MtlMatrix{Int,S}(rand(rng, -10:10, m, n))
     gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
     gpu_mat_bools = MtlMatrix{Bool,S}(rand(rng, Bool, m, n))
     gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))
@@ -58,19 +58,43 @@ for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shar
 
     # no need to test inplace version, which performs the same operation (but with an alloc)
     let group = addgroup!(group, "accumulate")
-        group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec)
-        group["2d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=1)
+        let group = addgroup!(group, "Float32")
+            group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec)
+            group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=1)
+            group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=2)
+        end
+        let group = addgroup!(group, "Int64")
+            group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec_ints)
+            group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims=1)
+            group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims=2)
+        end
     end
 
     let group = addgroup!(group, "reductions")
        let group = addgroup!(group, "reduce")
-            group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec)
-            group["2d"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=1)
+            let group = addgroup!(group, "Float32")
+                group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec)
+                group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=1)
+                group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=2)
+            end
+            let group = addgroup!(group, "Int64")
+                group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec_ints)
+                group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims=1)
+                group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims=2)
+            end
        end
 
        let group = addgroup!(group, "mapreduce")
-            group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec)
-            group["2d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=1)
+            let group = addgroup!(group, "Float32")
+                group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec)
+                group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=1)
+                group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=2)
+            end
+            let group = addgroup!(group, "Int64")
+                group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec_ints)
+                group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat_ints; dims=1)
+                group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat_ints; dims=2)
+            end
        end
 
        # used by sum, prod, minimum, maximum, all, any, count
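These nested groups key each benchmark by scalar type and reduction shape. A minimal sketch of how such a BenchmarkTools.jl suite is assembled and queried (group names and array sizes here are illustrative only, not those used by runbenchmarks.jl):

    # Standalone sketch of the nested-group pattern used above (sizes made up)
    using BenchmarkTools, Metal

    suite = BenchmarkGroup()
    array_group = addgroup!(suite, "private array")
    acc = addgroup!(array_group, "accumulate")
    f32 = addgroup!(acc, "Float32")

    gpu_vec = MtlArray(rand(Float32, 1_000_000))
    f32["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec)

    results = run(suite)
    # timings live at results["private array"]["accumulate"]["Float32"]["1d"]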
1 change: 1 addition & 0 deletions src/Metal.jl
@@ -12,6 +12,7 @@ using ExprTools: splitdef, combinedef
 using ObjectiveC, .CoreFoundation, .Foundation, .Dispatch, .OS
 import ObjectiveC: is_macos, darwin_version, macos_version
 import KernelAbstractions
+import AcceleratedKernels as AK
 using ScopedValues
 
 include("version.jl")
19 changes: 10 additions & 9 deletions src/accumulate.jl
@@ -170,33 +170,34 @@ end
 ## Base interface
 
 Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlVector, dims::Nothing, init::Nothing) =
-    scan!(op, output, input; dims=1)
+    @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=AK.neutral_element(op, eltype(output)))
 
 Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlArray, dims::Integer, init::Nothing) =
-    scan!(op, output, input; dims=dims)
+    @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=AK.neutral_element(op, eltype(output)))
 
 Base._accumulate!(op, output::WrappedMtlArray, input::MtlVector, dims::Nothing, init::Some) =
-    scan!(op, output, input; dims=1, init=init)
+    @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=something(init))
 
 Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlArray, dims::Integer, init::Some) =
-    scan!(op, output, input; dims=dims, init=init)
+    @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=something(init))
 
-Base.accumulate_pairwise!(op, result::WrappedMtlVector, v::WrappedMtlVector) = accumulate!(op, result, v)
+Base.accumulate_pairwise!(op, result::WrappedMtlVector, v::WrappedMtlVector) = @inline AK.accumulate!(op, result, v, MetalBackend(); init=AK.neutral_element(op, eltype(result)))
 
 # default behavior unless dims are specified by the user
 function Base.accumulate(op, A::WrappedMtlArray;
                          dims::Union{Nothing,Integer}=nothing, kw...)
+    nt = values(kw)
     if dims === nothing && !(A isa AbstractVector)
         # This branch takes care of the cases not handled by `_accumulate!`.
-        return reshape(accumulate(op, A[:]; kw...), size(A))
+        return reshape(AK.accumulate(op, A[:], MetalBackend(); init = (:init in keys(kw) ? nt.init : AK.neutral_element(op, eltype(A)))), size(A))
     end
-    nt = values(kw)
     if isempty(kw)
         out = similar(A, Base.promote_op(op, eltype(A), eltype(A)))
+        init = AK.neutral_element(op, eltype(out))
     elseif keys(nt) === (:init,)
         out = similar(A, Base.promote_op(op, typeof(nt.init), eltype(A)))
+        init = nt.init
     else
         throw(ArgumentError("accumulate does not support the keyword arguments $(setdiff(keys(nt), (:init,)))"))
     end
-    accumulate!(op, out, A; dims=dims, kw...)
+    AK.accumulate!(op, out, A, MetalBackend(); dims, init)
 end
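With this, every scan entry point funnels into AcceleratedKernels. A hedged sketch of the user-facing behaviour, assuming Base's accumulate semantics carry over unchanged (requires Metal-capable hardware):

    # Illustrative calls exercising the new dispatch
    using Metal
    A = MtlArray(Float32[1 2; 3 4])
    accumulate(+, A; dims=2)           # row-wise scan: Float32[1 3; 3 7]
    accumulate(+, A[:, 1])             # vector scan:   Float32[1, 4]
    accumulate(+, A[:, 1]; init=10f0)  # explicit init: Float32[11, 14]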
27 changes: 27 additions & 0 deletions src/mapreduce.jl
@@ -142,6 +142,33 @@ end
 
 ## COV_EXCL_STOP
 
+Base.mapreduce(f, op, A::WrappedMtlArray;
+               dims=:, init=nothing) = _mapreduce(f, op, A, init, dims)
+               # dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims)
+Base.mapreduce(f, op, A::Broadcast.Broadcasted{<:MtlArrayStyle};
+               dims=:, init=nothing) = _mapreduce(f, op, A, init, dims)
+               # dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims)
+
+# "Borrowed" from GPUArrays
+@inline function _init_value(f, op, init, As...)
+    if init === nothing
+        ET = Broadcast.combine_eltypes(f, As)
+        ET = Base.promote_op(op, ET, ET)
+        (ET === Union{} || ET === Any) &&
+            error("mapreduce cannot figure the output element type, please pass an explicit init value")
+
+        init = AK.neutral_element(op, ET)
+    end
+    return init
+end
+
+function _mapreduce(f, op, A, init, dims::Union{Nothing, Integer})
+    init_val = _init_value(f, op, init, A)
+    AK.mapreduce(f, op, A; init=init_val, neutral=init_val, dims)
+end
+_mapreduce(f, op, A, init, ::Colon) = _mapreduce(f, op, A, init, nothing)
+_mapreduce(f, op, A, init, dims) = GPUArrays._mapreduce(f, op, A; dims, init)
+
 function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
                                  A::Union{AbstractArray,Broadcast.Broadcasted};
                                  init=nothing) where {F, OP, T}
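When no `init` is supplied, `_init_value` infers the output element type and falls back to a neutral element for `op`. A small worked sketch; the specific `neutral_element` values are assumptions based on the usual identities:

    # Assumed neutral elements (additive/multiplicative identities)
    import AcceleratedKernels as AK
    AK.neutral_element(+, Float32)  # presumably 0.0f0
    AK.neutral_element(*, Float32)  # presumably 1.0f0

    using Metal
    A = MtlArray(Float32[1, 2, 3])
    mapreduce(x -> x + 1, +, A)              # (1+1) + (2+1) + (3+1) = 9.0f0
    mapreduce(x -> x + 1, +, A; init=100f0)  # 109.0f0: explicit init skips the fallback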
2 changes: 2 additions & 0 deletions test/Project.toml
@@ -1,4 +1,5 @@
 [deps]
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
@@ -11,6 +12,7 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 ObjectiveC = "e86c9b32-1129-44ac-8ea0-90d5bb39ded9"
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
2 changes: 2 additions & 0 deletions test/runtests.jl
@@ -1,3 +1,5 @@
+using Pkg
+Pkg.develop("AcceleratedKernels")
 using Distributed
 using Dates
 using Metal