From 93186dd5df80824916b7f924b418c09a3406dfa3 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 25 Jun 2025 12:20:26 -0300 Subject: [PATCH 1/7] Test expanded benchmarking [only benchmarks] --- perf/array.jl | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/perf/array.jl b/perf/array.jl index 954014ccc..0d94e8a2b 100644 --- a/perf/array.jl +++ b/perf/array.jl @@ -10,7 +10,7 @@ for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shar gpu_vec = reshape(gpu_mat, length(gpu_mat)) gpu_arr_3d = reshape(gpu_mat, (m, 40, 25)) gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10)) - gpu_mat_ints = MtlMatrix{Int,S}(rand(rng, Int, m, n)) + gpu_mat_ints = MtlMatrix{Int,S}(rand(rng, -10:10, m, n)) gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints)) gpu_mat_bools = MtlMatrix{Bool,S}(rand(rng, Bool, m, n)) gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools)) @@ -58,19 +58,43 @@ for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shar # no need to test inplace version, which performs the same operation (but with an alloc) let group = addgroup!(group, "accumulate") - group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec) - group["2d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=1) + let group = addgroup!(group, "Float32") + group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec) + group["2d_dims_1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=1) + group["2d_dims_2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=2) + end + let group = addgroup!(group, "Int64") + group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec_ints) + group["2d_dims_1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims=1) + group["2d_dims_2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims=2) + end end let group = addgroup!(group, "reductions") 
let group = addgroup!(group, "reduce") - group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec) - group["2d"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=1) + let group = addgroup!(group, "Float32") + group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec) + group["2d_dims_1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=1) + group["2d_dims_2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=2) + end + let group = addgroup!(group, "Int64") + group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec_ints) + group["2d_dims_1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims=1) + group["2d_dims_2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims=2) + end end let group = addgroup!(group, "mapreduce") - group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec) - group["2d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=1) + let group = addgroup!(group, "Float32") + group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec) + group["2d_dims_1"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=1) + group["2d_dims_2"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=2) + end + let group = addgroup!(group, "Int64") + group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec_ints) + group["2d_dims_1"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat_ints; dims=1) + group["2d_dims_2"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat_ints; dims=2) + end end # used by sum, prod, minimum, maximum, all, any, count From 5703e315a144ff92a62227682e94218829eade7c Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 25 Jun 2025 13:07:20 -0300 Subject: [PATCH 2/7] Naming --- perf/array.jl | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/perf/array.jl b/perf/array.jl index 0d94e8a2b..f3ef0adf2 100644 --- a/perf/array.jl 
+++ b/perf/array.jl @@ -60,13 +60,13 @@ for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shar let group = addgroup!(group, "accumulate") let group = addgroup!(group, "Float32") group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec) - group["2d_dims_1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=1) - group["2d_dims_2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=2) + group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=1) + group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=2) end let group = addgroup!(group, "Int64") group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec_ints) - group["2d_dims_1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims=1) - group["2d_dims_2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims=2) + group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims=1) + group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims=2) end end @@ -74,26 +74,26 @@ for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shar let group = addgroup!(group, "reduce") let group = addgroup!(group, "Float32") group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec) - group["2d_dims_1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=1) - group["2d_dims_2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=2) + group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=1) + group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=2) end let group = addgroup!(group, "Int64") group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec_ints) - group["2d_dims_1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims=1) - group["2d_dims_2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims=2) + group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims=1) + group["dims=2"] = @benchmarkable 
Metal.@sync reduce(+, $gpu_mat_ints; dims=2) end end let group = addgroup!(group, "mapreduce") let group = addgroup!(group, "Float32") group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec) - group["2d_dims_1"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=1) - group["2d_dims_2"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=2) + group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=1) + group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=2) end let group = addgroup!(group, "Int64") group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec_ints) - group["2d_dims_1"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat_ints; dims=1) - group["2d_dims_2"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat_ints; dims=2) + group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat_ints; dims=1) + group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat_ints; dims=2) end end From 55eceff5a23881ddf18b9044eaa7a34b0fd47a46 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 20 May 2025 22:34:28 -0300 Subject: [PATCH 3/7] Test AK accumulate --- Project.toml | 2 ++ src/Metal.jl | 1 + src/accumulate.jl | 15 +++++++-------- test/Project.toml | 2 ++ test/runtests.jl | 2 ++ 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/Project.toml b/Project.toml index 6129ad5df..4c42f4bfd 100644 --- a/Project.toml +++ b/Project.toml @@ -3,6 +3,7 @@ uuid = "dde4c033-4e86-420c-a63e-0dd931031962" version = "1.6.2" [deps] +AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82" @@ -32,6 +33,7 @@ SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" SpecialFunctionsExt = "SpecialFunctions" [compat] 
+AcceleratedKernels = "0.3.3" Adapt = "4" BFloat16s = "0.5" CEnum = "0.4, 0.5" diff --git a/src/Metal.jl b/src/Metal.jl index b6c974588..b9bdad0e1 100644 --- a/src/Metal.jl +++ b/src/Metal.jl @@ -12,6 +12,7 @@ using ExprTools: splitdef, combinedef using ObjectiveC, .CoreFoundation, .Foundation, .Dispatch, .OS import ObjectiveC: is_macos, darwin_version, macos_version import KernelAbstractions +import AcceleratedKernels as AK using ScopedValues include("version.jl") diff --git a/src/accumulate.jl b/src/accumulate.jl index 31e2dc4fe..dee031fb3 100644 --- a/src/accumulate.jl +++ b/src/accumulate.jl @@ -170,27 +170,26 @@ end ## Base interface Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlVector, dims::Nothing, init::Nothing) = - scan!(op, output, input; dims=1) + @inline AK.accumulate!(op, output, input; dims, init=AK.neutral_element(op, eltype(output)), alg=AK.ScanPrefixes()) Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlArray, dims::Integer, init::Nothing) = - scan!(op, output, input; dims=dims) - + @inline AK.accumulate!(op, output, input; dims, init=AK.neutral_element(op, eltype(output)), alg=AK.ScanPrefixes()) Base._accumulate!(op, output::WrappedMtlArray, input::MtlVector, dims::Nothing, init::Some) = - scan!(op, output, input; dims=1, init=init) + @inline AK.accumulate!(op, output, input; dims, init=something(init), alg=AK.ScanPrefixes()) Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlArray, dims::Integer, init::Some) = - scan!(op, output, input; dims=dims, init=init) + @inline AK.accumulate!(op, output, input; dims, init=something(init), alg=AK.ScanPrefixes()) -Base.accumulate_pairwise!(op, result::WrappedMtlVector, v::WrappedMtlVector) = accumulate!(op, result, v) +Base.accumulate_pairwise!(op, result::WrappedMtlVector, v::WrappedMtlVector) = @inline AK.accumulate!(op, result, v; init=AK.neutral_element(op, eltype(result)), alg=AK.ScanPrefixes()) # default behavior unless dims are specified by the user 
function Base.accumulate(op, A::WrappedMtlArray; dims::Union{Nothing,Integer}=nothing, kw...) + nt = values(kw) if dims === nothing && !(A isa AbstractVector) # This branch takes care of the cases not handled by `_accumulate!`. - return reshape(accumulate(op, A[:]; kw...), size(A)) + return reshape(AK.accumulate(op, A[:]; init = (:init in keys(kw) ? nt.init : AK.neutral_element(op, eltype(A))), alg=AK.ScanPrefixes()), size(A)) end - nt = values(kw) if isempty(kw) out = similar(A, Base.promote_op(op, eltype(A), eltype(A))) elseif keys(nt) === (:init,) diff --git a/test/Project.toml b/test/Project.toml index b64b414d6..a511d64fb 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,4 +1,5 @@ [deps] +AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" @@ -11,6 +12,7 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" ObjectiveC = "e86c9b32-1129-44ac-8ea0-90d5bb39ded9" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/test/runtests.jl b/test/runtests.jl index b46c4ee71..0fdaaa134 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,3 +1,5 @@ +using Pkg +Pkg.develop("AcceleratedKernels") using Distributed using Dates using Metal From bce8a7b5b4c7742b9e66d7bec8fcfe9916830f2a Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 21 May 2025 10:14:44 -0300 Subject: [PATCH 4/7] [only benchmarks] --- .buildkite/pipeline.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 2d69ec0e5..bbeb3aa9c 100644 --- a/.buildkite/pipeline.yml +++ 
b/.buildkite/pipeline.yml @@ -138,6 +138,7 @@ steps: println("--- :julia: Instantiating project") Pkg.develop([PackageSpec(path=pwd())]) + Pkg.add(url="https://github.com/JuliaGPU/AcceleratedKernels.jl", rev="main") println("+++ :julia: Benchmarking") include("perf/runbenchmarks.jl")' From 5c2f529228e28c8e00c9083d1b8f2836bb03a8f9 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 27 May 2025 11:11:19 -0300 Subject: [PATCH 5/7] Use default algorithm from AK --- src/accumulate.jl | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/accumulate.jl b/src/accumulate.jl index dee031fb3..7af43a09f 100644 --- a/src/accumulate.jl +++ b/src/accumulate.jl @@ -170,17 +170,17 @@ end ## Base interface Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlVector, dims::Nothing, init::Nothing) = - @inline AK.accumulate!(op, output, input; dims, init=AK.neutral_element(op, eltype(output)), alg=AK.ScanPrefixes()) + @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=AK.neutral_element(op, eltype(output))) Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlArray, dims::Integer, init::Nothing) = - @inline AK.accumulate!(op, output, input; dims, init=AK.neutral_element(op, eltype(output)), alg=AK.ScanPrefixes()) + @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=AK.neutral_element(op, eltype(output))) Base._accumulate!(op, output::WrappedMtlArray, input::MtlVector, dims::Nothing, init::Some) = - @inline AK.accumulate!(op, output, input; dims, init=something(init), alg=AK.ScanPrefixes()) + @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=something(init)) Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlArray, dims::Integer, init::Some) = - @inline AK.accumulate!(op, output, input; dims, init=something(init), alg=AK.ScanPrefixes()) + @inline AK.accumulate!(op, output, input, MetalBackend(); dims, 
init=something(init)) -Base.accumulate_pairwise!(op, result::WrappedMtlVector, v::WrappedMtlVector) = @inline AK.accumulate!(op, result, v; init=AK.neutral_element(op, eltype(result)), alg=AK.ScanPrefixes()) +Base.accumulate_pairwise!(op, result::WrappedMtlVector, v::WrappedMtlVector) = @inline AK.accumulate!(op, result, v, MetalBackend(); init=AK.neutral_element(op, eltype(result))) # default behavior unless dims are specified by the user function Base.accumulate(op, A::WrappedMtlArray; @@ -188,14 +188,16 @@ function Base.accumulate(op, A::WrappedMtlArray; nt = values(kw) if dims === nothing && !(A isa AbstractVector) # This branch takes care of the cases not handled by `_accumulate!`. - return reshape(AK.accumulate(op, A[:]; init = (:init in keys(kw) ? nt.init : AK.neutral_element(op, eltype(A))), alg=AK.ScanPrefixes()), size(A)) + return reshape(AK.accumulate(op, A[:], MetalBackend(); init = (:init in keys(kw) ? nt.init : AK.neutral_element(op, eltype(A)))), size(A)) end if isempty(kw) out = similar(A, Base.promote_op(op, eltype(A), eltype(A))) + init = AK.neutral_element(op, eltype(out)) elseif keys(nt) === (:init,) out = similar(A, Base.promote_op(op, typeof(nt.init), eltype(A))) + init = nt.init else throw(ArgumentError("accumulate does not support the keyword arguments $(setdiff(keys(nt), (:init,)))")) end - accumulate!(op, out, A; dims=dims, kw...) 
+ AK.accumulate!(op, out, A, MetalBackend(); dims, init) end From d2e94b81b86b931fdaad34a4a9f1f27346ea900d Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 27 May 2025 11:13:46 -0300 Subject: [PATCH 6/7] Compat --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 4c42f4bfd..7038e1edd 100644 --- a/Project.toml +++ b/Project.toml @@ -33,7 +33,7 @@ SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" SpecialFunctionsExt = "SpecialFunctions" [compat] -AcceleratedKernels = "0.3.3" +AcceleratedKernels = "0.4" Adapt = "4" BFloat16s = "0.5" CEnum = "0.4, 0.5" From e1935eda5ca4934ef85b91276a011f43dcc7100e Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 25 Jun 2025 15:37:36 -0300 Subject: [PATCH 7/7] Use AK for supported reductions --- src/mapreduce.jl | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/mapreduce.jl b/src/mapreduce.jl index 8a353e3c2..59e5f8dbd 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -142,6 +142,33 @@ end ## COV_EXCL_STOP +Base.mapreduce(f, op, A::WrappedMtlArray; + dims=:, init=nothing) = _mapreduce(f, op, A, init, dims) + # dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims) +Base.mapreduce(f, op, A::Broadcast.Broadcasted{<:MtlArrayStyle}; + dims=:, init=nothing) = _mapreduce(f, op, A, init, dims) + # dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims) + +# "Borrowed" from GPUArrays +@inline function _init_value(f, op, init, As...) 
+ if init === nothing + ET = Broadcast.combine_eltypes(f, As) + ET = Base.promote_op(op, ET, ET) + (ET === Union{} || ET === Any) && + error("mapreduce cannot figure the output element type, please pass an explicit init value") + + init = AK.neutral_element(op, ET) + end + return init +end + +function _mapreduce(f, op, A, init, dims::Union{Nothing, Integer}) + init_val = _init_value(f, op, init, A) + AK.mapreduce(f, op, A; init=init_val, neutral=init_val, dims) +end +_mapreduce(f, op, A, init, ::Colon) = _mapreduce(f, op, A, init, nothing) +_mapreduce(f, op, A, init, dims) = GPUArrays._mapreduce(f, op, A; dims, init) + function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T}, A::Union{AbstractArray,Broadcast.Broadcasted}; init=nothing) where {F, OP, T}