From fd26e92c7822f850bc0c7592614a8f3e4203ad5b Mon Sep 17 00:00:00 2001 From: jop611 Date: Fri, 9 Aug 2024 12:53:37 +0200 Subject: [PATCH 01/34] changes to consistent and assemble for performance --- src/jagged_array.jl | 10 + src/p_sparse_matrix.jl | 858 +++++++++++++++++++++++++++++------------ 2 files changed, 615 insertions(+), 253 deletions(-) diff --git a/src/jagged_array.jl b/src/jagged_array.jl index 60a74dec..a3d61789 100644 --- a/src/jagged_array.jl +++ b/src/jagged_array.jl @@ -154,6 +154,16 @@ function JaggedArray{T,Ti}(a::AbstractArray{<:AbstractArray}) where {T,Ti} JaggedArray(data,ptrs) end +# New +function jagged_range(a::Union{JaggedArray,GenericJaggedArray},i::Integer) + u = one(eltype(a.ptrs)) + pini = a.ptrs[i] + pend = a.ptrs[i+1]-u + pini:pend +end + + +########### Base.size(a::Union{JaggedArray,GenericJaggedArray}) = (length(a.ptrs)-1,) function Base.getindex(a::Union{JaggedArray,GenericJaggedArray},i::Int) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index f2b65fc8..d9561d6e 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -1357,146 +1357,322 @@ function psparse_assemble_impl(A,::Type,rows) error("Case not implemented yet") end -function psparse_assemble_impl( - A, - ::Type{<:AbstractSplitMatrix}, - rows; - reuse=Val(false), - assembly_neighbors_options_cols=(;)) - - function setup_cache_snd(A,parts_snd,rows_sa,cols_sa) - A_ghost_own = A.blocks.ghost_own - A_ghost_ghost = A.blocks.ghost_ghost - gen = ( owner=>i for (i,owner) in enumerate(parts_snd) ) - owner_to_p = Dict(gen) - ptrs = zeros(Int32,length(parts_snd)+1) - ghost_to_owner_row = ghost_to_owner(rows_sa) - ghost_to_global_row = ghost_to_global(rows_sa) - own_to_global_col = own_to_global(cols_sa) - ghost_to_global_col = ghost_to_global(cols_sa) - for (i,_,_) in nziterator(A_ghost_own) - owner = ghost_to_owner_row[i] - ptrs[owner_to_p[owner]+1] += 1 - end - for (i,_,_) in nziterator(A_ghost_ghost) - owner = ghost_to_owner_row[i] - 
ptrs[owner_to_p[owner]+1] += 1 - end - length_to_ptrs!(ptrs) - Tv = eltype(A_ghost_own) - ndata = ptrs[end]-1 - I_snd_data = zeros(Int,ndata) - J_snd_data = zeros(Int,ndata) - V_snd_data = zeros(Tv,ndata) - k_snd_data = zeros(Int32,ndata) - nnz_ghost_own = 0 - for (k,(i,j,v)) in enumerate(nziterator(A_ghost_own)) - owner = ghost_to_owner_row[i] - p = ptrs[owner_to_p[owner]] - I_snd_data[p] = ghost_to_global_row[i] - J_snd_data[p] = own_to_global_col[j] - V_snd_data[p] = v - k_snd_data[p] = k - ptrs[owner_to_p[owner]] += 1 - nnz_ghost_own += 1 - end - for (k,(i,j,v)) in enumerate(nziterator(A_ghost_ghost)) - owner = ghost_to_owner_row[i] - p = ptrs[owner_to_p[owner]] - I_snd_data[p] = ghost_to_global_row[i] - J_snd_data[p] = ghost_to_global_col[j] - V_snd_data[p] = v - k_snd_data[p] = k+nnz_ghost_own - ptrs[owner_to_p[owner]] += 1 - end - rewind_ptrs!(ptrs) - I_snd = JaggedArray(I_snd_data,ptrs) - J_snd = JaggedArray(J_snd_data,ptrs) - V_snd = JaggedArray(V_snd_data,ptrs) - k_snd = JaggedArray(k_snd_data,ptrs) - (;I_snd,J_snd,V_snd,k_snd,parts_snd) - end - function setup_cache_rcv(I_rcv,J_rcv,V_rcv,parts_rcv) - k_rcv_data = zeros(Int32,length(I_rcv.data)) - k_rcv = JaggedArray(k_rcv_data,I_rcv.ptrs) - (;I_rcv,J_rcv,V_rcv,k_rcv,parts_rcv) - end - function setup_own_triplets(A,cache_rcv,rows_sa,cols_sa) - nz_own_own = findnz(A.blocks.own_own) - nz_own_ghost = findnz(A.blocks.own_ghost) - I_rcv_data = cache_rcv.I_rcv.data - J_rcv_data = cache_rcv.J_rcv.data - V_rcv_data = cache_rcv.V_rcv.data - k_rcv_data = cache_rcv.k_rcv.data - global_to_own_col = global_to_own(cols_sa) - is_ghost = findall(j->global_to_own_col[j]==0,J_rcv_data) - is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data) - I_rcv_own = view(I_rcv_data,is_own) - J_rcv_own = view(J_rcv_data,is_own) - V_rcv_own = view(V_rcv_data,is_own) - k_rcv_own = view(k_rcv_data,is_own) - I_rcv_ghost = view(I_rcv_data,is_ghost) - J_rcv_ghost = view(J_rcv_data,is_ghost) - V_rcv_ghost = view(V_rcv_data,is_ghost) - 
k_rcv_ghost = view(k_rcv_data,is_ghost) - # After this col ids in own_ghost triplet remain global - map_global_to_own!(I_rcv_own,rows_sa) - map_global_to_own!(J_rcv_own,cols_sa) - map_global_to_own!(I_rcv_ghost,rows_sa) - map_ghost_to_global!(nz_own_ghost[2],cols_sa) - own_own_I = vcat(nz_own_own[1],I_rcv_own) - own_own_J = vcat(nz_own_own[2],J_rcv_own) - own_own_V = vcat(nz_own_own[3],V_rcv_own) - own_own_triplet = (own_own_I,own_own_J,own_own_V) - own_ghost_I = vcat(nz_own_ghost[1],I_rcv_ghost) - own_ghost_J = vcat(nz_own_ghost[2],J_rcv_ghost) - own_ghost_V = vcat(nz_own_ghost[3],V_rcv_ghost) - map_global_to_ghost!(nz_own_ghost[2],cols_sa) - own_ghost_triplet = (own_ghost_I,own_ghost_J,own_ghost_V) - triplets = (own_own_triplet,own_ghost_triplet) - aux = (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) - triplets, own_ghost_J, aux - end - function finalize_values(A,rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) - (own_own_triplet,own_ghost_triplet) = triplets - (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) = aux - map_global_to_ghost!(own_ghost_triplet[2],cols_fa) - map_global_to_ghost!(J_rcv_ghost,cols_fa) - TA = typeof(A.blocks.own_own) - n_own_rows = own_length(rows_fa) - n_own_cols = own_length(cols_fa) - n_ghost_rows = ghost_length(rows_fa) - n_ghost_cols = ghost_length(cols_fa) - Ti = indextype(A.blocks.own_own) - Tv = eltype(A.blocks.own_own) - own_own = compresscoo(TA,own_own_triplet...,n_own_rows,n_own_cols) - own_ghost = compresscoo(TA,own_ghost_triplet...,n_own_rows,n_ghost_cols) - ghost_own = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_own_cols) - ghost_ghost = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_ghost_cols) - blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) - values = split_matrix(blocks,local_permutation(rows_fa),local_permutation(cols_fa)) - nnz_own_own = nnz(own_own) - k_own_sa = 
precompute_nzindex(own_own,own_own_triplet[1:2]...) - k_ghost_sa = precompute_nzindex(own_ghost,own_ghost_triplet[1:2]...) - for p in 1:length(I_rcv_own) - i = I_rcv_own[p] - j = J_rcv_own[p] - k_rcv_own[p] = nzindex(own_own,i,j) - end - for p in 1:length(I_rcv_ghost) - i = I_rcv_ghost[p] - j = J_rcv_ghost[p] - k_rcv_ghost[p] = nzindex(own_ghost,i,j) + nnz_own_own - end - cache = (;k_own_sa,k_ghost_sa,cache_snd...,cache_rcv...) - values, cache +# function psparse_assemble_impl( +# A, +# ::Type{<:AbstractSplitMatrix}, +# rows; +# reuse=Val(false), +# assembly_neighbors_options_cols=(;)) + +# function setup_cache_snd(A,parts_snd,rows_sa,cols_sa) +# A_ghost_own = A.blocks.ghost_own +# A_ghost_ghost = A.blocks.ghost_ghost +# gen = ( owner=>i for (i,owner) in enumerate(parts_snd) ) +# owner_to_p = Dict(gen) +# ptrs = zeros(Int32,length(parts_snd)+1) +# ghost_to_owner_row = ghost_to_owner(rows_sa) +# ghost_to_global_row = ghost_to_global(rows_sa) +# own_to_global_col = own_to_global(cols_sa) +# ghost_to_global_col = ghost_to_global(cols_sa) +# for (i,_,_) in nziterator(A_ghost_own) +# owner = ghost_to_owner_row[i] +# ptrs[owner_to_p[owner]+1] += 1 +# end +# for (i,_,_) in nziterator(A_ghost_ghost) +# owner = ghost_to_owner_row[i] +# ptrs[owner_to_p[owner]+1] += 1 +# end +# length_to_ptrs!(ptrs) +# Tv = eltype(A_ghost_own) +# ndata = ptrs[end]-1 +# I_snd_data = zeros(Int,ndata) +# J_snd_data = zeros(Int,ndata) +# V_snd_data = zeros(Tv,ndata) +# k_snd_data = zeros(Int32,ndata) +# nnz_ghost_own = 0 +# for (k,(i,j,v)) in enumerate(nziterator(A_ghost_own)) +# owner = ghost_to_owner_row[i] +# p = ptrs[owner_to_p[owner]] +# I_snd_data[p] = ghost_to_global_row[i] +# J_snd_data[p] = own_to_global_col[j] +# V_snd_data[p] = v +# k_snd_data[p] = k +# ptrs[owner_to_p[owner]] += 1 +# nnz_ghost_own += 1 +# end +# for (k,(i,j,v)) in enumerate(nziterator(A_ghost_ghost)) +# owner = ghost_to_owner_row[i] +# p = ptrs[owner_to_p[owner]] +# I_snd_data[p] = ghost_to_global_row[i] +# 
J_snd_data[p] = ghost_to_global_col[j] +# V_snd_data[p] = v +# k_snd_data[p] = k+nnz_ghost_own +# ptrs[owner_to_p[owner]] += 1 +# end +# rewind_ptrs!(ptrs) +# I_snd = JaggedArray(I_snd_data,ptrs) +# J_snd = JaggedArray(J_snd_data,ptrs) +# V_snd = JaggedArray(V_snd_data,ptrs) +# k_snd = JaggedArray(k_snd_data,ptrs) +# (;I_snd,J_snd,V_snd,k_snd,parts_snd) +# end +# function setup_cache_rcv(I_rcv,J_rcv,V_rcv,parts_rcv) +# k_rcv_data = zeros(Int32,length(I_rcv.data)) +# k_rcv = JaggedArray(k_rcv_data,I_rcv.ptrs) +# (;I_rcv,J_rcv,V_rcv,k_rcv,parts_rcv) +# end +# function setup_own_triplets(A,cache_rcv,rows_sa,cols_sa) +# nz_own_own = findnz(A.blocks.own_own) +# nz_own_ghost = findnz(A.blocks.own_ghost) +# I_rcv_data = cache_rcv.I_rcv.data +# J_rcv_data = cache_rcv.J_rcv.data +# V_rcv_data = cache_rcv.V_rcv.data +# k_rcv_data = cache_rcv.k_rcv.data +# global_to_own_col = global_to_own(cols_sa) +# is_ghost = findall(j->global_to_own_col[j]==0,J_rcv_data) +# is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data) +# I_rcv_own = view(I_rcv_data,is_own) +# J_rcv_own = view(J_rcv_data,is_own) +# V_rcv_own = view(V_rcv_data,is_own) +# k_rcv_own = view(k_rcv_data,is_own) +# I_rcv_ghost = view(I_rcv_data,is_ghost) +# J_rcv_ghost = view(J_rcv_data,is_ghost) +# V_rcv_ghost = view(V_rcv_data,is_ghost) +# k_rcv_ghost = view(k_rcv_data,is_ghost) +# # After this col ids in own_ghost triplet remain global +# map_global_to_own!(I_rcv_own,rows_sa) +# map_global_to_own!(J_rcv_own,cols_sa) +# map_global_to_own!(I_rcv_ghost,rows_sa) +# map_ghost_to_global!(nz_own_ghost[2],cols_sa) +# own_own_I = vcat(nz_own_own[1],I_rcv_own) +# own_own_J = vcat(nz_own_own[2],J_rcv_own) +# own_own_V = vcat(nz_own_own[3],V_rcv_own) +# own_own_triplet = (own_own_I,own_own_J,own_own_V) +# own_ghost_I = vcat(nz_own_ghost[1],I_rcv_ghost) +# own_ghost_J = vcat(nz_own_ghost[2],J_rcv_ghost) +# own_ghost_V = vcat(nz_own_ghost[3],V_rcv_ghost) +# map_global_to_ghost!(nz_own_ghost[2],cols_sa) +# own_ghost_triplet = 
(own_ghost_I,own_ghost_J,own_ghost_V) +# triplets = (own_own_triplet,own_ghost_triplet) +# aux = (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) +# triplets, own_ghost_J, aux +# end +# function finalize_values(A,rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) +# (own_own_triplet,own_ghost_triplet) = triplets +# (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) = aux +# map_global_to_ghost!(own_ghost_triplet[2],cols_fa) +# map_global_to_ghost!(J_rcv_ghost,cols_fa) +# TA = typeof(A.blocks.own_own) +# n_own_rows = own_length(rows_fa) +# n_own_cols = own_length(cols_fa) +# n_ghost_rows = ghost_length(rows_fa) +# n_ghost_cols = ghost_length(cols_fa) +# Ti = indextype(A.blocks.own_own) +# Tv = eltype(A.blocks.own_own) +# own_own = compresscoo(TA,own_own_triplet...,n_own_rows,n_own_cols) +# own_ghost = compresscoo(TA,own_ghost_triplet...,n_own_rows,n_ghost_cols) +# ghost_own = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_own_cols) +# ghost_ghost = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_ghost_cols) +# blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) +# values = split_matrix(blocks,local_permutation(rows_fa),local_permutation(cols_fa)) +# nnz_own_own = nnz(own_own) +# k_own_sa = precompute_nzindex(own_own,own_own_triplet[1:2]...) +# k_ghost_sa = precompute_nzindex(own_ghost,own_ghost_triplet[1:2]...) +# for p in 1:length(I_rcv_own) +# i = I_rcv_own[p] +# j = J_rcv_own[p] +# k_rcv_own[p] = nzindex(own_own,i,j) +# end +# for p in 1:length(I_rcv_ghost) +# i = I_rcv_ghost[p] +# j = J_rcv_ghost[p] +# k_rcv_ghost[p] = nzindex(own_ghost,i,j) + nnz_own_own +# end +# cache = (;k_own_sa,k_ghost_sa,cache_snd...,cache_rcv...) 
+# values, cache +# end +# rows_sa = partition(axes(A,1)) +# cols_sa = partition(axes(A,2)) +# #rows = map(remove_ghost,rows_sa) +# cols = map(remove_ghost,cols_sa) +# parts_snd, parts_rcv = assembly_neighbors(rows_sa) +# cache_snd = map(setup_cache_snd,partition(A),parts_snd,rows_sa,cols_sa) +# I_snd = map(i->i.I_snd,cache_snd) +# J_snd = map(i->i.J_snd,cache_snd) +# V_snd = map(i->i.V_snd,cache_snd) +# graph = ExchangeGraph(parts_snd,parts_rcv) +# t_I = exchange(I_snd,graph) +# t_J = exchange(J_snd,graph) +# t_V = exchange(V_snd,graph) +# @fake_async begin +# I_rcv = fetch(t_I) +# J_rcv = fetch(t_J) +# V_rcv = fetch(t_V) +# cache_rcv = map(setup_cache_rcv,I_rcv,J_rcv,V_rcv,parts_rcv) +# triplets,J,aux = map(setup_own_triplets,partition(A),cache_rcv,rows_sa,cols_sa) |> tuple_of_arrays +# J_owner = find_owner(cols_sa,J) +# rows_fa = rows +# cols_fa = map(union_ghost,cols,J,J_owner) +# assembly_neighbors(cols_fa;assembly_neighbors_options_cols...) +# vals_fa, cache = map(finalize_values,partition(A),rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) |> tuple_of_arrays +# assembled = true +# B = PSparseMatrix(vals_fa,rows_fa,cols_fa,assembled) +# if val_parameter(reuse) == false +# B +# else +# B, cache +# end +# end +# end + +# New assemble +#################### + +function setup_cache_snd(A,parts_snd,rows_sa,cols_sa) + A_ghost_own = A.blocks.ghost_own + A_ghost_ghost = A.blocks.ghost_ghost + gen = ( owner=>i for (i,owner) in enumerate(parts_snd) ) + owner_to_p = Dict(gen) + ptrs = zeros(Int32,length(parts_snd)+1) + ghost_to_owner_row = ghost_to_owner(rows_sa) + ghost_to_global_row = ghost_to_global(rows_sa) + own_to_global_col = own_to_global(cols_sa) + ghost_to_global_col = ghost_to_global(cols_sa) + for (i,_,_) in nziterator(A_ghost_own) + owner = ghost_to_owner_row[i] + ptrs[owner_to_p[owner]+1] += 1 + end + for (i,_,_) in nziterator(A_ghost_ghost) + owner = ghost_to_owner_row[i] + ptrs[owner_to_p[owner]+1] += 1 + end + length_to_ptrs!(ptrs) + Tv = 
eltype(A_ghost_own) + ndata = ptrs[end]-1 + I_snd_data = zeros(Int,ndata) + J_snd_data = zeros(Int,ndata) + V_snd_data = zeros(Tv,ndata) + k_snd_data = zeros(Int32,ndata) + nnz_ghost_own = 0 + for (k,(i,j,v)) in enumerate(nziterator(A_ghost_own)) + owner = ghost_to_owner_row[i] + p = ptrs[owner_to_p[owner]] + I_snd_data[p] = ghost_to_global_row[i] + J_snd_data[p] = own_to_global_col[j] + V_snd_data[p] = v + k_snd_data[p] = k + ptrs[owner_to_p[owner]] += 1 + nnz_ghost_own += 1 + end + for (k,(i,j,v)) in enumerate(nziterator(A_ghost_ghost)) + owner = ghost_to_owner_row[i] + p = ptrs[owner_to_p[owner]] + I_snd_data[p] = ghost_to_global_row[i] + J_snd_data[p] = ghost_to_global_col[j] + V_snd_data[p] = v + k_snd_data[p] = k+nnz_ghost_own + ptrs[owner_to_p[owner]] += 1 + end + rewind_ptrs!(ptrs) + I_snd = JaggedArray(I_snd_data,ptrs) + J_snd = JaggedArray(J_snd_data,ptrs) + V_snd = JaggedArray(V_snd_data,ptrs) + k_snd = JaggedArray(k_snd_data,ptrs) + (;I_snd,J_snd,V_snd,k_snd,parts_snd) +end + +function setup_cache_rcv(I_rcv,J_rcv,V_rcv,parts_rcv) + k_rcv_data = zeros(Int32,length(I_rcv.data)) + k_rcv = JaggedArray(k_rcv_data,I_rcv.ptrs) + (;I_rcv,J_rcv,V_rcv,k_rcv,parts_rcv) +end + +function setup_own_triplets(A,cache_rcv,rows_sa,cols_sa) + nz_own_own = findnz(A.blocks.own_own) + nz_own_ghost = findnz(A.blocks.own_ghost) + I_rcv_data = cache_rcv.I_rcv.data + J_rcv_data = cache_rcv.J_rcv.data + V_rcv_data = cache_rcv.V_rcv.data + k_rcv_data = cache_rcv.k_rcv.data + global_to_own_col = global_to_own(cols_sa) + is_ghost = findall(j->global_to_own_col[j]==0,J_rcv_data) + is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data) + I_rcv_own = view(I_rcv_data,is_own) + J_rcv_own = view(J_rcv_data,is_own) + V_rcv_own = view(V_rcv_data,is_own) + k_rcv_own = view(k_rcv_data,is_own) + I_rcv_ghost = view(I_rcv_data,is_ghost) + J_rcv_ghost = view(J_rcv_data,is_ghost) + V_rcv_ghost = view(V_rcv_data,is_ghost) + k_rcv_ghost = view(k_rcv_data,is_ghost) + # After this col ids in 
own_ghost triplet remain global + map_global_to_own!(I_rcv_own,rows_sa) + map_global_to_own!(J_rcv_own,cols_sa) + map_global_to_own!(I_rcv_ghost,rows_sa) + map_ghost_to_global!(nz_own_ghost[2],cols_sa) + own_own_I = vcat(nz_own_own[1],I_rcv_own) + own_own_J = vcat(nz_own_own[2],J_rcv_own) + own_own_V = vcat(nz_own_own[3],V_rcv_own) + own_own_triplet = (own_own_I,own_own_J,own_own_V) + own_ghost_I = vcat(nz_own_ghost[1],I_rcv_ghost) + own_ghost_J = vcat(nz_own_ghost[2],J_rcv_ghost) + own_ghost_V = vcat(nz_own_ghost[3],V_rcv_ghost) + map_global_to_ghost!(nz_own_ghost[2],cols_sa) + own_ghost_triplet = (own_ghost_I,own_ghost_J,own_ghost_V) + triplets = (own_own_triplet,own_ghost_triplet) + aux = (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) + triplets, own_ghost_J, aux +end + +function finalize_values(A,rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) + (own_own_triplet,own_ghost_triplet) = triplets + (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) = aux + map_global_to_ghost!(own_ghost_triplet[2],cols_fa) + map_global_to_ghost!(J_rcv_ghost,cols_fa) + TA = typeof(A.blocks.own_own) + n_own_rows = own_length(rows_fa) + n_own_cols = own_length(cols_fa) + n_ghost_rows = ghost_length(rows_fa) + n_ghost_cols = ghost_length(cols_fa) + Ti = indextype(A.blocks.own_own) + Tv = eltype(A.blocks.own_own) + own_own = compresscoo(TA,own_own_triplet...,n_own_rows,n_own_cols) + own_ghost = compresscoo(TA,own_ghost_triplet...,n_own_rows,n_ghost_cols) + ghost_own = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_own_cols) + ghost_ghost = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_ghost_cols) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + values = split_matrix(blocks,local_permutation(rows_fa),local_permutation(cols_fa)) + nnz_own_own = nnz(own_own) + k_own_sa = precompute_nzindex(own_own,own_own_triplet[1:2]...) 
+ k_ghost_sa = precompute_nzindex(own_ghost,own_ghost_triplet[1:2]...) + for p in 1:length(I_rcv_own) + i = I_rcv_own[p] + j = J_rcv_own[p] + k_rcv_own[p] = nzindex(own_own,i,j) + end + for p in 1:length(I_rcv_ghost) + i = I_rcv_ghost[p] + j = J_rcv_ghost[p] + k_rcv_ghost[p] = nzindex(own_ghost,i,j) + nnz_own_own end + cache = (;k_own_sa,k_ghost_sa,cache_snd...,cache_rcv...) + values, cache +end + +function PartitionedArrays.psparse_assemble_impl( + A, + ::Type{<:AbstractSplitMatrix}, + rows; + reuse=Val(false), + assembly_neighbors_options_cols=(;)) + + rows_sa = partition(axes(A,1)) cols_sa = partition(axes(A,2)) - #rows = map(remove_ghost,rows_sa) cols = map(remove_ghost,cols_sa) parts_snd, parts_rcv = assembly_neighbors(rows_sa) cache_snd = map(setup_cache_snd,partition(A),parts_snd,rows_sa,cols_sa) + I_snd = map(i->i.I_snd,cache_snd) J_snd = map(i->i.J_snd,cache_snd) V_snd = map(i->i.V_snd,cache_snd) @@ -1525,6 +1701,9 @@ function psparse_assemble_impl( end end +# End new assemble +#################### + function psparse_assemble_impl!(B,A,::Type,cache) error("case not implemented") end @@ -1603,136 +1782,306 @@ function consistent!(B::PSparseMatrix,A::PSparseMatrix,cache) psparse_consistent_impl!(B,A,T,cache) end -function psparse_consistent_impl( - A, - ::Type{<:AbstractSplitMatrix}, - rows_co; - reuse=Val(false)) - - function setup_snd(A,parts_snd,lids_snd,rows_co,cols_fa) - own_to_local_row = own_to_local(rows_co) - own_to_global_row = own_to_global(rows_co) - own_to_global_col = own_to_global(cols_fa) - ghost_to_global_col = ghost_to_global(cols_fa) - nl = size(A,1) - li_to_ps_ptrs = zeros(Int32,nl+1) - for p in 1:length(lids_snd) - for li in lids_snd[p] - li_to_ps_ptrs[li+1] += 1 - end +# function psparse_consistent_impl( +# A, +# ::Type{<:AbstractSplitMatrix}, +# rows_co; +# reuse=Val(false)) + +# function setup_snd(A,parts_snd,lids_snd,rows_co,cols_fa) +# own_to_local_row = own_to_local(rows_co) +# own_to_global_row = own_to_global(rows_co) +# 
own_to_global_col = own_to_global(cols_fa) +# ghost_to_global_col = ghost_to_global(cols_fa) +# nl = size(A,1) +# li_to_ps_ptrs = zeros(Int32,nl+1) +# for p in 1:length(lids_snd) +# for li in lids_snd[p] +# li_to_ps_ptrs[li+1] += 1 +# end +# end +# length_to_ptrs!(li_to_ps_ptrs) +# ndata = li_to_ps_ptrs[end]-1 +# li_to_ps_data = zeros(Int32,ndata) +# for p in 1:length(lids_snd) +# for li in lids_snd[p] +# q = li_to_ps_ptrs[li] +# li_to_ps_data[q] = p +# li_to_ps_ptrs[li] = q + 1 +# end +# end +# rewind_ptrs!(li_to_ps_ptrs) +# li_to_ps = JaggedArray(li_to_ps_data,li_to_ps_ptrs) +# ptrs = zeros(Int32,length(parts_snd)+1) +# for (i,j,v) in nziterator(A.blocks.own_own) +# li = own_to_local_row[i] +# for p in li_to_ps[li] +# ptrs[p+1] += 1 +# end +# end +# for (i,j,v) in nziterator(A.blocks.own_ghost) +# li = own_to_local_row[i] +# for p in li_to_ps[li] +# ptrs[p+1] += 1 +# end +# end +# length_to_ptrs!(ptrs) +# ndata = ptrs[end]-1 +# T = eltype(A) +# I_snd = JaggedArray(zeros(Int,ndata),ptrs) +# J_snd = JaggedArray(zeros(Int,ndata),ptrs) +# V_snd = JaggedArray(zeros(T,ndata),ptrs) +# k_snd = JaggedArray(zeros(Int32,ndata),ptrs) +# for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_own)) +# li = own_to_local_row[i] +# for p in li_to_ps[li] +# q = ptrs[p] +# I_snd.data[q] = own_to_global_row[i] +# J_snd.data[q] = own_to_global_col[j] +# V_snd.data[q] = v +# k_snd.data[q] = k +# ptrs[p] += 1 +# end +# end +# nnz_own_own = nnz(A.blocks.own_own) +# for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_ghost)) +# li = own_to_local_row[i] +# for p in li_to_ps[li] +# q = ptrs[p] +# I_snd.data[q] = own_to_global_row[i] +# J_snd.data[q] = ghost_to_global_col[j] +# V_snd.data[q] = v +# k_snd.data[q] = k+nnz_own_own +# ptrs[p] += 1 +# end +# end +# rewind_ptrs!(ptrs) +# cache_snd = (;parts_snd,lids_snd,I_snd,J_snd,V_snd,k_snd) +# cache_snd +# end +# function setup_rcv(parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) +# cache_rcv = (;parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) +# cache_rcv +# end 
+# function finalize(A,cache_snd,cache_rcv,rows_co,cols_fa,cols_co) +# I_rcv_data = cache_rcv.I_rcv.data +# J_rcv_data = cache_rcv.J_rcv.data +# V_rcv_data = cache_rcv.V_rcv.data +# global_to_own_col = global_to_own(cols_co) +# global_to_ghost_col = global_to_ghost(cols_co) +# is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data) +# is_ghost = findall(j->global_to_ghost_col[j]!=0,J_rcv_data) +# I_rcv_own = I_rcv_data[is_own] +# J_rcv_own = J_rcv_data[is_own] +# V_rcv_own = V_rcv_data[is_own] +# I_rcv_ghost = I_rcv_data[is_ghost] +# J_rcv_ghost = J_rcv_data[is_ghost] +# V_rcv_ghost = V_rcv_data[is_ghost] +# map_global_to_ghost!(I_rcv_own,rows_co) +# map_global_to_ghost!(I_rcv_ghost,rows_co) +# map_global_to_own!(J_rcv_own,cols_co) +# map_global_to_ghost!(J_rcv_ghost,cols_co) +# I2,J2,V2 = findnz(A.blocks.own_ghost) +# map_ghost_to_global!(J2,cols_fa) +# map_global_to_ghost!(J2,cols_co) +# n_own_rows = own_length(rows_co) +# n_ghost_rows = ghost_length(rows_co) +# n_own_cols = own_length(cols_co) +# n_ghost_cols = ghost_length(cols_co) +# TA = typeof(A.blocks.ghost_own) +# own_own = A.blocks.own_own +# own_ghost = compresscoo(TA,I2,J2,V2,n_own_rows,n_ghost_cols) # TODO this can be improved +# ghost_own = compresscoo(TA,I_rcv_own,J_rcv_own,V_rcv_own,n_ghost_rows,n_own_cols) +# ghost_ghost = compresscoo(TA,I_rcv_ghost,J_rcv_ghost,V_rcv_ghost,n_ghost_rows,n_ghost_cols) +# K_own = precompute_nzindex(ghost_own,I_rcv_own,J_rcv_own) +# K_ghost = precompute_nzindex(ghost_ghost,I_rcv_ghost,J_rcv_ghost) +# blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) +# values = split_matrix(blocks,local_permutation(rows_co),local_permutation(cols_co)) +# k_snd = cache_snd.k_snd +# V_snd = cache_snd.V_snd +# V_rcv = cache_rcv.V_rcv +# parts_snd = cache_snd.parts_snd +# parts_rcv = cache_rcv.parts_rcv +# cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,is_ghost,is_own,V_rcv_own,V_rcv_ghost,K_own,K_ghost) +# values,cache +# end +# @assert 
matching_own_indices(axes(A,1),PRange(rows_co)) +# rows_fa = partition(axes(A,1)) +# cols_fa = partition(axes(A,2)) +# # snd and rcv are swapped on purpose +# parts_rcv,parts_snd = assembly_neighbors(rows_co) +# lids_rcv,lids_snd = assembly_local_indices(rows_co) +# cache_snd = map(setup_snd,partition(A),parts_snd,lids_snd,rows_co,cols_fa) +# I_snd = map(i->i.I_snd,cache_snd) +# J_snd = map(i->i.J_snd,cache_snd) +# V_snd = map(i->i.V_snd,cache_snd) +# graph = ExchangeGraph(parts_snd,parts_rcv) +# t_I = exchange(I_snd,graph) +# t_J = exchange(J_snd,graph) +# t_V = exchange(V_snd,graph) +# @fake_async begin +# I_rcv = fetch(t_I) +# J_rcv = fetch(t_J) +# V_rcv = fetch(t_V) +# J_rcv_data = map(x->x.data,J_rcv) +# J_rcv_owner = find_owner(cols_fa,J_rcv_data) +# cols_co = map(union_ghost,cols_fa,J_rcv_data,J_rcv_owner) +# cache_rcv = map(setup_rcv,parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) +# values,cache = map(finalize,partition(A),cache_snd,cache_rcv,rows_co,cols_fa,cols_co) |> tuple_of_arrays +# B = PSparseMatrix(values,rows_co,cols_co,A.assembled) +# if val_parameter(reuse) == false +# B +# else +# B,cache +# end +# end +# end + +# New consistent +#################### + +function consistent_setup_snd(A,parts_snd,lids_snd,rows_co,cols_fa) + own_to_local_row::UnitRange{Int32} = own_to_local(rows_co) + own_to_global_row = own_to_global(rows_co) + own_to_global_col = own_to_global(cols_fa) + ghost_to_global_col = ghost_to_global(cols_fa) + nl = size(A,1) + li_to_ps_ptrs = zeros(Int32,nl+1) + for p in 1:length(lids_snd) + for li_ptr in jagged_range(lids_snd,p) + li = lids_snd.data[li_ptr] + li_to_ps_ptrs[li+1] += 1 end - length_to_ptrs!(li_to_ps_ptrs) - ndata = li_to_ps_ptrs[end]-1 - li_to_ps_data = zeros(Int32,ndata) - for p in 1:length(lids_snd) - for li in lids_snd[p] - q = li_to_ps_ptrs[li] - li_to_ps_data[q] = p - li_to_ps_ptrs[li] = q + 1 - end + end + length_to_ptrs!(li_to_ps_ptrs) + ndata = li_to_ps_ptrs[end]-1 + li_to_ps_data = zeros(Int32,ndata) + for p in 
1:length(lids_snd) + for li_ptr in jagged_range(lids_snd,p) + li = lids_snd.data[li_ptr] + q = li_to_ps_ptrs[li] + li_to_ps_data[q] = p + li_to_ps_ptrs[li] = q + 1 end - rewind_ptrs!(li_to_ps_ptrs) - li_to_ps = JaggedArray(li_to_ps_data,li_to_ps_ptrs) - ptrs = zeros(Int32,length(parts_snd)+1) - for (i,j,v) in nziterator(A.blocks.own_own) - li = own_to_local_row[i] - for p in li_to_ps[li] - ptrs[p+1] += 1 - end + end + + rewind_ptrs!(li_to_ps_ptrs) + li_to_ps = JaggedArray(li_to_ps_data,li_to_ps_ptrs) + ptrs = zeros(Int32,length(parts_snd)+1) + for (i,j,v) in nziterator(A.blocks.own_own) + # @show(typeof(own_to_local_row)) + li = own_to_local_row[i] + for li_ptr in jagged_range(li_to_ps,li) + p = li_to_ps.data[li_ptr] + ptrs[p+1] += 1 end - for (i,j,v) in nziterator(A.blocks.own_ghost) - li = own_to_local_row[i] - for p in li_to_ps[li] - ptrs[p+1] += 1 - end + end + + for (i,j,v) in nziterator(A.blocks.own_ghost) + li = own_to_local_row[i] + for ptr in jagged_range(li_to_ps,li) + p=li_to_ps.data[ptr] + ptrs[p+1] += 1 end - length_to_ptrs!(ptrs) - ndata = ptrs[end]-1 - T = eltype(A) - I_snd = JaggedArray(zeros(Int,ndata),ptrs) - J_snd = JaggedArray(zeros(Int,ndata),ptrs) - V_snd = JaggedArray(zeros(T,ndata),ptrs) - k_snd = JaggedArray(zeros(Int32,ndata),ptrs) - for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_own)) - li = own_to_local_row[i] - for p in li_to_ps[li] - q = ptrs[p] - I_snd.data[q] = own_to_global_row[i] - J_snd.data[q] = own_to_global_col[j] - V_snd.data[q] = v - k_snd.data[q] = k - ptrs[p] += 1 - end + end + length_to_ptrs!(ptrs) + ndata = ptrs[end]-1 + T = eltype(A) + I_snd = JaggedArray(zeros(Int,ndata),ptrs) + J_snd = JaggedArray(zeros(Int,ndata),ptrs) + V_snd = JaggedArray(zeros(T,ndata),ptrs) + k_snd = JaggedArray(zeros(Int32,ndata),ptrs) + for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_own)) + li = own_to_local_row[i] + for p_ptr in jagged_range(li_to_ps,li) + p = li_to_ps.data[p_ptr] + q = ptrs[p] + I_snd.data[q] = own_to_global_row[i] 
+ J_snd.data[q] = own_to_global_col[j] + V_snd.data[q] = v + k_snd.data[q] = k + ptrs[p] += 1 end - nnz_own_own = nnz(A.blocks.own_own) - for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_ghost)) - li = own_to_local_row[i] - for p in li_to_ps[li] - q = ptrs[p] - I_snd.data[q] = own_to_global_row[i] - J_snd.data[q] = ghost_to_global_col[j] - V_snd.data[q] = v - k_snd.data[q] = k+nnz_own_own - ptrs[p] += 1 - end + end + + nnz_own_own = nnz(A.blocks.own_own) + for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_ghost)) + li = own_to_local_row[i] + for p_ptr in jagged_range(li_to_ps,li) + p=li_to_ps.data[p_ptr] + q = ptrs[p] + I_snd.data[q] = own_to_global_row[i] + J_snd.data[q] = ghost_to_global_col[j] + V_snd.data[q] = v + k_snd.data[q] = k+nnz_own_own + ptrs[p] += 1 end - rewind_ptrs!(ptrs) - cache_snd = (;parts_snd,lids_snd,I_snd,J_snd,V_snd,k_snd) - cache_snd - end - function setup_rcv(parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) - cache_rcv = (;parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) - cache_rcv - end - function finalize(A,cache_snd,cache_rcv,rows_co,cols_fa,cols_co) - I_rcv_data = cache_rcv.I_rcv.data - J_rcv_data = cache_rcv.J_rcv.data - V_rcv_data = cache_rcv.V_rcv.data - global_to_own_col = global_to_own(cols_co) - global_to_ghost_col = global_to_ghost(cols_co) - is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data) - is_ghost = findall(j->global_to_ghost_col[j]!=0,J_rcv_data) - I_rcv_own = I_rcv_data[is_own] - J_rcv_own = J_rcv_data[is_own] - V_rcv_own = V_rcv_data[is_own] - I_rcv_ghost = I_rcv_data[is_ghost] - J_rcv_ghost = J_rcv_data[is_ghost] - V_rcv_ghost = V_rcv_data[is_ghost] - map_global_to_ghost!(I_rcv_own,rows_co) - map_global_to_ghost!(I_rcv_ghost,rows_co) - map_global_to_own!(J_rcv_own,cols_co) - map_global_to_ghost!(J_rcv_ghost,cols_co) - I2,J2,V2 = findnz(A.blocks.own_ghost) - map_ghost_to_global!(J2,cols_fa) - map_global_to_ghost!(J2,cols_co) - n_own_rows = own_length(rows_co) - n_ghost_rows = ghost_length(rows_co) - n_own_cols = 
own_length(cols_co) - n_ghost_cols = ghost_length(cols_co) - TA = typeof(A.blocks.ghost_own) - own_own = A.blocks.own_own - own_ghost = compresscoo(TA,I2,J2,V2,n_own_rows,n_ghost_cols) # TODO this can be improved - ghost_own = compresscoo(TA,I_rcv_own,J_rcv_own,V_rcv_own,n_ghost_rows,n_own_cols) - ghost_ghost = compresscoo(TA,I_rcv_ghost,J_rcv_ghost,V_rcv_ghost,n_ghost_rows,n_ghost_cols) - K_own = precompute_nzindex(ghost_own,I_rcv_own,J_rcv_own) - K_ghost = precompute_nzindex(ghost_ghost,I_rcv_ghost,J_rcv_ghost) - blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) - values = split_matrix(blocks,local_permutation(rows_co),local_permutation(cols_co)) - k_snd = cache_snd.k_snd - V_snd = cache_snd.V_snd - V_rcv = cache_rcv.V_rcv - parts_snd = cache_snd.parts_snd - parts_rcv = cache_rcv.parts_rcv - cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,is_ghost,is_own,V_rcv_own,V_rcv_ghost,K_own,K_ghost) - values,cache end + rewind_ptrs!(ptrs) + cache_snd = (;parts_snd,lids_snd,I_snd,J_snd,V_snd,k_snd) + cache_snd +end + +function consistent_setup_rcv(parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) + cache_rcv = (;parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) + cache_rcv +end + +function consistent_finalize(A,cache_snd,cache_rcv,rows_co,cols_fa,cols_co) + I_rcv_data = cache_rcv.I_rcv.data + J_rcv_data = cache_rcv.J_rcv.data + V_rcv_data = cache_rcv.V_rcv.data + global_to_own_col = global_to_own(cols_co) + global_to_ghost_col = global_to_ghost(cols_co) + is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data) + is_ghost = findall(j->global_to_ghost_col[j]!=0,J_rcv_data) + I_rcv_own = I_rcv_data[is_own] + J_rcv_own = J_rcv_data[is_own] + V_rcv_own = V_rcv_data[is_own] + I_rcv_ghost = I_rcv_data[is_ghost] + J_rcv_ghost = J_rcv_data[is_ghost] + V_rcv_ghost = V_rcv_data[is_ghost] + map_global_to_ghost!(I_rcv_own,rows_co) + map_global_to_ghost!(I_rcv_ghost,rows_co) + map_global_to_own!(J_rcv_own,cols_co) + map_global_to_ghost!(J_rcv_ghost,cols_co) + I2,J2,V2 = 
findnz(A.blocks.own_ghost) + map_ghost_to_global!(J2,cols_fa) + map_global_to_ghost!(J2,cols_co) + n_own_rows = own_length(rows_co) + n_ghost_rows = ghost_length(rows_co) + n_own_cols = own_length(cols_co) + n_ghost_cols = ghost_length(cols_co) + TA = typeof(A.blocks.ghost_own) + own_own = A.blocks.own_own + own_ghost = compresscoo(TA,I2,J2,V2,n_own_rows,n_ghost_cols) # TODO this can be improved + ghost_own = compresscoo(TA,I_rcv_own,J_rcv_own,V_rcv_own,n_ghost_rows,n_own_cols) + ghost_ghost = compresscoo(TA,I_rcv_ghost,J_rcv_ghost,V_rcv_ghost,n_ghost_rows,n_ghost_cols) + K_own = precompute_nzindex(ghost_own,I_rcv_own,J_rcv_own) + K_ghost = precompute_nzindex(ghost_ghost,I_rcv_ghost,J_rcv_ghost) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + values = split_matrix(blocks,local_permutation(rows_co),local_permutation(cols_co)) + k_snd = cache_snd.k_snd + V_snd = cache_snd.V_snd + V_rcv = cache_rcv.V_rcv + parts_snd = cache_snd.parts_snd + parts_rcv = cache_rcv.parts_rcv + cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,is_ghost,is_own,V_rcv_own,V_rcv_ghost,K_own,K_ghost) + values,cache +end + +function psparse_consistent_impl( + A, + ::Type{<:AbstractSplitMatrix}, + rows_co; + reuse=Val(false)) @assert matching_own_indices(axes(A,1),PRange(rows_co)) rows_fa = partition(axes(A,1)) cols_fa = partition(axes(A,2)) # snd and rcv are swapped on purpose parts_rcv,parts_snd = assembly_neighbors(rows_co) lids_rcv,lids_snd = assembly_local_indices(rows_co) - cache_snd = map(setup_snd,partition(A),parts_snd,lids_snd,rows_co,cols_fa) + cache_snd = map(consistent_setup_snd,partition(A),parts_snd,lids_snd,rows_co,cols_fa) I_snd = map(i->i.I_snd,cache_snd) J_snd = map(i->i.J_snd,cache_snd) V_snd = map(i->i.V_snd,cache_snd) @@ -1747,8 +2096,8 @@ function psparse_consistent_impl( J_rcv_data = map(x->x.data,J_rcv) J_rcv_owner = find_owner(cols_fa,J_rcv_data) cols_co = map(union_ghost,cols_fa,J_rcv_data,J_rcv_owner) - cache_rcv = 
map(setup_rcv,parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) - values,cache = map(finalize,partition(A),cache_snd,cache_rcv,rows_co,cols_fa,cols_co) |> tuple_of_arrays + cache_rcv = map(consistent_setup_rcv,parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) + values,cache = map(consistent_finalize,partition(A),cache_snd,cache_rcv,rows_co,cols_fa,cols_co) |> tuple_of_arrays B = PSparseMatrix(values,rows_co,cols_co,A.assembled) if val_parameter(reuse) == false B @@ -1758,6 +2107,9 @@ function psparse_consistent_impl( end end +# End new consistent +#################### + function psparse_consistent_impl!(B,A,::Type{<:AbstractSplitMatrix},cache) function setup_snd(A,cache) k_snd_data = cache.k_snd.data From b36653ef812da029cb6b00224d12dc38938edfa2 Mon Sep 17 00:00:00 2001 From: jop611 Date: Fri, 9 Aug 2024 17:16:10 +0200 Subject: [PATCH 02/34] added relevant functions to export --- src/PartitionedArrays.jl | 1 + src/jagged_array.jl | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/PartitionedArrays.jl b/src/PartitionedArrays.jl index bec2f615..45ec6594 100644 --- a/src/PartitionedArrays.jl +++ b/src/PartitionedArrays.jl @@ -11,6 +11,7 @@ import Distances export length_to_ptrs! export rewind_ptrs! 
+export jagged_range export jagged_array export GenericJaggedArray export JaggedArray diff --git a/src/jagged_array.jl b/src/jagged_array.jl index a3d61789..4a58f48a 100644 --- a/src/jagged_array.jl +++ b/src/jagged_array.jl @@ -162,7 +162,6 @@ function jagged_range(a::Union{JaggedArray,GenericJaggedArray},i::Integer) pini:pend end - ########### Base.size(a::Union{JaggedArray,GenericJaggedArray}) = (length(a.ptrs)-1,) From 29fcd0afa3ee4f50f9338924f33d4aaf7b9eddee Mon Sep 17 00:00:00 2001 From: jop611 Date: Wed, 14 Aug 2024 16:05:25 +0200 Subject: [PATCH 03/34] minor optimization to consistent, including reduced cache size --- src/p_sparse_matrix.jl | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index d9561d6e..a2d12e44 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -1659,7 +1659,7 @@ function finalize_values(A,rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) values, cache end -function PartitionedArrays.psparse_assemble_impl( +function psparse_assemble_impl( A, ::Type{<:AbstractSplitMatrix}, rows; @@ -2033,11 +2033,12 @@ function consistent_finalize(A,cache_snd,cache_rcv,rows_co,cols_fa,cols_co) V_rcv_data = cache_rcv.V_rcv.data global_to_own_col = global_to_own(cols_co) global_to_ghost_col = global_to_ghost(cols_co) - is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data) - is_ghost = findall(j->global_to_ghost_col[j]!=0,J_rcv_data) + is_own_condition = k -> global_to_own_col[k]!=0 + is_own = is_own_condition.(J_rcv_data) I_rcv_own = I_rcv_data[is_own] J_rcv_own = J_rcv_data[is_own] V_rcv_own = V_rcv_data[is_own] + is_ghost = map!(!,is_own, is_own) # inverse is_own bitvector to effectively represent is_ghost mask I_rcv_ghost = I_rcv_data[is_ghost] J_rcv_ghost = J_rcv_data[is_ghost] V_rcv_ghost = V_rcv_data[is_ghost] @@ -2066,7 +2067,7 @@ function consistent_finalize(A,cache_snd,cache_rcv,rows_co,cols_fa,cols_co) V_rcv = cache_rcv.V_rcv parts_snd = 
cache_snd.parts_snd parts_rcv = cache_rcv.parts_rcv - cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,is_ghost,is_own,V_rcv_own,V_rcv_ghost,K_own,K_ghost) + cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,V_rcv_own,V_rcv_ghost,K_own,K_ghost) values,cache end @@ -2127,13 +2128,10 @@ function psparse_consistent_impl!(B,A,::Type{<:AbstractSplitMatrix},cache) end end function setup_rcv(B,cache) - is_ghost = cache.is_ghost - is_own = cache.is_own - V_rcv_data = cache.V_rcv.data K_own = cache.K_own K_ghost = cache.K_ghost - V_rcv_own = V_rcv_data[is_own] - V_rcv_ghost = V_rcv_data[is_ghost] + V_rcv_own = cache.V_rcv_own + V_rcv_ghost = cache.V_rcv_ghost setcoofast!(B.blocks.ghost_own,V_rcv_own,K_own) setcoofast!(B.blocks.ghost_ghost,V_rcv_ghost,K_ghost) B From a4e1960931aab50d8a22660fa4012f6ff5e25c5a Mon Sep 17 00:00:00 2001 From: jop611 Date: Thu, 15 Aug 2024 18:28:45 +0200 Subject: [PATCH 04/34] movig some functions back to inner scope --- src/p_sparse_matrix.jl | 827 +++++++++++++++++------------------------ 1 file changed, 345 insertions(+), 482 deletions(-) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index a2d12e44..b315e2af 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -1528,177 +1528,187 @@ end # New assemble #################### -function setup_cache_snd(A,parts_snd,rows_sa,cols_sa) - A_ghost_own = A.blocks.ghost_own - A_ghost_ghost = A.blocks.ghost_ghost - gen = ( owner=>i for (i,owner) in enumerate(parts_snd) ) - owner_to_p = Dict(gen) - ptrs = zeros(Int32,length(parts_snd)+1) - ghost_to_owner_row = ghost_to_owner(rows_sa) - ghost_to_global_row = ghost_to_global(rows_sa) - own_to_global_col = own_to_global(cols_sa) - ghost_to_global_col = ghost_to_global(cols_sa) - for (i,_,_) in nziterator(A_ghost_own) - owner = ghost_to_owner_row[i] - ptrs[owner_to_p[owner]+1] += 1 - end - for (i,_,_) in nziterator(A_ghost_ghost) - owner = ghost_to_owner_row[i] - ptrs[owner_to_p[owner]+1] += 1 - end - length_to_ptrs!(ptrs) - Tv = 
eltype(A_ghost_own) - ndata = ptrs[end]-1 - I_snd_data = zeros(Int,ndata) - J_snd_data = zeros(Int,ndata) - V_snd_data = zeros(Tv,ndata) - k_snd_data = zeros(Int32,ndata) - nnz_ghost_own = 0 - for (k,(i,j,v)) in enumerate(nziterator(A_ghost_own)) - owner = ghost_to_owner_row[i] - p = ptrs[owner_to_p[owner]] - I_snd_data[p] = ghost_to_global_row[i] - J_snd_data[p] = own_to_global_col[j] - V_snd_data[p] = v - k_snd_data[p] = k - ptrs[owner_to_p[owner]] += 1 - nnz_ghost_own += 1 - end - for (k,(i,j,v)) in enumerate(nziterator(A_ghost_ghost)) - owner = ghost_to_owner_row[i] - p = ptrs[owner_to_p[owner]] - I_snd_data[p] = ghost_to_global_row[i] - J_snd_data[p] = ghost_to_global_col[j] - V_snd_data[p] = v - k_snd_data[p] = k+nnz_ghost_own - ptrs[owner_to_p[owner]] += 1 - end - rewind_ptrs!(ptrs) - I_snd = JaggedArray(I_snd_data,ptrs) - J_snd = JaggedArray(J_snd_data,ptrs) - V_snd = JaggedArray(V_snd_data,ptrs) - k_snd = JaggedArray(k_snd_data,ptrs) - (;I_snd,J_snd,V_snd,k_snd,parts_snd) -end - -function setup_cache_rcv(I_rcv,J_rcv,V_rcv,parts_rcv) - k_rcv_data = zeros(Int32,length(I_rcv.data)) - k_rcv = JaggedArray(k_rcv_data,I_rcv.ptrs) - (;I_rcv,J_rcv,V_rcv,k_rcv,parts_rcv) -end - -function setup_own_triplets(A,cache_rcv,rows_sa,cols_sa) - nz_own_own = findnz(A.blocks.own_own) - nz_own_ghost = findnz(A.blocks.own_ghost) - I_rcv_data = cache_rcv.I_rcv.data - J_rcv_data = cache_rcv.J_rcv.data - V_rcv_data = cache_rcv.V_rcv.data - k_rcv_data = cache_rcv.k_rcv.data - global_to_own_col = global_to_own(cols_sa) - is_ghost = findall(j->global_to_own_col[j]==0,J_rcv_data) - is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data) - I_rcv_own = view(I_rcv_data,is_own) - J_rcv_own = view(J_rcv_data,is_own) - V_rcv_own = view(V_rcv_data,is_own) - k_rcv_own = view(k_rcv_data,is_own) - I_rcv_ghost = view(I_rcv_data,is_ghost) - J_rcv_ghost = view(J_rcv_data,is_ghost) - V_rcv_ghost = view(V_rcv_data,is_ghost) - k_rcv_ghost = view(k_rcv_data,is_ghost) - # After this col ids in 
own_ghost triplet remain global - map_global_to_own!(I_rcv_own,rows_sa) - map_global_to_own!(J_rcv_own,cols_sa) - map_global_to_own!(I_rcv_ghost,rows_sa) - map_ghost_to_global!(nz_own_ghost[2],cols_sa) - own_own_I = vcat(nz_own_own[1],I_rcv_own) - own_own_J = vcat(nz_own_own[2],J_rcv_own) - own_own_V = vcat(nz_own_own[3],V_rcv_own) - own_own_triplet = (own_own_I,own_own_J,own_own_V) - own_ghost_I = vcat(nz_own_ghost[1],I_rcv_ghost) - own_ghost_J = vcat(nz_own_ghost[2],J_rcv_ghost) - own_ghost_V = vcat(nz_own_ghost[3],V_rcv_ghost) - map_global_to_ghost!(nz_own_ghost[2],cols_sa) - own_ghost_triplet = (own_ghost_I,own_ghost_J,own_ghost_V) - triplets = (own_own_triplet,own_ghost_triplet) - aux = (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) - triplets, own_ghost_J, aux -end - -function finalize_values(A,rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) - (own_own_triplet,own_ghost_triplet) = triplets - (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) = aux - map_global_to_ghost!(own_ghost_triplet[2],cols_fa) - map_global_to_ghost!(J_rcv_ghost,cols_fa) - TA = typeof(A.blocks.own_own) - n_own_rows = own_length(rows_fa) - n_own_cols = own_length(cols_fa) - n_ghost_rows = ghost_length(rows_fa) - n_ghost_cols = ghost_length(cols_fa) - Ti = indextype(A.blocks.own_own) - Tv = eltype(A.blocks.own_own) - own_own = compresscoo(TA,own_own_triplet...,n_own_rows,n_own_cols) - own_ghost = compresscoo(TA,own_ghost_triplet...,n_own_rows,n_ghost_cols) - ghost_own = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_own_cols) - ghost_ghost = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_ghost_cols) - blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) - values = split_matrix(blocks,local_permutation(rows_fa),local_permutation(cols_fa)) - nnz_own_own = nnz(own_own) - k_own_sa = precompute_nzindex(own_own,own_own_triplet[1:2]...) 
- k_ghost_sa = precompute_nzindex(own_ghost,own_ghost_triplet[1:2]...) - for p in 1:length(I_rcv_own) - i = I_rcv_own[p] - j = J_rcv_own[p] - k_rcv_own[p] = nzindex(own_own,i,j) - end - for p in 1:length(I_rcv_ghost) - i = I_rcv_ghost[p] - j = J_rcv_ghost[p] - k_rcv_ghost[p] = nzindex(own_ghost,i,j) + nnz_own_own - end - cache = (;k_own_sa,k_ghost_sa,cache_snd...,cache_rcv...) - values, cache -end - function psparse_assemble_impl( - A, - ::Type{<:AbstractSplitMatrix}, - rows; - reuse=Val(false), - assembly_neighbors_options_cols=(;)) - - - rows_sa = partition(axes(A,1)) - cols_sa = partition(axes(A,2)) - cols = map(remove_ghost,cols_sa) - parts_snd, parts_rcv = assembly_neighbors(rows_sa) - cache_snd = map(setup_cache_snd,partition(A),parts_snd,rows_sa,cols_sa) - - I_snd = map(i->i.I_snd,cache_snd) - J_snd = map(i->i.J_snd,cache_snd) - V_snd = map(i->i.V_snd,cache_snd) - graph = ExchangeGraph(parts_snd,parts_rcv) - t_I = exchange(I_snd,graph) - t_J = exchange(J_snd,graph) - t_V = exchange(V_snd,graph) - @fake_async begin - I_rcv = fetch(t_I) - J_rcv = fetch(t_J) - V_rcv = fetch(t_V) - cache_rcv = map(setup_cache_rcv,I_rcv,J_rcv,V_rcv,parts_rcv) - triplets,J,aux = map(setup_own_triplets,partition(A),cache_rcv,rows_sa,cols_sa) |> tuple_of_arrays - J_owner = find_owner(cols_sa,J) - rows_fa = rows - cols_fa = map(union_ghost,cols,J,J_owner) - assembly_neighbors(cols_fa;assembly_neighbors_options_cols...) 
- vals_fa, cache = map(finalize_values,partition(A),rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) |> tuple_of_arrays - assembled = true - B = PSparseMatrix(vals_fa,rows_fa,cols_fa,assembled) - if val_parameter(reuse) == false - B - else - B, cache + A, + ::Type{T}, + rows; + reuse=Val(false), + assembly_neighbors_options_cols=(;)) where T<:AbstractSplitMatrix + + function setup_cache_snd(A,parts_snd,rows_sa,cols_sa) + A_ghost_own = A.blocks.ghost_own + A_ghost_ghost = A.blocks.ghost_ghost + gen = ( owner=>i for (i,owner) in enumerate(parts_snd) ) + owner_to_p = Dict(gen) + ptrs = zeros(Int32,length(parts_snd)+1) + ghost_to_owner_row = ghost_to_owner(rows_sa) + ghost_to_global_row = ghost_to_global(rows_sa) + own_to_global_col = own_to_global(cols_sa) + ghost_to_global_col = ghost_to_global(cols_sa) + for (i,_,_) in nziterator(A_ghost_own) + owner = ghost_to_owner_row[i] + ptrs[owner_to_p[owner]+1] += 1 + end + for (i,_,_) in nziterator(A_ghost_ghost) + owner = ghost_to_owner_row[i] + ptrs[owner_to_p[owner]+1] += 1 + end + length_to_ptrs!(ptrs) + Tv = eltype(A_ghost_own) + ndata = ptrs[end]-1 + I_snd_data = zeros(Int,ndata) + J_snd_data = zeros(Int,ndata) + V_snd_data = zeros(Tv,ndata) + k_snd_data = zeros(Int32,ndata) + nnz_ghost_own = 0 + for (k,(i,j,v)) in enumerate(nziterator(A_ghost_own)) + owner = ghost_to_owner_row[i] + p = ptrs[owner_to_p[owner]] + I_snd_data[p] = ghost_to_global_row[i] + J_snd_data[p] = own_to_global_col[j] + V_snd_data[p] = v + k_snd_data[p] = k + ptrs[owner_to_p[owner]] += 1 + nnz_ghost_own += 1 + end + for (k,(i,j,v)) in enumerate(nziterator(A_ghost_ghost)) + owner = ghost_to_owner_row[i] + p = ptrs[owner_to_p[owner]] + I_snd_data[p] = ghost_to_global_row[i] + J_snd_data[p] = ghost_to_global_col[j] + V_snd_data[p] = v + k_snd_data[p] = k+nnz_ghost_own + ptrs[owner_to_p[owner]] += 1 + end + rewind_ptrs!(ptrs) + I_snd = JaggedArray(I_snd_data,ptrs) + J_snd = JaggedArray(J_snd_data,ptrs) + V_snd = JaggedArray(V_snd_data,ptrs) + k_snd = 
JaggedArray(k_snd_data,ptrs) + (;I_snd,J_snd,V_snd,k_snd,parts_snd) + end + + function setup_cache_rcv(I_rcv,J_rcv,V_rcv,parts_rcv) + k_rcv_data = zeros(Int32,length(I_rcv.data)) + k_rcv = JaggedArray(k_rcv_data,I_rcv.ptrs) + (;I_rcv,J_rcv,V_rcv,k_rcv,parts_rcv) + end + + function setup_own_triplets(A,cache_rcv,rows_sa,cols_sa) + nz_own_own = findnz(A.blocks.own_own) + nz_own_ghost = findnz(A.blocks.own_ghost) + I_rcv_data = cache_rcv.I_rcv.data + J_rcv_data = cache_rcv.J_rcv.data + V_rcv_data = cache_rcv.V_rcv.data + k_rcv_data = cache_rcv.k_rcv.data + global_to_own_col = global_to_own(cols_sa) + is_ghost = findall(j->global_to_own_col[j]==0,J_rcv_data) + is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data) + I_rcv_own = view(I_rcv_data,is_own) + J_rcv_own = view(J_rcv_data,is_own) + V_rcv_own = view(V_rcv_data,is_own) + k_rcv_own = view(k_rcv_data,is_own) + I_rcv_ghost = view(I_rcv_data,is_ghost) + J_rcv_ghost = view(J_rcv_data,is_ghost) + V_rcv_ghost = view(V_rcv_data,is_ghost) + k_rcv_ghost = view(k_rcv_data,is_ghost) + # After this col ids in own_ghost triplet remain global + map_global_to_own!(I_rcv_own,rows_sa) + map_global_to_own!(J_rcv_own,cols_sa) + map_global_to_own!(I_rcv_ghost,rows_sa) + map_ghost_to_global!(nz_own_ghost[2],cols_sa) + own_own_I = vcat(nz_own_own[1],I_rcv_own) + own_own_J = vcat(nz_own_own[2],J_rcv_own) + own_own_V = vcat(nz_own_own[3],V_rcv_own) + own_own_triplet = (own_own_I,own_own_J,own_own_V) + own_ghost_I = vcat(nz_own_ghost[1],I_rcv_ghost) + own_ghost_J = vcat(nz_own_ghost[2],J_rcv_ghost) + own_ghost_V = vcat(nz_own_ghost[3],V_rcv_ghost) + map_global_to_ghost!(nz_own_ghost[2],cols_sa) + own_ghost_triplet = (own_ghost_I,own_ghost_J,own_ghost_V) + triplets = (own_own_triplet,own_ghost_triplet) + aux = (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) + triplets, own_ghost_J, aux + end + + function finalize_values(A,rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) + 
(own_own_triplet,own_ghost_triplet) = triplets + (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) = aux + map_global_to_ghost!(own_ghost_triplet[2],cols_fa) + map_global_to_ghost!(J_rcv_ghost,cols_fa) + TA = typeof(A.blocks.own_own) + n_own_rows = own_length(rows_fa) + n_own_cols = own_length(cols_fa) + n_ghost_rows = ghost_length(rows_fa) + n_ghost_cols = ghost_length(cols_fa) + Ti = indextype(A.blocks.own_own) + Tv = eltype(A.blocks.own_own) + own_own = compresscoo(TA,own_own_triplet...,n_own_rows,n_own_cols) + own_ghost = compresscoo(TA,own_ghost_triplet...,n_own_rows,n_ghost_cols) + ghost_own = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_own_cols) + ghost_ghost = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_ghost_cols) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + values = split_matrix(blocks,local_permutation(rows_fa),local_permutation(cols_fa)) + nnz_own_own = nnz(own_own) + k_own_sa = precompute_nzindex(own_own,own_own_triplet[1:2]...) + k_ghost_sa = precompute_nzindex(own_ghost,own_ghost_triplet[1:2]...) + for p in 1:length(I_rcv_own) + i = I_rcv_own[p] + j = J_rcv_own[p] + k_rcv_own[p] = nzindex(own_own,i,j) + end + for p in 1:length(I_rcv_ghost) + i = I_rcv_ghost[p] + j = J_rcv_ghost[p] + k_rcv_ghost[p] = nzindex(own_ghost,i,j) + nnz_own_own + end + cache = (;k_own_sa,k_ghost_sa,cache_snd...,cache_rcv...) 
+ values, cache + end + + function _psparse_assemble_impl( + A, + ::Type{<:AbstractSplitMatrix}, + rows; + reuse=Val(false), + assembly_neighbors_options_cols=(;)) + + + rows_sa = partition(axes(A,1)) + cols_sa = partition(axes(A,2)) + cols = map(remove_ghost,cols_sa) + parts_snd, parts_rcv = assembly_neighbors(rows_sa) + cache_snd = map(setup_cache_snd,partition(A),parts_snd,rows_sa,cols_sa) + + I_snd = map(i->i.I_snd,cache_snd) + J_snd = map(i->i.J_snd,cache_snd) + V_snd = map(i->i.V_snd,cache_snd) + graph = ExchangeGraph(parts_snd,parts_rcv) + t_I = exchange(I_snd,graph) + t_J = exchange(J_snd,graph) + t_V = exchange(V_snd,graph) + @fake_async begin + I_rcv = fetch(t_I) + J_rcv = fetch(t_J) + V_rcv = fetch(t_V) + cache_rcv = map(setup_cache_rcv,I_rcv,J_rcv,V_rcv,parts_rcv) + triplets,J,aux = map(setup_own_triplets,partition(A),cache_rcv,rows_sa,cols_sa) |> tuple_of_arrays + J_owner = find_owner(cols_sa,J) + rows_fa = rows + cols_fa = map(union_ghost,cols,J,J_owner) + assembly_neighbors(cols_fa;assembly_neighbors_options_cols...) 
+ vals_fa, cache = map(finalize_values,partition(A),rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) |> tuple_of_arrays + assembled = true + B = PSparseMatrix(vals_fa,rows_fa,cols_fa,assembled) + if val_parameter(reuse) == false + B + else + B, cache + end end end + + _psparse_assemble_impl(A,T,rows;reuse,assembly_neighbors_options_cols) end # End new assemble @@ -1782,332 +1792,185 @@ function consistent!(B::PSparseMatrix,A::PSparseMatrix,cache) psparse_consistent_impl!(B,A,T,cache) end -# function psparse_consistent_impl( -# A, -# ::Type{<:AbstractSplitMatrix}, -# rows_co; -# reuse=Val(false)) - -# function setup_snd(A,parts_snd,lids_snd,rows_co,cols_fa) -# own_to_local_row = own_to_local(rows_co) -# own_to_global_row = own_to_global(rows_co) -# own_to_global_col = own_to_global(cols_fa) -# ghost_to_global_col = ghost_to_global(cols_fa) -# nl = size(A,1) -# li_to_ps_ptrs = zeros(Int32,nl+1) -# for p in 1:length(lids_snd) -# for li in lids_snd[p] -# li_to_ps_ptrs[li+1] += 1 -# end -# end -# length_to_ptrs!(li_to_ps_ptrs) -# ndata = li_to_ps_ptrs[end]-1 -# li_to_ps_data = zeros(Int32,ndata) -# for p in 1:length(lids_snd) -# for li in lids_snd[p] -# q = li_to_ps_ptrs[li] -# li_to_ps_data[q] = p -# li_to_ps_ptrs[li] = q + 1 -# end -# end -# rewind_ptrs!(li_to_ps_ptrs) -# li_to_ps = JaggedArray(li_to_ps_data,li_to_ps_ptrs) -# ptrs = zeros(Int32,length(parts_snd)+1) -# for (i,j,v) in nziterator(A.blocks.own_own) -# li = own_to_local_row[i] -# for p in li_to_ps[li] -# ptrs[p+1] += 1 -# end -# end -# for (i,j,v) in nziterator(A.blocks.own_ghost) -# li = own_to_local_row[i] -# for p in li_to_ps[li] -# ptrs[p+1] += 1 -# end -# end -# length_to_ptrs!(ptrs) -# ndata = ptrs[end]-1 -# T = eltype(A) -# I_snd = JaggedArray(zeros(Int,ndata),ptrs) -# J_snd = JaggedArray(zeros(Int,ndata),ptrs) -# V_snd = JaggedArray(zeros(T,ndata),ptrs) -# k_snd = JaggedArray(zeros(Int32,ndata),ptrs) -# for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_own)) -# li = own_to_local_row[i] -# for p 
in li_to_ps[li] -# q = ptrs[p] -# I_snd.data[q] = own_to_global_row[i] -# J_snd.data[q] = own_to_global_col[j] -# V_snd.data[q] = v -# k_snd.data[q] = k -# ptrs[p] += 1 -# end -# end -# nnz_own_own = nnz(A.blocks.own_own) -# for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_ghost)) -# li = own_to_local_row[i] -# for p in li_to_ps[li] -# q = ptrs[p] -# I_snd.data[q] = own_to_global_row[i] -# J_snd.data[q] = ghost_to_global_col[j] -# V_snd.data[q] = v -# k_snd.data[q] = k+nnz_own_own -# ptrs[p] += 1 -# end -# end -# rewind_ptrs!(ptrs) -# cache_snd = (;parts_snd,lids_snd,I_snd,J_snd,V_snd,k_snd) -# cache_snd -# end -# function setup_rcv(parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) -# cache_rcv = (;parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) -# cache_rcv -# end -# function finalize(A,cache_snd,cache_rcv,rows_co,cols_fa,cols_co) -# I_rcv_data = cache_rcv.I_rcv.data -# J_rcv_data = cache_rcv.J_rcv.data -# V_rcv_data = cache_rcv.V_rcv.data -# global_to_own_col = global_to_own(cols_co) -# global_to_ghost_col = global_to_ghost(cols_co) -# is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data) -# is_ghost = findall(j->global_to_ghost_col[j]!=0,J_rcv_data) -# I_rcv_own = I_rcv_data[is_own] -# J_rcv_own = J_rcv_data[is_own] -# V_rcv_own = V_rcv_data[is_own] -# I_rcv_ghost = I_rcv_data[is_ghost] -# J_rcv_ghost = J_rcv_data[is_ghost] -# V_rcv_ghost = V_rcv_data[is_ghost] -# map_global_to_ghost!(I_rcv_own,rows_co) -# map_global_to_ghost!(I_rcv_ghost,rows_co) -# map_global_to_own!(J_rcv_own,cols_co) -# map_global_to_ghost!(J_rcv_ghost,cols_co) -# I2,J2,V2 = findnz(A.blocks.own_ghost) -# map_ghost_to_global!(J2,cols_fa) -# map_global_to_ghost!(J2,cols_co) -# n_own_rows = own_length(rows_co) -# n_ghost_rows = ghost_length(rows_co) -# n_own_cols = own_length(cols_co) -# n_ghost_cols = ghost_length(cols_co) -# TA = typeof(A.blocks.ghost_own) -# own_own = A.blocks.own_own -# own_ghost = compresscoo(TA,I2,J2,V2,n_own_rows,n_ghost_cols) # TODO this can be improved -# ghost_own = 
compresscoo(TA,I_rcv_own,J_rcv_own,V_rcv_own,n_ghost_rows,n_own_cols) -# ghost_ghost = compresscoo(TA,I_rcv_ghost,J_rcv_ghost,V_rcv_ghost,n_ghost_rows,n_ghost_cols) -# K_own = precompute_nzindex(ghost_own,I_rcv_own,J_rcv_own) -# K_ghost = precompute_nzindex(ghost_ghost,I_rcv_ghost,J_rcv_ghost) -# blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) -# values = split_matrix(blocks,local_permutation(rows_co),local_permutation(cols_co)) -# k_snd = cache_snd.k_snd -# V_snd = cache_snd.V_snd -# V_rcv = cache_rcv.V_rcv -# parts_snd = cache_snd.parts_snd -# parts_rcv = cache_rcv.parts_rcv -# cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,is_ghost,is_own,V_rcv_own,V_rcv_ghost,K_own,K_ghost) -# values,cache -# end -# @assert matching_own_indices(axes(A,1),PRange(rows_co)) -# rows_fa = partition(axes(A,1)) -# cols_fa = partition(axes(A,2)) -# # snd and rcv are swapped on purpose -# parts_rcv,parts_snd = assembly_neighbors(rows_co) -# lids_rcv,lids_snd = assembly_local_indices(rows_co) -# cache_snd = map(setup_snd,partition(A),parts_snd,lids_snd,rows_co,cols_fa) -# I_snd = map(i->i.I_snd,cache_snd) -# J_snd = map(i->i.J_snd,cache_snd) -# V_snd = map(i->i.V_snd,cache_snd) -# graph = ExchangeGraph(parts_snd,parts_rcv) -# t_I = exchange(I_snd,graph) -# t_J = exchange(J_snd,graph) -# t_V = exchange(V_snd,graph) -# @fake_async begin -# I_rcv = fetch(t_I) -# J_rcv = fetch(t_J) -# V_rcv = fetch(t_V) -# J_rcv_data = map(x->x.data,J_rcv) -# J_rcv_owner = find_owner(cols_fa,J_rcv_data) -# cols_co = map(union_ghost,cols_fa,J_rcv_data,J_rcv_owner) -# cache_rcv = map(setup_rcv,parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) -# values,cache = map(finalize,partition(A),cache_snd,cache_rcv,rows_co,cols_fa,cols_co) |> tuple_of_arrays -# B = PSparseMatrix(values,rows_co,cols_co,A.assembled) -# if val_parameter(reuse) == false -# B -# else -# B,cache -# end -# end -# end - # New consistent #################### - -function consistent_setup_snd(A,parts_snd,lids_snd,rows_co,cols_fa) - 
own_to_local_row::UnitRange{Int32} = own_to_local(rows_co) - own_to_global_row = own_to_global(rows_co) - own_to_global_col = own_to_global(cols_fa) - ghost_to_global_col = ghost_to_global(cols_fa) - nl = size(A,1) - li_to_ps_ptrs = zeros(Int32,nl+1) - for p in 1:length(lids_snd) - for li_ptr in jagged_range(lids_snd,p) - li = lids_snd.data[li_ptr] - li_to_ps_ptrs[li+1] += 1 +function psparse_consistent_impl( + A, + ::Type{T}, + rows_co; + reuse=Val(false)) where T<:AbstractSplitMatrix + function consistent_setup_snd(A,parts_snd,lids_snd,rows_co,cols_fa) + own_to_local_row::UnitRange{Int32} = own_to_local(rows_co) + own_to_global_row = own_to_global(rows_co) + own_to_global_col = own_to_global(cols_fa) + ghost_to_global_col = ghost_to_global(cols_fa) + nl = size(A,1) + li_to_ps_ptrs = zeros(Int32,nl+1) + for p in 1:length(lids_snd) + for li_ptr in jagged_range(lids_snd,p) + li = lids_snd.data[li_ptr] + li_to_ps_ptrs[li+1] += 1 + end end - end - length_to_ptrs!(li_to_ps_ptrs) - ndata = li_to_ps_ptrs[end]-1 - li_to_ps_data = zeros(Int32,ndata) - for p in 1:length(lids_snd) - for li_ptr in jagged_range(lids_snd,p) - li = lids_snd.data[li_ptr] - q = li_to_ps_ptrs[li] - li_to_ps_data[q] = p - li_to_ps_ptrs[li] = q + 1 + length_to_ptrs!(li_to_ps_ptrs) + ndata = li_to_ps_ptrs[end]-1 + li_to_ps_data = zeros(Int32,ndata) + for p in 1:length(lids_snd) + for li_ptr in jagged_range(lids_snd,p) + li = lids_snd.data[li_ptr] + q = li_to_ps_ptrs[li] + li_to_ps_data[q] = p + li_to_ps_ptrs[li] = q + 1 + end end - end - - rewind_ptrs!(li_to_ps_ptrs) - li_to_ps = JaggedArray(li_to_ps_data,li_to_ps_ptrs) - ptrs = zeros(Int32,length(parts_snd)+1) - for (i,j,v) in nziterator(A.blocks.own_own) - # @show(typeof(own_to_local_row)) - li = own_to_local_row[i] - for li_ptr in jagged_range(li_to_ps,li) - p = li_to_ps.data[li_ptr] - ptrs[p+1] += 1 + + rewind_ptrs!(li_to_ps_ptrs) + li_to_ps = JaggedArray(li_to_ps_data,li_to_ps_ptrs) + ptrs = zeros(Int32,length(parts_snd)+1) + for (i,j,v) in 
nziterator(A.blocks.own_own) + # @show(typeof(own_to_local_row)) + li = own_to_local_row[i] + for li_ptr in jagged_range(li_to_ps,li) + p = li_to_ps.data[li_ptr] + ptrs[p+1] += 1 + end end - end - - for (i,j,v) in nziterator(A.blocks.own_ghost) - li = own_to_local_row[i] - for ptr in jagged_range(li_to_ps,li) - p=li_to_ps.data[ptr] - ptrs[p+1] += 1 + + for (i,j,v) in nziterator(A.blocks.own_ghost) + li = own_to_local_row[i] + for ptr in jagged_range(li_to_ps,li) + p=li_to_ps.data[ptr] + ptrs[p+1] += 1 + end end - end - length_to_ptrs!(ptrs) - ndata = ptrs[end]-1 - T = eltype(A) - I_snd = JaggedArray(zeros(Int,ndata),ptrs) - J_snd = JaggedArray(zeros(Int,ndata),ptrs) - V_snd = JaggedArray(zeros(T,ndata),ptrs) - k_snd = JaggedArray(zeros(Int32,ndata),ptrs) - for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_own)) - li = own_to_local_row[i] - for p_ptr in jagged_range(li_to_ps,li) - p = li_to_ps.data[p_ptr] - q = ptrs[p] - I_snd.data[q] = own_to_global_row[i] - J_snd.data[q] = own_to_global_col[j] - V_snd.data[q] = v - k_snd.data[q] = k - ptrs[p] += 1 + length_to_ptrs!(ptrs) + ndata = ptrs[end]-1 + T = eltype(A) + I_snd = JaggedArray(zeros(Int,ndata),ptrs) + J_snd = JaggedArray(zeros(Int,ndata),ptrs) + V_snd = JaggedArray(zeros(T,ndata),ptrs) + k_snd = JaggedArray(zeros(Int32,ndata),ptrs) + for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_own)) + li = own_to_local_row[i] + for p_ptr in jagged_range(li_to_ps,li) + p = li_to_ps.data[p_ptr] + q = ptrs[p] + I_snd.data[q] = own_to_global_row[i] + J_snd.data[q] = own_to_global_col[j] + V_snd.data[q] = v + k_snd.data[q] = k + ptrs[p] += 1 + end end - end - - nnz_own_own = nnz(A.blocks.own_own) - for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_ghost)) - li = own_to_local_row[i] - for p_ptr in jagged_range(li_to_ps,li) - p=li_to_ps.data[p_ptr] - q = ptrs[p] - I_snd.data[q] = own_to_global_row[i] - J_snd.data[q] = ghost_to_global_col[j] - V_snd.data[q] = v - k_snd.data[q] = k+nnz_own_own - ptrs[p] += 1 + + 
nnz_own_own = nnz(A.blocks.own_own) + for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_ghost)) + li = own_to_local_row[i] + for p_ptr in jagged_range(li_to_ps,li) + p=li_to_ps.data[p_ptr] + q = ptrs[p] + I_snd.data[q] = own_to_global_row[i] + J_snd.data[q] = ghost_to_global_col[j] + V_snd.data[q] = v + k_snd.data[q] = k+nnz_own_own + ptrs[p] += 1 + end end - end - rewind_ptrs!(ptrs) - cache_snd = (;parts_snd,lids_snd,I_snd,J_snd,V_snd,k_snd) - cache_snd -end - -function consistent_setup_rcv(parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) - cache_rcv = (;parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) - cache_rcv -end - -function consistent_finalize(A,cache_snd,cache_rcv,rows_co,cols_fa,cols_co) - I_rcv_data = cache_rcv.I_rcv.data - J_rcv_data = cache_rcv.J_rcv.data - V_rcv_data = cache_rcv.V_rcv.data - global_to_own_col = global_to_own(cols_co) - global_to_ghost_col = global_to_ghost(cols_co) - is_own_condition = k -> global_to_own_col[k]!=0 - is_own = is_own_condition.(J_rcv_data) - I_rcv_own = I_rcv_data[is_own] - J_rcv_own = J_rcv_data[is_own] - V_rcv_own = V_rcv_data[is_own] - is_ghost = map!(!,is_own, is_own) # inverse is_own bitvector to effectively represent is_ghost mask - I_rcv_ghost = I_rcv_data[is_ghost] - J_rcv_ghost = J_rcv_data[is_ghost] - V_rcv_ghost = V_rcv_data[is_ghost] - map_global_to_ghost!(I_rcv_own,rows_co) - map_global_to_ghost!(I_rcv_ghost,rows_co) - map_global_to_own!(J_rcv_own,cols_co) - map_global_to_ghost!(J_rcv_ghost,cols_co) - I2,J2,V2 = findnz(A.blocks.own_ghost) - map_ghost_to_global!(J2,cols_fa) - map_global_to_ghost!(J2,cols_co) - n_own_rows = own_length(rows_co) - n_ghost_rows = ghost_length(rows_co) - n_own_cols = own_length(cols_co) - n_ghost_cols = ghost_length(cols_co) - TA = typeof(A.blocks.ghost_own) - own_own = A.blocks.own_own - own_ghost = compresscoo(TA,I2,J2,V2,n_own_rows,n_ghost_cols) # TODO this can be improved - ghost_own = compresscoo(TA,I_rcv_own,J_rcv_own,V_rcv_own,n_ghost_rows,n_own_cols) - ghost_ghost = 
compresscoo(TA,I_rcv_ghost,J_rcv_ghost,V_rcv_ghost,n_ghost_rows,n_ghost_cols) - K_own = precompute_nzindex(ghost_own,I_rcv_own,J_rcv_own) - K_ghost = precompute_nzindex(ghost_ghost,I_rcv_ghost,J_rcv_ghost) - blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) - values = split_matrix(blocks,local_permutation(rows_co),local_permutation(cols_co)) - k_snd = cache_snd.k_snd - V_snd = cache_snd.V_snd - V_rcv = cache_rcv.V_rcv - parts_snd = cache_snd.parts_snd - parts_rcv = cache_rcv.parts_rcv - cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,V_rcv_own,V_rcv_ghost,K_own,K_ghost) - values,cache -end - -function psparse_consistent_impl( - A, - ::Type{<:AbstractSplitMatrix}, - rows_co; - reuse=Val(false)) - @assert matching_own_indices(axes(A,1),PRange(rows_co)) - rows_fa = partition(axes(A,1)) - cols_fa = partition(axes(A,2)) - # snd and rcv are swapped on purpose - parts_rcv,parts_snd = assembly_neighbors(rows_co) - lids_rcv,lids_snd = assembly_local_indices(rows_co) - cache_snd = map(consistent_setup_snd,partition(A),parts_snd,lids_snd,rows_co,cols_fa) - I_snd = map(i->i.I_snd,cache_snd) - J_snd = map(i->i.J_snd,cache_snd) - V_snd = map(i->i.V_snd,cache_snd) - graph = ExchangeGraph(parts_snd,parts_rcv) - t_I = exchange(I_snd,graph) - t_J = exchange(J_snd,graph) - t_V = exchange(V_snd,graph) - @fake_async begin - I_rcv = fetch(t_I) - J_rcv = fetch(t_J) - V_rcv = fetch(t_V) - J_rcv_data = map(x->x.data,J_rcv) - J_rcv_owner = find_owner(cols_fa,J_rcv_data) - cols_co = map(union_ghost,cols_fa,J_rcv_data,J_rcv_owner) - cache_rcv = map(consistent_setup_rcv,parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) - values,cache = map(consistent_finalize,partition(A),cache_snd,cache_rcv,rows_co,cols_fa,cols_co) |> tuple_of_arrays - B = PSparseMatrix(values,rows_co,cols_co,A.assembled) - if val_parameter(reuse) == false - B - else - B,cache + rewind_ptrs!(ptrs) + cache_snd = (;parts_snd,lids_snd,I_snd,J_snd,V_snd,k_snd) + cache_snd + end + + function 
consistent_setup_rcv(parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) + cache_rcv = (;parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) + cache_rcv + end + + function consistent_finalize(A,cache_snd,cache_rcv,rows_co,cols_fa,cols_co) + I_rcv_data = cache_rcv.I_rcv.data + J_rcv_data = cache_rcv.J_rcv.data + V_rcv_data = cache_rcv.V_rcv.data + global_to_own_col = global_to_own(cols_co) + global_to_ghost_col = global_to_ghost(cols_co) + is_own_condition = k -> global_to_own_col[k]!=0 + is_own = is_own_condition.(J_rcv_data) + I_rcv_own = I_rcv_data[is_own] + J_rcv_own = J_rcv_data[is_own] + V_rcv_own = V_rcv_data[is_own] + is_ghost = map!(!,is_own, is_own) # inverse is_own bitvector to effectively represent is_ghost mask + I_rcv_ghost = I_rcv_data[is_ghost] + J_rcv_ghost = J_rcv_data[is_ghost] + V_rcv_ghost = V_rcv_data[is_ghost] + map_global_to_ghost!(I_rcv_own,rows_co) + map_global_to_ghost!(I_rcv_ghost,rows_co) + map_global_to_own!(J_rcv_own,cols_co) + map_global_to_ghost!(J_rcv_ghost,cols_co) + I2,J2,V2 = findnz(A.blocks.own_ghost) + map_ghost_to_global!(J2,cols_fa) + map_global_to_ghost!(J2,cols_co) + n_own_rows = own_length(rows_co) + n_ghost_rows = ghost_length(rows_co) + n_own_cols = own_length(cols_co) + n_ghost_cols = ghost_length(cols_co) + TA = typeof(A.blocks.ghost_own) + own_own = A.blocks.own_own + own_ghost = compresscoo(TA,I2,J2,V2,n_own_rows,n_ghost_cols) # TODO this can be improved + ghost_own = compresscoo(TA,I_rcv_own,J_rcv_own,V_rcv_own,n_ghost_rows,n_own_cols) + ghost_ghost = compresscoo(TA,I_rcv_ghost,J_rcv_ghost,V_rcv_ghost,n_ghost_rows,n_ghost_cols) + K_own = precompute_nzindex(ghost_own,I_rcv_own,J_rcv_own) + K_ghost = precompute_nzindex(ghost_ghost,I_rcv_ghost,J_rcv_ghost) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + values = split_matrix(blocks,local_permutation(rows_co),local_permutation(cols_co)) + k_snd = cache_snd.k_snd + V_snd = cache_snd.V_snd + V_rcv = cache_rcv.V_rcv + parts_snd = cache_snd.parts_snd + parts_rcv = 
cache_rcv.parts_rcv + cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,V_rcv_own,V_rcv_ghost,K_own,K_ghost) + values,cache + end + + function _psparse_consistent_impl( + A, + ::Type{<:AbstractSplitMatrix}, + rows_co; + reuse=Val(false)) + @assert matching_own_indices(axes(A,1),PRange(rows_co)) + rows_fa = partition(axes(A,1)) + cols_fa = partition(axes(A,2)) + # snd and rcv are swapped on purpose + parts_rcv,parts_snd = assembly_neighbors(rows_co) + lids_rcv,lids_snd = assembly_local_indices(rows_co) + cache_snd = map(consistent_setup_snd,partition(A),parts_snd,lids_snd,rows_co,cols_fa) + I_snd = map(i->i.I_snd,cache_snd) + J_snd = map(i->i.J_snd,cache_snd) + V_snd = map(i->i.V_snd,cache_snd) + graph = ExchangeGraph(parts_snd,parts_rcv) + t_I = exchange(I_snd,graph) + t_J = exchange(J_snd,graph) + t_V = exchange(V_snd,graph) + @fake_async begin + I_rcv = fetch(t_I) + J_rcv = fetch(t_J) + V_rcv = fetch(t_V) + J_rcv_data = map(x->x.data,J_rcv) + J_rcv_owner = find_owner(cols_fa,J_rcv_data) + cols_co = map(union_ghost,cols_fa,J_rcv_data,J_rcv_owner) + cache_rcv = map(consistent_setup_rcv,parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) + values,cache = map(consistent_finalize,partition(A),cache_snd,cache_rcv,rows_co,cols_fa,cols_co) |> tuple_of_arrays + B = PSparseMatrix(values,rows_co,cols_co,A.assembled) + if val_parameter(reuse) == false + B + else + B,cache + end end end + + _psparse_consistent_impl(A,T,rows_co;reuse) end + # End new consistent #################### From 7bdb36561704d756b6f65770da198ff414ad4468 Mon Sep 17 00:00:00 2001 From: jop611 Date: Thu, 15 Aug 2024 18:35:48 +0200 Subject: [PATCH 05/34] fix in consistent_impl --- src/p_sparse_matrix.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index b315e2af..4c299a94 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -1534,7 +1534,7 @@ function psparse_assemble_impl( rows; reuse=Val(false), 
assembly_neighbors_options_cols=(;)) where T<:AbstractSplitMatrix - + function setup_cache_snd(A,parts_snd,rows_sa,cols_sa) A_ghost_own = A.blocks.ghost_own A_ghost_ghost = A.blocks.ghost_ghost @@ -1845,10 +1845,10 @@ function psparse_consistent_impl( end length_to_ptrs!(ptrs) ndata = ptrs[end]-1 - T = eltype(A) + Tv = eltype(A) I_snd = JaggedArray(zeros(Int,ndata),ptrs) J_snd = JaggedArray(zeros(Int,ndata),ptrs) - V_snd = JaggedArray(zeros(T,ndata),ptrs) + V_snd = JaggedArray(zeros(Tv,ndata),ptrs) k_snd = JaggedArray(zeros(Int32,ndata),ptrs) for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_own)) li = own_to_local_row[i] @@ -1932,7 +1932,7 @@ function psparse_consistent_impl( function _psparse_consistent_impl( A, - ::Type{<:AbstractSplitMatrix}, + ::T, rows_co; reuse=Val(false)) @assert matching_own_indices(axes(A,1),PRange(rows_co)) From cc8a11de29a9aaa6b1ddcabac8c7ba25fb5d3321 Mon Sep 17 00:00:00 2001 From: jop611 Date: Thu, 15 Aug 2024 18:37:12 +0200 Subject: [PATCH 06/34] fix in consistent_impl --- src/p_sparse_matrix.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index 4c299a94..367ae1aa 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -1668,7 +1668,7 @@ function psparse_assemble_impl( function _psparse_assemble_impl( A, - ::Type{<:AbstractSplitMatrix}, + ::T, rows; reuse=Val(false), assembly_neighbors_options_cols=(;)) From 851b3b125d7a455eb131cedfb379fc530e63e555 Mon Sep 17 00:00:00 2001 From: jop611 Date: Thu, 15 Aug 2024 18:49:34 +0200 Subject: [PATCH 07/34] fix in assemble_impl --- src/p_sparse_matrix.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index 367ae1aa..a44d1f0f 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -1668,10 +1668,10 @@ function psparse_assemble_impl( function _psparse_assemble_impl( A, - ::T, + ::Type{T}, rows; reuse=Val(false), - 
assembly_neighbors_options_cols=(;)) + assembly_neighbors_options_cols=(;)) where T<:AbstractSplitMatrix rows_sa = partition(axes(A,1)) @@ -1799,6 +1799,7 @@ function psparse_consistent_impl( ::Type{T}, rows_co; reuse=Val(false)) where T<:AbstractSplitMatrix + function consistent_setup_snd(A,parts_snd,lids_snd,rows_co,cols_fa) own_to_local_row::UnitRange{Int32} = own_to_local(rows_co) own_to_global_row = own_to_global(rows_co) From cb2d5ef56852ba1dd4539b47d3c5be159d1c6c52 Mon Sep 17 00:00:00 2001 From: jop611 Date: Thu, 15 Aug 2024 18:52:41 +0200 Subject: [PATCH 08/34] fix in consistent_impl --- src/p_sparse_matrix.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index a44d1f0f..3a372245 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -1933,9 +1933,9 @@ function psparse_consistent_impl( function _psparse_consistent_impl( A, - ::T, + ::Type{T}, rows_co; - reuse=Val(false)) + reuse=Val(false)) where T<:AbstractSplitMatrix @assert matching_own_indices(axes(A,1),PRange(rows_co)) rows_fa = partition(axes(A,1)) cols_fa = partition(axes(A,2)) From b40dcdee042d622a9454f402de71b02e7a59aaf3 Mon Sep 17 00:00:00 2001 From: jop611 Date: Fri, 16 Aug 2024 15:06:18 +0200 Subject: [PATCH 09/34] minor changes+some cleanup --- src/p_sparse_matrix.jl | 40 +++++++++++++++------------------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index 3a372245..e1c2ed8d 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -1528,12 +1528,11 @@ end # New assemble #################### -function psparse_assemble_impl( - A, - ::Type{T}, - rows; - reuse=Val(false), - assembly_neighbors_options_cols=(;)) where T<:AbstractSplitMatrix +function psparse_assemble_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D}, + ::Type{T}, + rows; + reuse=Val(false), + assembly_neighbors_options_cols=(;)) where {T<:AbstractSplitMatrix, Tv} 
function setup_cache_snd(A,parts_snd,rows_sa,cols_sa) A_ghost_own = A.blocks.ghost_own @@ -1554,7 +1553,6 @@ function psparse_assemble_impl( ptrs[owner_to_p[owner]+1] += 1 end length_to_ptrs!(ptrs) - Tv = eltype(A_ghost_own) ndata = ptrs[end]-1 I_snd_data = zeros(Int,ndata) J_snd_data = zeros(Int,ndata) @@ -1642,7 +1640,6 @@ function psparse_assemble_impl( n_ghost_rows = ghost_length(rows_fa) n_ghost_cols = ghost_length(cols_fa) Ti = indextype(A.blocks.own_own) - Tv = eltype(A.blocks.own_own) own_own = compresscoo(TA,own_own_triplet...,n_own_rows,n_own_cols) own_ghost = compresscoo(TA,own_ghost_triplet...,n_own_rows,n_ghost_cols) ghost_own = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_own_cols) @@ -1652,12 +1649,12 @@ function psparse_assemble_impl( nnz_own_own = nnz(own_own) k_own_sa = precompute_nzindex(own_own,own_own_triplet[1:2]...) k_ghost_sa = precompute_nzindex(own_ghost,own_ghost_triplet[1:2]...) - for p in 1:length(I_rcv_own) + for p in eachindex(I_rcv_own) i = I_rcv_own[p] j = J_rcv_own[p] k_rcv_own[p] = nzindex(own_own,i,j) end - for p in 1:length(I_rcv_ghost) + for p in eachindex(I_rcv_ghost) i = I_rcv_ghost[p] j = J_rcv_ghost[p] k_rcv_ghost[p] = nzindex(own_ghost,i,j) + nnz_own_own @@ -1687,7 +1684,7 @@ function psparse_assemble_impl( t_I = exchange(I_snd,graph) t_J = exchange(J_snd,graph) t_V = exchange(V_snd,graph) - @fake_async begin + @sync begin I_rcv = fetch(t_I) J_rcv = fetch(t_J) V_rcv = fetch(t_V) @@ -1794,14 +1791,13 @@ end # New consistent #################### -function psparse_consistent_impl( - A, - ::Type{T}, - rows_co; - reuse=Val(false)) where T<:AbstractSplitMatrix +function psparse_consistent_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D}, + ::Type{T}, + rows_co; + reuse=Val(false)) where {T<:AbstractSplitMatrix, Tv} function consistent_setup_snd(A,parts_snd,lids_snd,rows_co,cols_fa) - own_to_local_row::UnitRange{Int32} = own_to_local(rows_co) + own_to_local_row = own_to_local(rows_co) own_to_global_row = own_to_global(rows_co) 
own_to_global_col = own_to_global(cols_fa) ghost_to_global_col = ghost_to_global(cols_fa) @@ -1829,7 +1825,6 @@ function psparse_consistent_impl( li_to_ps = JaggedArray(li_to_ps_data,li_to_ps_ptrs) ptrs = zeros(Int32,length(parts_snd)+1) for (i,j,v) in nziterator(A.blocks.own_own) - # @show(typeof(own_to_local_row)) li = own_to_local_row[i] for li_ptr in jagged_range(li_to_ps,li) p = li_to_ps.data[li_ptr] @@ -1846,7 +1841,6 @@ function psparse_consistent_impl( end length_to_ptrs!(ptrs) ndata = ptrs[end]-1 - Tv = eltype(A) I_snd = JaggedArray(zeros(Int,ndata),ptrs) J_snd = JaggedArray(zeros(Int,ndata),ptrs) V_snd = JaggedArray(zeros(Tv,ndata),ptrs) @@ -1892,7 +1886,7 @@ function psparse_consistent_impl( J_rcv_data = cache_rcv.J_rcv.data V_rcv_data = cache_rcv.V_rcv.data global_to_own_col = global_to_own(cols_co) - global_to_ghost_col = global_to_ghost(cols_co) + # global_to_ghost_col = global_to_ghost(cols_co) is_own_condition = k -> global_to_own_col[k]!=0 is_own = is_own_condition.(J_rcv_data) I_rcv_own = I_rcv_data[is_own] @@ -1937,7 +1931,6 @@ function psparse_consistent_impl( rows_co; reuse=Val(false)) where T<:AbstractSplitMatrix @assert matching_own_indices(axes(A,1),PRange(rows_co)) - rows_fa = partition(axes(A,1)) cols_fa = partition(axes(A,2)) # snd and rcv are swapped on purpose parts_rcv,parts_snd = assembly_neighbors(rows_co) @@ -1950,7 +1943,7 @@ function psparse_consistent_impl( t_I = exchange(I_snd,graph) t_J = exchange(J_snd,graph) t_V = exchange(V_snd,graph) - @fake_async begin + @sync begin I_rcv = fetch(t_I) J_rcv = fetch(t_J) V_rcv = fetch(t_V) @@ -1967,11 +1960,8 @@ function psparse_consistent_impl( end end end - _psparse_consistent_impl(A,T,rows_co;reuse) end - - # End new consistent #################### From 315999cfb8c60d54d08af034b3e727da775828b0 Mon Sep 17 00:00:00 2001 From: jop611 Date: Fri, 16 Aug 2024 15:23:45 +0200 Subject: [PATCH 10/34] fixed leftover debug setting --- src/p_sparse_matrix.jl | 4 ++-- times.txt | 1 + 2 files changed, 
3 insertions(+), 2 deletions(-) create mode 100644 times.txt diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index e1c2ed8d..07e43889 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -1684,7 +1684,7 @@ function psparse_assemble_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D}, t_I = exchange(I_snd,graph) t_J = exchange(J_snd,graph) t_V = exchange(V_snd,graph) - @sync begin + @fake_async begin I_rcv = fetch(t_I) J_rcv = fetch(t_J) V_rcv = fetch(t_V) @@ -1943,7 +1943,7 @@ function psparse_consistent_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D}, t_I = exchange(I_snd,graph) t_J = exchange(J_snd,graph) t_V = exchange(V_snd,graph) - @sync begin + @fake_async begin I_rcv = fetch(t_I) J_rcv = fetch(t_J) V_rcv = fetch(t_V) diff --git a/times.txt b/times.txt new file mode 100644 index 00000000..cc438bc4 --- /dev/null +++ b/times.txt @@ -0,0 +1 @@ +Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2010888, max = 0.2010888, avg = 0.2010888), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4020026, max = 0.4020026, avg = 0.4020026), "Phase 1" => (min = 5.0e-7, max = 5.0e-7, avg = 5.0e-7)) From 395fa618007a3ed3177ef57ae3b2b5719b7873f0 Mon Sep 17 00:00:00 2001 From: jop611 Date: Fri, 16 Aug 2024 19:46:26 +0200 Subject: [PATCH 11/34] fixed bug in updated psparse_consistent_impl! 
--- src/p_sparse_matrix.jl | 13 +++++++++---- test/p_sparse_matrix_tests.jl | 1 - test/runtests.jl | 2 +- times.txt | 2 +- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index 07e43889..b8af2da2 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -1889,10 +1889,10 @@ function psparse_consistent_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D}, # global_to_ghost_col = global_to_ghost(cols_co) is_own_condition = k -> global_to_own_col[k]!=0 is_own = is_own_condition.(J_rcv_data) + is_ghost = map(!,is_own) # inverse is_own bitvector to effectively represent is_ghost mask I_rcv_own = I_rcv_data[is_own] J_rcv_own = J_rcv_data[is_own] V_rcv_own = V_rcv_data[is_own] - is_ghost = map!(!,is_own, is_own) # inverse is_own bitvector to effectively represent is_ghost mask I_rcv_ghost = I_rcv_data[is_ghost] J_rcv_ghost = J_rcv_data[is_ghost] V_rcv_ghost = V_rcv_data[is_ghost] @@ -1921,7 +1921,7 @@ function psparse_consistent_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D}, V_rcv = cache_rcv.V_rcv parts_snd = cache_snd.parts_snd parts_rcv = cache_rcv.parts_rcv - cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,V_rcv_own,V_rcv_ghost,K_own,K_ghost) + cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,is_own,is_ghost,V_rcv_own,V_rcv_ghost,K_own,K_ghost) values,cache end @@ -1962,6 +1962,7 @@ function psparse_consistent_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D}, end _psparse_consistent_impl(A,T,rows_co;reuse) end + # End new consistent #################### @@ -1982,10 +1983,14 @@ function psparse_consistent_impl!(B,A,::Type{<:AbstractSplitMatrix},cache) end end function setup_rcv(B,cache) + is_own = cache.is_own + is_ghost = cache.is_ghost + V_rcv_data = cache.V_rcv.data K_own = cache.K_own K_ghost = cache.K_ghost - V_rcv_own = cache.V_rcv_own - V_rcv_ghost = cache.V_rcv_ghost + # Allocates memory, while cache.V_rcv_own/ghost could be reused. 
+ V_rcv_own = V_rcv_data[is_own] + V_rcv_ghost = V_rcv_data[is_ghost] setcoofast!(B.blocks.ghost_own,V_rcv_own,K_own) setcoofast!(B.blocks.ghost_ghost,V_rcv_ghost,K_ghost) B diff --git a/test/p_sparse_matrix_tests.jl b/test/p_sparse_matrix_tests.jl index 5ed4c903..08b24574 100644 --- a/test/p_sparse_matrix_tests.jl +++ b/test/p_sparse_matrix_tests.jl @@ -378,7 +378,6 @@ function p_sparse_matrix_tests(distribute) A_seq = centralize(A) spmm!(B,Z,A,cacheB) @test centralize(B) ≈ Z_seq*(A_seq) - B = transpose(Z)*A @test centralize(B) ≈ transpose(Z_seq)*A_seq diff --git a/test/runtests.jl b/test/runtests.jl index 92768453..ed7aff49 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,6 +5,6 @@ using Test @testset "jagged_array" begin include("jagged_array_tests.jl") end @testset "sparse_utils" begin include("sparse_utils_tests.jl") end @testset "debug_array" begin include("debug_array/runtests.jl") end -@testset "mpi_array" begin include("mpi_array/runtests.jl") end +# @testset "mpi_array" begin include("mpi_array/runtests.jl") end end # module diff --git a/times.txt b/times.txt index cc438bc4..7db5927d 100644 --- a/times.txt +++ b/times.txt @@ -1 +1 @@ -Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2010888, max = 0.2010888, avg = 0.2010888), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4020026, max = 0.4020026, avg = 0.4020026), "Phase 1" => (min = 5.0e-7, max = 5.0e-7, avg = 5.0e-7)) +Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2188811, max = 0.2188811, avg = 0.2188811), "Phase 3" => (min = 0.0, max = 0.0, avg = 0.0), "Matrix Assembly" => (min = 0.4098738, max = 0.4098738, avg = 0.4098738), "Phase 1" => (min = 5.0e-7, max = 5.0e-7, avg = 5.0e-7)) From c04ce81a22dbfe36c92d78d32be0374b74826ddd Mon Sep 17 00:00:00 2001 From: jop611 Date: Thu, 5 Sep 2024 18:07:34 +0200 Subject: [PATCH 12/34] reset to earlier state --- 
src/p_sparse_matrix.jl | 167 ----------------------------------------- times.txt | 2 +- 2 files changed, 1 insertion(+), 168 deletions(-) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index b8af2da2..5018965e 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -1357,173 +1357,6 @@ function psparse_assemble_impl(A,::Type,rows) error("Case not implemented yet") end -# function psparse_assemble_impl( -# A, -# ::Type{<:AbstractSplitMatrix}, -# rows; -# reuse=Val(false), -# assembly_neighbors_options_cols=(;)) - -# function setup_cache_snd(A,parts_snd,rows_sa,cols_sa) -# A_ghost_own = A.blocks.ghost_own -# A_ghost_ghost = A.blocks.ghost_ghost -# gen = ( owner=>i for (i,owner) in enumerate(parts_snd) ) -# owner_to_p = Dict(gen) -# ptrs = zeros(Int32,length(parts_snd)+1) -# ghost_to_owner_row = ghost_to_owner(rows_sa) -# ghost_to_global_row = ghost_to_global(rows_sa) -# own_to_global_col = own_to_global(cols_sa) -# ghost_to_global_col = ghost_to_global(cols_sa) -# for (i,_,_) in nziterator(A_ghost_own) -# owner = ghost_to_owner_row[i] -# ptrs[owner_to_p[owner]+1] += 1 -# end -# for (i,_,_) in nziterator(A_ghost_ghost) -# owner = ghost_to_owner_row[i] -# ptrs[owner_to_p[owner]+1] += 1 -# end -# length_to_ptrs!(ptrs) -# Tv = eltype(A_ghost_own) -# ndata = ptrs[end]-1 -# I_snd_data = zeros(Int,ndata) -# J_snd_data = zeros(Int,ndata) -# V_snd_data = zeros(Tv,ndata) -# k_snd_data = zeros(Int32,ndata) -# nnz_ghost_own = 0 -# for (k,(i,j,v)) in enumerate(nziterator(A_ghost_own)) -# owner = ghost_to_owner_row[i] -# p = ptrs[owner_to_p[owner]] -# I_snd_data[p] = ghost_to_global_row[i] -# J_snd_data[p] = own_to_global_col[j] -# V_snd_data[p] = v -# k_snd_data[p] = k -# ptrs[owner_to_p[owner]] += 1 -# nnz_ghost_own += 1 -# end -# for (k,(i,j,v)) in enumerate(nziterator(A_ghost_ghost)) -# owner = ghost_to_owner_row[i] -# p = ptrs[owner_to_p[owner]] -# I_snd_data[p] = ghost_to_global_row[i] -# J_snd_data[p] = ghost_to_global_col[j] -# V_snd_data[p] = v 
-# k_snd_data[p] = k+nnz_ghost_own -# ptrs[owner_to_p[owner]] += 1 -# end -# rewind_ptrs!(ptrs) -# I_snd = JaggedArray(I_snd_data,ptrs) -# J_snd = JaggedArray(J_snd_data,ptrs) -# V_snd = JaggedArray(V_snd_data,ptrs) -# k_snd = JaggedArray(k_snd_data,ptrs) -# (;I_snd,J_snd,V_snd,k_snd,parts_snd) -# end -# function setup_cache_rcv(I_rcv,J_rcv,V_rcv,parts_rcv) -# k_rcv_data = zeros(Int32,length(I_rcv.data)) -# k_rcv = JaggedArray(k_rcv_data,I_rcv.ptrs) -# (;I_rcv,J_rcv,V_rcv,k_rcv,parts_rcv) -# end -# function setup_own_triplets(A,cache_rcv,rows_sa,cols_sa) -# nz_own_own = findnz(A.blocks.own_own) -# nz_own_ghost = findnz(A.blocks.own_ghost) -# I_rcv_data = cache_rcv.I_rcv.data -# J_rcv_data = cache_rcv.J_rcv.data -# V_rcv_data = cache_rcv.V_rcv.data -# k_rcv_data = cache_rcv.k_rcv.data -# global_to_own_col = global_to_own(cols_sa) -# is_ghost = findall(j->global_to_own_col[j]==0,J_rcv_data) -# is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data) -# I_rcv_own = view(I_rcv_data,is_own) -# J_rcv_own = view(J_rcv_data,is_own) -# V_rcv_own = view(V_rcv_data,is_own) -# k_rcv_own = view(k_rcv_data,is_own) -# I_rcv_ghost = view(I_rcv_data,is_ghost) -# J_rcv_ghost = view(J_rcv_data,is_ghost) -# V_rcv_ghost = view(V_rcv_data,is_ghost) -# k_rcv_ghost = view(k_rcv_data,is_ghost) -# # After this col ids in own_ghost triplet remain global -# map_global_to_own!(I_rcv_own,rows_sa) -# map_global_to_own!(J_rcv_own,cols_sa) -# map_global_to_own!(I_rcv_ghost,rows_sa) -# map_ghost_to_global!(nz_own_ghost[2],cols_sa) -# own_own_I = vcat(nz_own_own[1],I_rcv_own) -# own_own_J = vcat(nz_own_own[2],J_rcv_own) -# own_own_V = vcat(nz_own_own[3],V_rcv_own) -# own_own_triplet = (own_own_I,own_own_J,own_own_V) -# own_ghost_I = vcat(nz_own_ghost[1],I_rcv_ghost) -# own_ghost_J = vcat(nz_own_ghost[2],J_rcv_ghost) -# own_ghost_V = vcat(nz_own_ghost[3],V_rcv_ghost) -# map_global_to_ghost!(nz_own_ghost[2],cols_sa) -# own_ghost_triplet = (own_ghost_I,own_ghost_J,own_ghost_V) -# triplets = 
(own_own_triplet,own_ghost_triplet) -# aux = (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) -# triplets, own_ghost_J, aux -# end -# function finalize_values(A,rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) -# (own_own_triplet,own_ghost_triplet) = triplets -# (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) = aux -# map_global_to_ghost!(own_ghost_triplet[2],cols_fa) -# map_global_to_ghost!(J_rcv_ghost,cols_fa) -# TA = typeof(A.blocks.own_own) -# n_own_rows = own_length(rows_fa) -# n_own_cols = own_length(cols_fa) -# n_ghost_rows = ghost_length(rows_fa) -# n_ghost_cols = ghost_length(cols_fa) -# Ti = indextype(A.blocks.own_own) -# Tv = eltype(A.blocks.own_own) -# own_own = compresscoo(TA,own_own_triplet...,n_own_rows,n_own_cols) -# own_ghost = compresscoo(TA,own_ghost_triplet...,n_own_rows,n_ghost_cols) -# ghost_own = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_own_cols) -# ghost_ghost = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_ghost_cols) -# blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) -# values = split_matrix(blocks,local_permutation(rows_fa),local_permutation(cols_fa)) -# nnz_own_own = nnz(own_own) -# k_own_sa = precompute_nzindex(own_own,own_own_triplet[1:2]...) -# k_ghost_sa = precompute_nzindex(own_ghost,own_ghost_triplet[1:2]...) -# for p in 1:length(I_rcv_own) -# i = I_rcv_own[p] -# j = J_rcv_own[p] -# k_rcv_own[p] = nzindex(own_own,i,j) -# end -# for p in 1:length(I_rcv_ghost) -# i = I_rcv_ghost[p] -# j = J_rcv_ghost[p] -# k_rcv_ghost[p] = nzindex(own_ghost,i,j) + nnz_own_own -# end -# cache = (;k_own_sa,k_ghost_sa,cache_snd...,cache_rcv...) 
-# values, cache -# end -# rows_sa = partition(axes(A,1)) -# cols_sa = partition(axes(A,2)) -# #rows = map(remove_ghost,rows_sa) -# cols = map(remove_ghost,cols_sa) -# parts_snd, parts_rcv = assembly_neighbors(rows_sa) -# cache_snd = map(setup_cache_snd,partition(A),parts_snd,rows_sa,cols_sa) -# I_snd = map(i->i.I_snd,cache_snd) -# J_snd = map(i->i.J_snd,cache_snd) -# V_snd = map(i->i.V_snd,cache_snd) -# graph = ExchangeGraph(parts_snd,parts_rcv) -# t_I = exchange(I_snd,graph) -# t_J = exchange(J_snd,graph) -# t_V = exchange(V_snd,graph) -# @fake_async begin -# I_rcv = fetch(t_I) -# J_rcv = fetch(t_J) -# V_rcv = fetch(t_V) -# cache_rcv = map(setup_cache_rcv,I_rcv,J_rcv,V_rcv,parts_rcv) -# triplets,J,aux = map(setup_own_triplets,partition(A),cache_rcv,rows_sa,cols_sa) |> tuple_of_arrays -# J_owner = find_owner(cols_sa,J) -# rows_fa = rows -# cols_fa = map(union_ghost,cols,J,J_owner) -# assembly_neighbors(cols_fa;assembly_neighbors_options_cols...) -# vals_fa, cache = map(finalize_values,partition(A),rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) |> tuple_of_arrays -# assembled = true -# B = PSparseMatrix(vals_fa,rows_fa,cols_fa,assembled) -# if val_parameter(reuse) == false -# B -# else -# B, cache -# end -# end -# end # New assemble #################### diff --git a/times.txt b/times.txt index 7db5927d..24e91690 100644 --- a/times.txt +++ b/times.txt @@ -1 +1 @@ -Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2188811, max = 0.2188811, avg = 0.2188811), "Phase 3" => (min = 0.0, max = 0.0, avg = 0.0), "Matrix Assembly" => (min = 0.4098738, max = 0.4098738, avg = 0.4098738), "Phase 1" => (min = 5.0e-7, max = 5.0e-7, avg = 5.0e-7)) +Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2146177, max = 0.2146177, avg = 0.2146177), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4019553, max = 0.4019553, avg = 0.4019553), "Phase 1" => (min = 1.4e-6, 
max = 1.4e-6, avg = 1.4e-6)) From a458fcc43a84d1ca62f84a4cce0f3ba76faa5cc6 Mon Sep 17 00:00:00 2001 From: jop611 Date: Mon, 9 Sep 2024 16:32:22 +0200 Subject: [PATCH 13/34] fixed mistake in PSparseMatrix documentation (fieldnames/types), added versions of repartition(A,rows,cols) and centralize that support non-default sparse method. --- src/p_sparse_matrix.jl | 50 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index 5018965e..62b6da5d 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -948,9 +948,9 @@ parallel implementations. # Properties -- `matrix_partition::A` -- `row_partition::B` -- `col_partition::C` +- `matrix_partition::B` +- `row_partition::C` +- `col_partition::D` - `assembled::Bool` `matrix_partition[i]` contains a (sparse) matrix with the local rows and the @@ -964,7 +964,7 @@ is fully contained in the own rows. # Supertype hierarchy - PSparseMatrix{V,A,B,C,T} <: AbstractMatrix{T} + PSparseMatrix{V,B,C,D,T} <: AbstractMatrix{T} with `T=eltype(V)`. 
""" @@ -2212,6 +2212,39 @@ function repartition(A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) end end +function repartition(sparse,A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) + @assert A.assembled "repartition on a sub-assembled matrix not implemented yet" + function prepare_triplets(A_own_own,A_own_ghost,A_rows,A_cols) + I1,J1,V1 = findnz(A_own_own) + I2,J2,V2 = findnz(A_own_ghost) + map_own_to_global!(I1,A_rows) + map_own_to_global!(I2,A_rows) + map_own_to_global!(J1,A_cols) + map_ghost_to_global!(J2,A_cols) + I = vcat(I1,I2) + J = vcat(J1,J2) + V = vcat(V1,V2) + (I,J,V) + end + A_own_own = own_own_values(A) + A_own_ghost = own_ghost_values(A) + A_rows = partition(axes(A,1)) + A_cols = partition(axes(A,2)) + I,J,V = map(prepare_triplets,A_own_own,A_own_ghost,A_rows,A_cols) |> tuple_of_arrays + # TODO this one does not preserve the local storage layout of A + t = psparse(sparse,I,J,V,new_rows,new_cols;reuse=true) + @fake_async begin + B,cacheB = fetch(t) + if val_parameter(reuse) == false + B + else + cache = (V,cacheB) + B, cache + end + end +end + + """ repartition!(B::PSparseMatrix,A::PSparseMatrix,cache) """ @@ -2281,6 +2314,15 @@ function centralize(A::PSparseMatrix) own_own_values(a_in_main) |> multicast |> getany end +function centralize(sparse,A::PSparseMatrix) + m,n = size(A) + ranks = linear_indices(partition(A)) + rows_trivial = trivial_partition(ranks,m) + cols_trivial = trivial_partition(ranks,n) + a_in_main = repartition(sparse,A,rows_trivial,cols_trivial) |> fetch + own_own_values(a_in_main) |> multicast |> getany +end + """ psystem(I,J,V,I2,V2,rows,cols;kwargs...) 
""" From 0d4c06f36bcd3b399181b39374f4824f5b03938a Mon Sep 17 00:00:00 2001 From: jop611 Date: Mon, 21 Oct 2024 10:49:50 +0200 Subject: [PATCH 14/34] added versions of sparse_diag_matrix with custom matrix type option, improved cosistent further by reusing own_ghost_block as much as possible through new sparse utils expand_sparse_matrix _columns --- src/gallery.jl | 162 +++++++++++++++++++++++++++++++++++++++++ src/p_range.jl | 4 +- src/p_sparse_matrix.jl | 40 +++++++--- src/sparse_utils.jl | 17 +++++ times.txt | 2 +- 5 files changed, 212 insertions(+), 13 deletions(-) diff --git a/src/gallery.jl b/src/gallery.jl index f68fec9d..8023c1f5 100644 --- a/src/gallery.jl +++ b/src/gallery.jl @@ -553,4 +553,166 @@ function nullspace_linear_elasticity!(B,x) end +function prolongator(T, + nodes_per_dir, + parts_per_dir, + parts; + index_type::Type{Ti} = Int64, + value_type::Type{Tv} = Float64) where {Ti,Tv} + # Improved version of aggregate function not using inefficient direct JaggedRange indexing causing many view allocations + # Also uses a generic function to obtain pointer arrays and index arrays to work with both CSC and CSR. + # If the problem is symmetric neighbours dont change by this. 
+ function aggregate(A,diagA=dense_diag(A);epsilon=0) + # This one is algorithm 5.1 from + # "Algebraic multigrid by smoothed aggregation for second and fourth order elliptic problems" + epsi = epsilon + typeof_strength = eltype(A.nzval) + + nnodes = size(A,1) + pending = Ti(0) + isolated = Ti(-1) + + node_to_aggregate = fill(pending,nnodes) + node_to_old_aggregate = similar(node_to_aggregate) + + node_to_neigs = jagged_array(index_array(A),pointer_array(A)) + neigs = node_to_neigs.data + node_to_vals = jagged_array(A.nzval,pointer_array(A)) + vals = node_to_vals.data + strongly_connected = (node,ineig) -> begin + neig = neigs[ineig] + aii = diagA[node] + ajj = diagA[neig] + aij = vals[ineig] + abs(aij) > epsi*sqrt(aii*ajj) + end + coupling_strength = (node,ineig) -> begin + abs(vals[ineig]) + end + + # Initialization + for node in 1:nnodes + neig_range = jagged_range(node_to_neigs,node) + isolated_node = count(i->neigs[i]!=node,neig_range) == 0 + if isolated_node + node_to_aggregate[node] = isolated + end + end + # Step 1 + aggregate = Ti(0) + for node in 1:nnodes + if node_to_aggregate[node] != pending + continue + end + neig_range = jagged_range(node_to_neigs,node) + all_pending = true + for ineig in neig_range + neig = neigs[ineig] + if neig == node || !strongly_connected(node,ineig) + continue + end + all_pending &= (node_to_aggregate[neig] == pending) + end + if !all_pending + continue + end + aggregate += Ti(1) + node_to_aggregate[node] = aggregate + for ineig in neig_range + neig = neigs[ineig] + if neig == node || !strongly_connected(node,ineig) + continue + end + node_to_aggregate[neig] = aggregate + end + end + # Step 2 + copy!(node_to_old_aggregate,node_to_aggregate) + for node in 1:nnodes + if node_to_aggregate[node] != pending + continue + end + strength = zero(typeof_strength) + neig_range = jagged_range(node_to_neigs, node) + for ineig in neig_range + neig = neigs[ineig] + if neig == node || !strongly_connected(node,ineig) + continue + end + 
neig_aggregate = node_to_old_aggregate[neig] + if neig_aggregate != pending && neig_aggregate != isolated + neig_strength = coupling_strength(node,ineig) + if neig_strength > strength + strength = neig_strength + node_to_aggregate[node] = neig_aggregate + end + end + end + end + + # Step 3 + for node in 1:nnodes + if node_to_aggregate[node] != pending + continue + end + aggregate += Ti(1) + node_to_aggregate[node] = aggregate + # neigs = node_to_neigs[node] + neig_range = jagged_range(node_to_neigs, node) + for ineig in neig_range + neig = neigs[ineig] + if neig == node || !strongly_connected(node,ineig) + continue + end + neig_aggregate = node_to_old_aggregate[neig] + if neig_aggregate == pending || neig_aggregate == isolated + node_to_aggregate[neig] = aggregate + end + end + end + naggregates = aggregate + + if nnodes == 1 + node_to_aggregate .= 1 + naggregates = 1 + end + node_to_aggregate, 1:naggregates + end + + function aggregate(A::PSparseMatrix,diagA=dense_diag(A);kwargs...) + # This is the vanilla "uncoupled" strategy from "Parallel Smoothed Aggregation Multigrid : Aggregation Strategies on Massively Parallel Machines" + # TODO: implement other more advanced strategies + @assert A.assembled + node_to_aggregate_data, local_ranges = map((A,diagA)->aggregate(A,diagA;kwargs...),own_own_values(A),own_values(diagA)) |> tuple_of_arrays + nown = map(length,local_ranges) + n_aggregates = sum(nown) + nparts = length(nown) + aggregate_partition = variable_partition(nown,n_aggregates) + node_partition = partition(axes(A,1)) + map(map_own_to_global!,node_to_aggregate_data,aggregate_partition) + node_to_aggregate = PVector(node_to_aggregate_data,node_partition) + node_to_aggregate, PRange(aggregate_partition) + end + + function constant_prolongator(T,node_to_aggregate::PVector,aggregates::PRange,n_nullspace_vecs) + if n_nullspace_vecs != 1 + error("case not implemented yet") + end + function setup_triplets(node_to_aggregate,nodes) + myI = 
UnitRange{Ti}(1:local_length(nodes)) + myJ = node_to_aggregate + myV = ones(length(node_to_aggregate)) + (myI,myJ,myV) + end + node_partition = partition(axes(node_to_aggregate,1)) + I,J,V = map(setup_triplets,partition(node_to_aggregate),node_partition) |> tuple_of_arrays + aggregate_partition = partition(aggregates) + J_owner = find_owner(aggregate_partition,J) + aggregate_partition = map(union_ghost,aggregate_partition,J,J_owner) + map(map_global_to_local!,J,aggregate_partition) + P0 = psparse(T,I,J,V,node_partition,aggregate_partition;assembled=true,indices=:local) |> fetch + P0 + end +end + diff --git a/src/p_range.jl b/src/p_range.jl index cfa330b0..c72f5c9f 100644 --- a/src/p_range.jl +++ b/src/p_range.jl @@ -408,7 +408,7 @@ end """ neigs_snd, neigs_rcv = assembly_neighbors(index_partition;kwargs...) -Return the ids of the neighbor parts from we send and receive data respectively +Return the ids of the neighbor parts from which we send and receive data respectively in the assembly of distributed vectors defined on the index partition `index_partition`. partition `index_partition`. `kwargs` are delegated to [`ExchangeGraph`](@ref) @@ -470,7 +470,7 @@ end function assembly_local_indices(indices,neighbors_snd,neighbors_rcv) cache = map(assembly_cache,indices) - mask = map(cache) do mycache + mask = map(cache) do mycache isassigned(mycache.local_indices_snd) && isassigned(mycache.local_indices_rcv) end if ! 
getany(mask) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index 62b6da5d..f579b4fd 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -1733,16 +1733,13 @@ function psparse_consistent_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D}, map_global_to_ghost!(I_rcv_ghost,rows_co) map_global_to_own!(J_rcv_own,cols_co) map_global_to_ghost!(J_rcv_ghost,cols_co) - I2,J2,V2 = findnz(A.blocks.own_ghost) - map_ghost_to_global!(J2,cols_fa) - map_global_to_ghost!(J2,cols_co) - n_own_rows = own_length(rows_co) n_ghost_rows = ghost_length(rows_co) n_own_cols = own_length(cols_co) n_ghost_cols = ghost_length(cols_co) TA = typeof(A.blocks.ghost_own) own_own = A.blocks.own_own - own_ghost = compresscoo(TA,I2,J2,V2,n_own_rows,n_ghost_cols) # TODO this can be improved + # New own_ghost shares as much memory with existing own_ghost block as possible. Extent depends on sparse format in use. + own_ghost = expand_sparse_matrix_columns(A.blocks.own_ghost,n_ghost_cols) ghost_own = compresscoo(TA,I_rcv_own,J_rcv_own,V_rcv_own,n_ghost_rows,n_own_cols) ghost_ghost = compresscoo(TA,I_rcv_ghost,J_rcv_ghost,V_rcv_ghost,n_ghost_rows,n_ghost_cols) K_own = precompute_nzindex(ghost_own,I_rcv_own,J_rcv_own) @@ -1767,7 +1764,8 @@ function psparse_consistent_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D}, cols_fa = partition(axes(A,2)) # snd and rcv are swapped on purpose parts_rcv,parts_snd = assembly_neighbors(rows_co) - lids_rcv,lids_snd = assembly_local_indices(rows_co) + # assembly_neighbors is called again in assembly_local_indices? 
+ lids_rcv,lids_snd = assembly_local_indices(rows_co,parts_rcv,parts_snd) cache_snd = map(consistent_setup_snd,partition(A),parts_snd,lids_snd,rows_co,cols_fa) I_snd = map(i->i.I_snd,cache_snd) J_snd = map(i->i.J_snd,cache_snd) @@ -2021,6 +2019,19 @@ function sparse_diag_matrix(d::PVector,shape) psparse(I,J,V,row_partition,col_partition;assembled=true) |> fetch end +# Version of sparse_diag_matrix for preserving local matrix type T (when default CSC is not wanted) +function sparse_diag_matrix(::Type{T},d::PVector,shape) where T + row_partition,col_partition = map(partition,shape) + function setup(own_d,rows,cols) + I = own_to_global(rows) |> collect + J = own_to_global(cols) |> collect + V = own_d + I,J,V + end + I,J,V = map(setup,own_values(d),row_partition,col_partition) |> tuple_of_arrays + psparse(T,I,J,V,row_partition,col_partition;assembled=true) |> fetch +end + function rap(R,A,P;reuse=Val(false)) Ac = R*A*P if val_parameter(reuse) @@ -2126,6 +2137,15 @@ function Base.:-(I::LinearAlgebra.UniformScaling,A::PSparseMatrix) D-A end +# Version of I-A for preserving local matrix type T (when default CSC is not wanted) +function Base.:-(T,I::LinearAlgebra.UniformScaling,A::PSparseMatrix) + Tv = eltype(A) + row_partition = partition(axes(A,1)) + d = pones(Tv,row_partition) + D = PartitionedArrays.sparse_diag_matrix(T,d,axes(A)) + D-A +end + Base.similar(a::PSparseMatrix) = similar(a,eltype(a)) function Base.similar(a::PSparseMatrix,::Type{T}) where T matrix_partition = map(partition(a)) do values @@ -2212,7 +2232,7 @@ function repartition(A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) end end -function repartition(sparse,A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) +function repartition(::Type{T},A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) where T @assert A.assembled "repartition on a sub-assembled matrix not implemented yet" function prepare_triplets(A_own_own,A_own_ghost,A_rows,A_cols) I1,J1,V1 = findnz(A_own_own) @@ -2232,7 +2252,7 @@ 
function repartition(sparse,A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) A_cols = partition(axes(A,2)) I,J,V = map(prepare_triplets,A_own_own,A_own_ghost,A_rows,A_cols) |> tuple_of_arrays # TODO this one does not preserve the local storage layout of A - t = psparse(sparse,I,J,V,new_rows,new_cols;reuse=true) + t = psparse(T,I,J,V,new_rows,new_cols;reuse=true) @fake_async begin B,cacheB = fetch(t) if val_parameter(reuse) == false @@ -2314,12 +2334,12 @@ function centralize(A::PSparseMatrix) own_own_values(a_in_main) |> multicast |> getany end -function centralize(sparse,A::PSparseMatrix) +function centralize(::Type{T},A::PSparseMatrix) where T m,n = size(A) ranks = linear_indices(partition(A)) rows_trivial = trivial_partition(ranks,m) cols_trivial = trivial_partition(ranks,n) - a_in_main = repartition(sparse,A,rows_trivial,cols_trivial) |> fetch + a_in_main = repartition(T,A,rows_trivial,cols_trivial) |> fetch own_own_values(a_in_main) |> multicast |> getany end diff --git a/src/sparse_utils.jl b/src/sparse_utils.jl index 85bbf01a..57b3b57f 100644 --- a/src/sparse_utils.jl +++ b/src/sparse_utils.jl @@ -680,3 +680,20 @@ function spmv_csc!(b,x,colptr_A,rowval_A,nzval_A) b end +function expand_sparse_matrix_columns(A::SparseMatrixCSR{Bi,Ti,Tv}, n) where {Bi,Tv,Ti} + p,q = size(A) + @assert n >= q + SparseMatrixCSR{Bi,Ti,Tv}(p,n,A.rowptr,A.colval,A.nzval) +end + +function expand_sparse_matrix_columns(A::SparseMatrixCSC{Ti,Tv}, n) where {Tv,Ti} + p,q = size(A) + @assert n >= q + new_colptr = similar(A.colptr,n+1) + map!(identity,new_colptr,A.colptr) + last_index = A.colptr[end] + foreach(q+1:n+1) do i + new_colptr[i] = last_index + end + SparseMatrixCSC{Ti,Tv}(p,n,new_colptr,A.rowval,A.nzval) +end \ No newline at end of file diff --git a/times.txt b/times.txt index 24e91690..95ffa1dd 100644 --- a/times.txt +++ b/times.txt @@ -1 +1 @@ -Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2146177, max = 0.2146177, avg = 
0.2146177), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4019553, max = 0.4019553, avg = 0.4019553), "Phase 1" => (min = 1.4e-6, max = 1.4e-6, avg = 1.4e-6)) +Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2018849, max = 0.2018849, avg = 0.2018849), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4173665, max = 0.4173665, avg = 0.4173665), "Phase 1" => (min = 1.2e-6, max = 1.2e-6, avg = 1.2e-6)) From 47dea153d449ef3f859fe224236efe9ffcf96c21 Mon Sep 17 00:00:00 2001 From: jop611 Date: Mon, 21 Oct 2024 10:50:52 +0200 Subject: [PATCH 15/34] ? --- src/gallery.jl | 167 +------------------------------------------------ 1 file changed, 1 insertion(+), 166 deletions(-) diff --git a/src/gallery.jl b/src/gallery.jl index 8023c1f5..f9667f0f 100644 --- a/src/gallery.jl +++ b/src/gallery.jl @@ -550,169 +550,4 @@ function nullspace_linear_elasticity!(B,x) error("case not implemented") end B -end - - -function prolongator(T, - nodes_per_dir, - parts_per_dir, - parts; - index_type::Type{Ti} = Int64, - value_type::Type{Tv} = Float64) where {Ti,Tv} - # Improved version of aggregate function not using inefficient direct JaggedRange indexing causing many view allocations - # Also uses a generic function to obtain pointer arrays and index arrays to work with both CSC and CSR. - # If the problem is symmetric neighbours dont change by this. 
- function aggregate(A,diagA=dense_diag(A);epsilon=0) - # This one is algorithm 5.1 from - # "Algebraic multigrid by smoothed aggregation for second and fourth order elliptic problems" - epsi = epsilon - typeof_strength = eltype(A.nzval) - - nnodes = size(A,1) - pending = Ti(0) - isolated = Ti(-1) - - node_to_aggregate = fill(pending,nnodes) - node_to_old_aggregate = similar(node_to_aggregate) - - node_to_neigs = jagged_array(index_array(A),pointer_array(A)) - neigs = node_to_neigs.data - node_to_vals = jagged_array(A.nzval,pointer_array(A)) - vals = node_to_vals.data - strongly_connected = (node,ineig) -> begin - neig = neigs[ineig] - aii = diagA[node] - ajj = diagA[neig] - aij = vals[ineig] - abs(aij) > epsi*sqrt(aii*ajj) - end - coupling_strength = (node,ineig) -> begin - abs(vals[ineig]) - end - - # Initialization - for node in 1:nnodes - neig_range = jagged_range(node_to_neigs,node) - isolated_node = count(i->neigs[i]!=node,neig_range) == 0 - if isolated_node - node_to_aggregate[node] = isolated - end - end - # Step 1 - aggregate = Ti(0) - for node in 1:nnodes - if node_to_aggregate[node] != pending - continue - end - neig_range = jagged_range(node_to_neigs,node) - all_pending = true - for ineig in neig_range - neig = neigs[ineig] - if neig == node || !strongly_connected(node,ineig) - continue - end - all_pending &= (node_to_aggregate[neig] == pending) - end - if !all_pending - continue - end - aggregate += Ti(1) - node_to_aggregate[node] = aggregate - for ineig in neig_range - neig = neigs[ineig] - if neig == node || !strongly_connected(node,ineig) - continue - end - node_to_aggregate[neig] = aggregate - end - end - # Step 2 - copy!(node_to_old_aggregate,node_to_aggregate) - for node in 1:nnodes - if node_to_aggregate[node] != pending - continue - end - strength = zero(typeof_strength) - neig_range = jagged_range(node_to_neigs, node) - for ineig in neig_range - neig = neigs[ineig] - if neig == node || !strongly_connected(node,ineig) - continue - end - 
neig_aggregate = node_to_old_aggregate[neig] - if neig_aggregate != pending && neig_aggregate != isolated - neig_strength = coupling_strength(node,ineig) - if neig_strength > strength - strength = neig_strength - node_to_aggregate[node] = neig_aggregate - end - end - end - end - - # Step 3 - for node in 1:nnodes - if node_to_aggregate[node] != pending - continue - end - aggregate += Ti(1) - node_to_aggregate[node] = aggregate - # neigs = node_to_neigs[node] - neig_range = jagged_range(node_to_neigs, node) - for ineig in neig_range - neig = neigs[ineig] - if neig == node || !strongly_connected(node,ineig) - continue - end - neig_aggregate = node_to_old_aggregate[neig] - if neig_aggregate == pending || neig_aggregate == isolated - node_to_aggregate[neig] = aggregate - end - end - end - naggregates = aggregate - - if nnodes == 1 - node_to_aggregate .= 1 - naggregates = 1 - end - node_to_aggregate, 1:naggregates - end - - function aggregate(A::PSparseMatrix,diagA=dense_diag(A);kwargs...) - # This is the vanilla "uncoupled" strategy from "Parallel Smoothed Aggregation Multigrid : Aggregation Strategies on Massively Parallel Machines" - # TODO: implement other more advanced strategies - @assert A.assembled - node_to_aggregate_data, local_ranges = map((A,diagA)->aggregate(A,diagA;kwargs...),own_own_values(A),own_values(diagA)) |> tuple_of_arrays - nown = map(length,local_ranges) - n_aggregates = sum(nown) - nparts = length(nown) - aggregate_partition = variable_partition(nown,n_aggregates) - node_partition = partition(axes(A,1)) - map(map_own_to_global!,node_to_aggregate_data,aggregate_partition) - node_to_aggregate = PVector(node_to_aggregate_data,node_partition) - node_to_aggregate, PRange(aggregate_partition) - end - - function constant_prolongator(T,node_to_aggregate::PVector,aggregates::PRange,n_nullspace_vecs) - if n_nullspace_vecs != 1 - error("case not implemented yet") - end - function setup_triplets(node_to_aggregate,nodes) - myI = 
UnitRange{Ti}(1:local_length(nodes)) - myJ = node_to_aggregate - myV = ones(length(node_to_aggregate)) - (myI,myJ,myV) - end - node_partition = partition(axes(node_to_aggregate,1)) - I,J,V = map(setup_triplets,partition(node_to_aggregate),node_partition) |> tuple_of_arrays - aggregate_partition = partition(aggregates) - J_owner = find_owner(aggregate_partition,J) - aggregate_partition = map(union_ghost,aggregate_partition,J,J_owner) - map(map_global_to_local!,J,aggregate_partition) - P0 = psparse(T,I,J,V,node_partition,aggregate_partition;assembled=true,indices=:local) |> fetch - P0 - end -end - - +end \ No newline at end of file From 6079cf7fc9eb1de9f44b394362c0816dd33c3d94 Mon Sep 17 00:00:00 2001 From: jop611 Date: Mon, 21 Oct 2024 11:28:29 +0200 Subject: [PATCH 16/34] added some function with spare matrix construct function passed --- src/p_sparse_matrix.jl | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index f579b4fd..fcb2cbe6 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -2264,6 +2264,37 @@ function repartition(::Type{T},A::PSparseMatrix,new_rows,new_cols;reuse=Val(fals end end +function repartition(sparse,A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) where T + @assert A.assembled "repartition on a sub-assembled matrix not implemented yet" + function prepare_triplets(A_own_own,A_own_ghost,A_rows,A_cols) + I1,J1,V1 = findnz(A_own_own) + I2,J2,V2 = findnz(A_own_ghost) + map_own_to_global!(I1,A_rows) + map_own_to_global!(I2,A_rows) + map_own_to_global!(J1,A_cols) + map_ghost_to_global!(J2,A_cols) + I = vcat(I1,I2) + J = vcat(J1,J2) + V = vcat(V1,V2) + (I,J,V) + end + A_own_own = own_own_values(A) + A_own_ghost = own_ghost_values(A) + A_rows = partition(axes(A,1)) + A_cols = partition(axes(A,2)) + I,J,V = map(prepare_triplets,A_own_own,A_own_ghost,A_rows,A_cols) |> tuple_of_arrays + t = psparse(sparse,I,J,V,new_rows,new_cols;reuse=true) + 
@fake_async begin + B,cacheB = fetch(t) + if val_parameter(reuse) == false + B + else + cache = (V,cacheB) + B, cache + end + end +end + """ repartition!(B::PSparseMatrix,A::PSparseMatrix,cache) @@ -2343,6 +2374,15 @@ function centralize(::Type{T},A::PSparseMatrix) where T own_own_values(a_in_main) |> multicast |> getany end +function centralize(sparse,A::PSparseMatrix) where T + m,n = size(A) + ranks = linear_indices(partition(A)) + rows_trivial = trivial_partition(ranks,m) + cols_trivial = trivial_partition(ranks,n) + a_in_main = repartition(sparse,A,rows_trivial,cols_trivial) |> fetch + own_own_values(a_in_main) |> multicast |> getany +end + """ psystem(I,J,V,I2,V2,rows,cols;kwargs...) """ From 82c17a9a4ebb8fe8138a3a636cae12e597e296fb Mon Sep 17 00:00:00 2001 From: jop611 Date: Mon, 21 Oct 2024 11:30:21 +0200 Subject: [PATCH 17/34] fixed problem related to previous commit --- src/p_sparse_matrix.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index fcb2cbe6..b69a3456 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -2264,7 +2264,7 @@ function repartition(::Type{T},A::PSparseMatrix,new_rows,new_cols;reuse=Val(fals end end -function repartition(sparse,A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) where T +function repartition(sparse,A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) @assert A.assembled "repartition on a sub-assembled matrix not implemented yet" function prepare_triplets(A_own_own,A_own_ghost,A_rows,A_cols) I1,J1,V1 = findnz(A_own_own) @@ -2374,7 +2374,7 @@ function centralize(::Type{T},A::PSparseMatrix) where T own_own_values(a_in_main) |> multicast |> getany end -function centralize(sparse,A::PSparseMatrix) where T +function centralize(sparse,A::PSparseMatrix) m,n = size(A) ranks = linear_indices(partition(A)) rows_trivial = trivial_partition(ranks,m) From 5afff2ea30f3dc7333e9c53c1624f3ce65b98156 Mon Sep 17 00:00:00 2001 From: jop611 Date: Mon, 21 
Oct 2024 11:35:51 +0200 Subject: [PATCH 18/34] fixed bug in expand_sparse_matrix functions --- src/sparse_utils.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/sparse_utils.jl b/src/sparse_utils.jl index 57b3b57f..4f95f6eb 100644 --- a/src/sparse_utils.jl +++ b/src/sparse_utils.jl @@ -680,13 +680,13 @@ function spmv_csc!(b,x,colptr_A,rowval_A,nzval_A) b end -function expand_sparse_matrix_columns(A::SparseMatrixCSR{Bi,Ti,Tv}, n) where {Bi,Tv,Ti} +function expand_sparse_matrix_columns(A::SparseMatrixCSR{Bi,Tv,Ti}, n) where {Bi,Tv,Ti} p,q = size(A) @assert n >= q - SparseMatrixCSR{Bi,Ti,Tv}(p,n,A.rowptr,A.colval,A.nzval) + SparseMatrixCSR{Bi,Tv,Ti}(p,n,A.rowptr,A.colval,A.nzval) end -function expand_sparse_matrix_columns(A::SparseMatrixCSC{Ti,Tv}, n) where {Tv,Ti} +function expand_sparse_matrix_columns(A::SparseMatrixCSC{Tv,Ti}, n) where {Tv,Ti} p,q = size(A) @assert n >= q new_colptr = similar(A.colptr,n+1) @@ -695,5 +695,5 @@ function expand_sparse_matrix_columns(A::SparseMatrixCSC{Ti,Tv}, n) where {Tv,Ti foreach(q+1:n+1) do i new_colptr[i] = last_index end - SparseMatrixCSC{Ti,Tv}(p,n,new_colptr,A.rowval,A.nzval) + SparseMatrixCSC{Tv,Ti}(p,n,new_colptr,A.rowval,A.nzval) end \ No newline at end of file From cb3d40ad64635701f89a0ed66167733f9bd6035b Mon Sep 17 00:00:00 2001 From: jop611 Date: Mon, 21 Oct 2024 11:40:41 +0200 Subject: [PATCH 19/34] fixed another issue with expand_sparse_matrix_columns --- src/sparse_utils.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sparse_utils.jl b/src/sparse_utils.jl index 4f95f6eb..7b0d2b40 100644 --- a/src/sparse_utils.jl +++ b/src/sparse_utils.jl @@ -680,10 +680,10 @@ function spmv_csc!(b,x,colptr_A,rowval_A,nzval_A) b end -function expand_sparse_matrix_columns(A::SparseMatrixCSR{Bi,Tv,Ti}, n) where {Bi,Tv,Ti} +function expand_sparse_matrix_columns(A::SparseMatrixCSR{Bi,Tv,Ti} where {Tv, Ti}, n) where Bi p,q = size(A) @assert n >= q - 
SparseMatrixCSR{Bi,Tv,Ti}(p,n,A.rowptr,A.colval,A.nzval) + SparseMatrixCSR{Bi}(p,n,A.rowptr,A.colval,A.nzval) end function expand_sparse_matrix_columns(A::SparseMatrixCSC{Tv,Ti}, n) where {Tv,Ti} From da64ff00dcda7374c1bd02322522a2b38007e34d Mon Sep 17 00:00:00 2001 From: jop611 Date: Tue, 22 Oct 2024 18:39:57 +0200 Subject: [PATCH 20/34] helper function for thesis contributions added --- src/p_sparse_matrix.jl | 2 +- src/sparse_utils.jl | 227 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 227 insertions(+), 2 deletions(-) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index b69a3456..22d56f3b 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -2142,7 +2142,7 @@ function Base.:-(T,I::LinearAlgebra.UniformScaling,A::PSparseMatrix) Tv = eltype(A) row_partition = partition(axes(A,1)) d = pones(Tv,row_partition) - D = PartitionedArrays.sparse_diag_matrix(T,d,axes(A)) + D = sparse_diag_matrix(T,d,axes(A)) D-A end diff --git a/src/sparse_utils.jl b/src/sparse_utils.jl index 7b0d2b40..83f7f2a8 100644 --- a/src/sparse_utils.jl +++ b/src/sparse_utils.jl @@ -431,6 +431,40 @@ end # A #end +# Variants for findnz() that only allocates memory for the conversion of the pointer array to an index array. +function findnz_minimal(A::SparseMatrixCSC) + J = ptr_to_coo(A.colptr) + rowvals(A),J,nonzeros(A) +end +function findnz_minimal(A::SparseMatrixCSR) + I = ptr_to_coo(A.rowptr) + I,colvals(A),nonzeros(A) +end + +# Behaves like findnz, but without copying the values. +function find_indices(A::SparseMatrixCSC) + I,J,_ = findnz_minimal(A) + copy(I),J +end +function find_indices(A::SparseMatrixCSR) + I,J,_ = findnz_minimal(A) + I,copy(J) +end + +# Could be optimized by a two-way merge-like method when A is a guaranteed submatrix of C. 
+function precompute_nzindex(C::AbstractSparseArray,A::AbstractSparseArray) + I,J,_ = findnz_minimal(A) + K = similar(I) + K .= 0 + for (p,(i,j)) in enumerate(zip(I,J)) + if i < 1 || j < 1 + continue + end + K[p] = nzindex(C,i,j) + end + K +end + function precompute_nzindex(A,I,J) K = zeros(Int32,length(I)) for (p,(i,j)) in enumerate(zip(I,J)) @@ -442,6 +476,17 @@ function precompute_nzindex(A,I,J) K end +# Reuse I vector as K vector. +function precompute_nzindex!(I,A,J) + for (p,(i,j)) in enumerate(zip(I,J)) + if i < 1 || j < 1 + continue + end + I[p] = nzindex(A,i,j) + end + I +end + function sparse_matrix!(A,V,K;reset=true) if reset LinearAlgebra.fillstored!(A,0) @@ -459,7 +504,7 @@ end # Notation # csrr: csr with repeated and unsorted columns -# csru: csr witu unsorted columns +# csru: csr with unsorted columns # csc: csc with sorted columns struct SparseMatrixCSRR{Tv,Ti,A} @@ -696,4 +741,184 @@ function expand_sparse_matrix_columns(A::SparseMatrixCSC{Tv,Ti}, n) where {Tv,Ti new_colptr[i] = last_index end SparseMatrixCSC{Tv,Ti}(p,n,new_colptr,A.rowval,A.nzval) +end + +function Base.similar(A::SparseMatrixCSR{Bi}, m::Integer, n::Integer) where Bi + SparseMatrixCSR{1}(m, n, ones(eltype(A.rowptr), m+1), eltype(A.colval)[], eltype(A.nzval)[]) +end + +function Base.similar(A::SparseMatrixCSR{Bi}) where Bi + SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(colvals(A)), similar(nonzeros(A))) +end + +function Base.copy(A::SparseMatrixCSR{Bi}) where Bi + SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(colvals(A)), copy(nonzeros(A))) +end + +function Base.copy(At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}) where {Bi,Tv,Ti} + A = At.parent + p,q = size(A) + Acsc = ascsc(A) + Acsc_T = copy(transpose(Acsc)) # materialize SparseMAtrixCSC transpose + SparseMatrixCSR{Bi}(q, p, Acsc_T.colptr, rowvals(Acsc_T), nonzeros(Acsc_T)) +end + +function SparseMatricesCSR.sparsecsr(A::SparseMatrixCSC) + sparsecsr(findnz(A)..., size(A)...) 
+end + +function SparseMatricesCSR.sparsecsr(At::Transpose) + transpose(sparsecsr(At.parent)) +end + +function SparseMatricesCSR.sparsecsr(A::SparseMatrixCSR) + A +end + +function SparseMatricesCSR.sparsecsr(T::Type, A::SparseMatrixCSC) + compresscoo(T,findnz(A)..., size(A)...) +end + + +function pointer_array(A::SparseMatrixCSR) + A.rowptr +end + +function pointer_array(A::SparseMatrixCSC) + A.colptr +end + +function index_array(A::SparseMatrixCSR) + colvals(A) +end + +function index_array(A::SparseMatrixCSC) + rowvals(A) +end + +function ptr_to_coo(ptr_array) + K = zeros(Int32, (ptr_array[end]-1)) + for i in 1:(length(ptr_array)-1) + for p in ptr_array[i]:ptr_array[i+1]-1 + K[p] = i + end + end + K +end + +function find_max_row_length(A::SparseMatrixCSR) + max_rA = 0 + for i in 1:size(A,1) + l = length(nzrange(A,i)) + max_rA = max_rA > l ? max_rA : l + end + max_rA +end + +function find_max_col_length(A::SparseMatrixCSC) + max_cA = 0 + for j in 1:size(A,2) + l = length(nzrange(A,j)) + max_cA = max_cA > l ? max_cA : l + end + max_cA +end + +# Lazily convert CSC matrix to CSR matrix, by interpreting columnpointers as row pointers, and colvals as rowvals, +# effectively transposing it in the process. +function ascsr(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} + p,q = size(A) + SparseMatrixCSR{1}(q,p,A.colptr,rowvals(A),nonzeros(A)) +end + +# Lazily convert CSR matrix to CSC matrix, by interpreting rowpointers as column pointers, and rowvals as colvals, +# effectively transposing it in the process. 
+function ascsc(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} + p,q = size(A) + SparseMatrixCSC{Tv,Ti}(q,p,A.rowptr,colvals(A),nonzeros(A)) +end + + +function halfperm(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} + q = size(A,2) + JA,VA = colvals(A),nonzeros(A) + IAt,JAt,VAt = similar(A.rowptr,q+1),similar(JA),similar(VA) + halfperm!(IAt,JAt,VAt,A) +end + +# transpose A into At using vectors IAt,JAt, and VAt +function halfperm!(IAt,JAt,VAt,A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} + JA,VA = colvals(A),nonzeros(A) + p,q = size(A) + count_occurrences!(IAt,JA) + counts_to_ptrs!(IAt) + shift_by_one!(IAt) + for i in 1:p + for jp in nzrange(A,i) + j = JA[jp] + jpt = IAt[j+1] + JAt[jpt] = i + VAt[jpt] = VA[jp] + IAt[j+1] = jpt+1 + end + end + IAt[1] = 1 + SparseMatrixCSR{Bi}(q,p,IAt,JAt,VAt) +end + +# retranspose At back into A +function halfperm!(A::SparseMatrixCSR{Bi,Tv,Ti},At::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} + IA,JA,VA = A.rowptr,colvals(A),nonzeros(A) + JAt,VAt = colvals(At),nonzeros(At) + p,q = size(At) + shift_by_one!(IA) # pointer to row 1 must be located at IA[2], row 2 at IA[3] etc. + IA[1] = 1 + for i in 1:p + for jpt in nzrange(At,i) + j = JAt[jpt] + jp = IA[j+1] + JA[jp] = i + VA[jp] = VAt[jpt] + IA[j+1] = jp+1 + end + end + At +end + +function halfperm!(A::SparseMatrixCSC,At::SparseMatrixCSC) + halfperm!(ascsr(A),ascsr(At)) + A +end + +function halfperm(A::SparseMatrixCSC) + At = halfperm(ascsr(A)) + ascsc(At) +end + +function count_occurrences!(v1::AbstractVector{<:Integer},v2::AbstractVector{<:Integer};set_zero=true) + if set_zero + v1 .= 0 + end + foreach(i->v1[i]+=1,v2) + v1 +end + +# shift all entries one element to the right in-place. Not circular. 
+function shift_by_one!(v) + l = length(v) + prev = v[1] + tmp = prev + for i in 1:l-1 + tmp = v[i+1] + v[i+1] = prev + prev = tmp + end +end + +function counts_to_ptrs!(v) + l = length(v) + v[1] += 1 + foreach(i->v[i]+=v[i-1],2:l) + shift_by_one!(v) + v[1] = 1 end \ No newline at end of file From 2f2e927049868e0fcc6e11c854612548dd7cdc72 Mon Sep 17 00:00:00 2001 From: jop611 Date: Tue, 22 Oct 2024 19:01:30 +0200 Subject: [PATCH 21/34] removed copy function for SparseCSR, as it is implemented by SparseMatricesCSR now. --- src/sparse_utils.jl | 19 ++++++++++--------- times.txt | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/sparse_utils.jl b/src/sparse_utils.jl index 83f7f2a8..48172218 100644 --- a/src/sparse_utils.jl +++ b/src/sparse_utils.jl @@ -743,24 +743,25 @@ function expand_sparse_matrix_columns(A::SparseMatrixCSC{Tv,Ti}, n) where {Tv,Ti SparseMatrixCSC{Tv,Ti}(p,n,new_colptr,A.rowval,A.nzval) end +# Currently not implemented by the SparseMatricesCSR module function Base.similar(A::SparseMatrixCSR{Bi}, m::Integer, n::Integer) where Bi SparseMatrixCSR{1}(m, n, ones(eltype(A.rowptr), m+1), eltype(A.colval)[], eltype(A.nzval)[]) end +# Currently not implemented by the SparseMatricesCSR module function Base.similar(A::SparseMatrixCSR{Bi}) where Bi SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(colvals(A)), similar(nonzeros(A))) end -function Base.copy(A::SparseMatrixCSR{Bi}) where Bi - SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(colvals(A)), copy(nonzeros(A))) -end +# This method is implemented also by SparseMatricesCSR, but related methods aren't. 
+# function Base.copy(A::SparseMatrixCSR{Bi}) where Bi +# SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(colvals(A)), copy(nonzeros(A))) +# end -function Base.copy(At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}) where {Bi,Tv,Ti} - A = At.parent - p,q = size(A) - Acsc = ascsc(A) - Acsc_T = copy(transpose(Acsc)) # materialize SparseMAtrixCSC transpose - SparseMatrixCSR{Bi}(q, p, Acsc_T.colptr, rowvals(Acsc_T), nonzeros(Acsc_T)) +# Currently not implemented by the SparseMatricesCSR module +function Base.copy(At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}} where {Bi,Tv,Ti}) + Acsc_T = copy(transpose(ascsc(At.parent))) # materialize SparseMatrixCSC transpose + ascsr(Acsc_T) end function SparseMatricesCSR.sparsecsr(A::SparseMatrixCSC) diff --git a/times.txt b/times.txt index 95ffa1dd..53764f74 100644 --- a/times.txt +++ b/times.txt @@ -1 +1 @@ -Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2018849, max = 0.2018849, avg = 0.2018849), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4173665, max = 0.4173665, avg = 0.4173665), "Phase 1" => (min = 1.2e-6, max = 1.2e-6, avg = 1.2e-6)) +Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2068526, max = 0.2068526, avg = 0.2068526), "Phase 3" => (min = 0.0, max = 0.0, avg = 0.0), "Matrix Assembly" => (min = 0.4054487, max = 0.4054487, avg = 0.4054487), "Phase 1" => (min = 1.1e-6, max = 1.1e-6, avg = 1.1e-6)) From cabad59933a45aed8d70c931d64c7624147f05fc Mon Sep 17 00:00:00 2001 From: jop611 Date: Fri, 24 Jan 2025 23:30:57 +0100 Subject: [PATCH 22/34] Added new distributed SpMM, SpMtM, SPMMM and SpMtMM algorithms with latency hiding to PartitionedArrays. Additional tests for these functions have been added. Sparse utils has been extended with some new functions required by SpMM, etc. Serial kernels for the methods are provided in a new file 'sequential implementations'. 
--- src/PartitionedArrays.jl | 13 + src/p_sparse_matrix.jl | 551 +++++++++- src/sequential_implementations.jl | 1672 +++++++++++++++++++++++++++++ src/sparse_utils.jl | 104 +- test/debug_array/runtests.jl | 2 + test/debug_array/spmtmm_tests.jl | 14 + test/mpi_array/runtests.jl | 1 + test/mpi_array/spmtmm_tests.jl | 4 + test/spmtmm_tests.jl | 216 ++++ times.txt | 2 +- 10 files changed, 2479 insertions(+), 100 deletions(-) create mode 100644 src/sequential_implementations.jl create mode 100644 test/debug_array/spmtmm_tests.jl create mode 100644 test/mpi_array/spmtmm_tests.jl create mode 100644 test/spmtmm_tests.jl diff --git a/src/PartitionedArrays.jl b/src/PartitionedArrays.jl index 317c1c2a..8505d709 100644 --- a/src/PartitionedArrays.jl +++ b/src/PartitionedArrays.jl @@ -170,9 +170,17 @@ export spmv! export spmtv! export spmm export spmm! +export spmmm +export spmmm! export spmtm export spmtm! +export spmtmm +export spmtmm! export centralize +export explicit_transpose +export explicit_transpose! +export add +export add! include("p_sparse_matrix.jl") export BRange @@ -196,4 +204,9 @@ export nullspace_linear_elasticity! export near_nullspace_linear_elasticity include("gallery.jl") +export RAP +export RAP! 
+export -,+ +include("sequential_implementations.jl") + end # module diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index c95e9bf9..e663b841 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -2262,21 +2262,56 @@ function sparse_diag_matrix(::Type{T},d::PVector,shape) where T psparse(T,I,J,V,row_partition,col_partition;assembled=true) |> fetch end -function rap(R,A,P;reuse=Val(false)) - Ac = R*A*P +### OLD ### +# function rap(R,A,P;reuse=Val(false)) +# Ac = R*A*P +# if val_parameter(reuse) +# return Ac, nothing +# end +# Ac +# end + +### NEW ### +function rap(R::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix;reuse=Val(false)) + Ac, cache = spmmm(R,A,P) if val_parameter(reuse) - return Ac, nothing + return Ac, cache end Ac end -function rap!(Ac,R,A,P,cache) - # TODO improve performance - tmp = R*A*P - copyto!(Ac,tmp) +### OLD ### +# function rap!(Ac,R,A,P,cache) +# # TODO improve performance +# tmp = R*A*P +# copyto!(Ac,tmp) +# Ac +# end + +### NEW ### +function rap!(Ac::PSparseMatrix,R::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix,cache) + spmmm!(Ac,R,A,P,cache) Ac end +### NEW ### +function rap(Pt::Transpose{Tv,<:PSparseMatrix} where Tv, A::PSparseMatrix,P::PSparseMatrix;reuse=Val(false)) + spmtmm(Pt.parent,A,P;reuse=reuse) +end + +function rap!(Ac::PSparseMatrix,Pt::Transpose{Tv,<:PSparseMatrix} where Tv, A::PSparseMatrix,P::PSparseMatrix,cache) + spmtmm!(Ac,Pt.parent,A,P,cache) +end + +function rap(A::PSparseMatrix,P::PSparseMatrix;reuse=Val(false)) + spmtmm(P,A,P;reuse=reuse) +end + +function rap!(Ac::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix,cache) + spmtmm!(Ac,A,P,cache) +end +### End NEW ### + function spmm(A,B;reuse=Val(false)) C = A*B if val_parameter(reuse) @@ -2290,28 +2325,82 @@ function spmm!(C,A,B,state) C end +### OLD ### +# function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) +# # TODO latency hiding +# @assert A.assembled +# @assert B.assembled +# col_partition = partition(axes(A,2)) +# C,cacheC 
= consistent(B,col_partition;reuse=true) |> fetch +# D_partition,cacheD = map((args...)->spmm(args...;reuse=true),partition(A),partition(C)) |> tuple_of_arrays +# assembled = true +# D = PSparseMatrix(D_partition,partition(axes(A,1)),partition(axes(C,2)),assembled) +# if val_parameter(reuse) +# cache = (C,cacheC,cacheD) +# return D,cache +# end +# D +# end + +# function spmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) +# (C,cacheC,cacheD)= cache +# consistent!(C,B,cacheC) |> wait +# map(spmm!,partition(D),partition(A),partition(C),cacheD) +# D +# end + +### NEW ### function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) - # TODO latency hiding - @assert A.assembled - @assert B.assembled - col_partition = partition(axes(A,2)) - C,cacheC = consistent(B,col_partition;reuse=true) |> fetch - D_partition,cacheD = map((args...)->spmm(args...;reuse=true),partition(A),partition(C)) |> tuple_of_arrays - assembled = true - D = PSparseMatrix(D_partition,partition(axes(A,1)),partition(axes(C,2)),assembled) + t = consistent(B,partition(axes(A,2)),reuse=true) + A_own_own = own_own_values(A) + A_own_ghost = own_ghost_values(A) + + C_own_own_1 = map(*,A_own_own,own_own_values(B)) + + # Wait for consistent + B2, cacheB2 = fetch(t) + C_own_ghost_1 = map(*,A_own_own,own_ghost_values(B2)) + C_own_own_2 = map(*,A_own_ghost,ghost_own_values(B2)) + C_own_ghost_2 = map(*,A_own_ghost,ghost_ghost_values(B2)) + + C_own_own = map(+, C_own_own_1, C_own_own_2) + C_own_ghost = map(+, C_own_ghost_1, C_own_ghost_2) + + Coo_cache = map(construct_spmm_cache, C_own_own) + Cog_cache = map(construct_spmm_cache, C_own_ghost) + + C_values = map(C_own_own,C_own_ghost,partition(A),partition(B2)) do own_own,own_ghost,A_part,B_part + ghost_own = similar(own_own,0,size(own_own,2)) + ghost_ghost = similar(own_own,0,size(own_ghost,2)) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + split_matrix(blocks,A_part.row_permutation,B_part.col_permutation) + end + + C = 
PSparseMatrix(C_values,partition(axes(A,1)),partition(axes(B2,2)),true) if val_parameter(reuse) - cache = (C,cacheC,cacheD) - return D,cache + cache = (B2,cacheB2,(Coo_cache,Cog_cache)) + return C,cache end - D + C end -function spmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) - (C,cacheC,cacheD)= cache - consistent!(C,B,cacheC) |> wait - map(spmm!,partition(D),partition(A),partition(C),cacheD) - D +function spmm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) + (B2,cacheB2,(Coo_cache,Cog_cache)) = cache + t = consistent!(B2,B,cacheB2) + A_own_own = own_own_values(A) + A_own_ghost = own_ghost_values(A) + C_own_own = own_own_values(C) + C_own_ghost = own_ghost_values(C) + + map(mul!, C_own_own, A_own_own, own_own_values(B),Coo_cache) + wait(t) + map(mul!, C_own_ghost, A_own_own, own_ghost_values(B2),Cog_cache) + + map((C,A,B,cache) -> mul!(C,A,B,1,1,cache), C_own_own,A_own_ghost,ghost_own_values(B2),Coo_cache) + map((C,A,B,cache) -> mul!(C,A,B,1,1,cache), C_own_ghost,A_own_ghost,ghost_ghost_values(B2),Cog_cache) + C end +### End NEW ### function spmtm(A,B;reuse=Val(false)) C = transpose(A)*B @@ -2326,27 +2415,99 @@ function spmtm!(C,A,B,cache) C end +### OLD ### +# function spmtm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) +# # TODO latency hiding +# @assert A.assembled +# @assert B.assembled +# D_partition,cacheD = map((args...)->spmtm(args...;reuse=true),partition(A),partition(B)) |> tuple_of_arrays +# assembled = false +# D = PSparseMatrix(D_partition,partition(axes(A,2)),partition(axes(B,2)),assembled) +# C,cacheC = assemble(D;reuse=true) |> fetch +# if val_parameter(reuse) +# cache = (D,cacheC,cacheD) +# return C,cache +# end +# C +# end + +# function spmtm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) +# (D,cacheC,cacheD)= cache +# map(spmtm!,partition(D),partition(A),partition(B),cacheD) +# assemble!(C,D,cacheC) |> wait +# C +# end + +### NEW ### function spmtm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) 
- # TODO latency hiding - @assert A.assembled - @assert B.assembled - D_partition,cacheD = map((args...)->spmtm(args...;reuse=true),partition(A),partition(B)) |> tuple_of_arrays + Aoo = own_own_values(A) + Aog = own_ghost_values(A) + Boo = own_own_values(B) + Bog = own_ghost_values(B) + + C1go = map((A,B)->transpose(A)*B,Aog,Boo) + C1gg = map((A,B)->transpose(A)*B,Aog,Bog) + + C1_values = map(C1go, C1gg, partition(A), partition(B)) do ghost_own, ghost_ghost, A_part, B_part + own_own = similar(ghost_ghost, size(A_part.blocks.own_own, 2), size(B_part.blocks.own_own, 2)) + own_ghost = similar(ghost_ghost, size(A_part.blocks.own_own, 2), size(B_part.blocks.own_ghost, 2)) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + split_matrix(blocks,A_part.col_permutation,B_part.col_permutation) + end + assembled = false - D = PSparseMatrix(D_partition,partition(axes(A,2)),partition(axes(B,2)),assembled) - C,cacheC = assemble(D;reuse=true) |> fetch + C1_unassembled = PSparseMatrix(C1_values,partition(axes(A,2)),partition(axes(B,2)),assembled) + t = assemble(C1_unassembled,reuse=true) + + C2oo = map((A,B)->transpose(A)*B,Aoo,Boo) + C2og = map((A,B)->transpose(A)*B,Aoo,Bog) + + C2_values = map(C2oo, C2og, partition(A), partition(B)) do own_own, own_ghost, A_part, B_part + ghost_own = similar(own_own,0,size(own_own,2)) + ghost_ghost = similar(own_own,0,size(own_ghost,2)) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + split_matrix(blocks, A_part.col_permutation, B_part.col_permutation) + end + + # No cache returned by SparseArrays, so this is a workaround. 
+ Coo_cache = map(construct_spmtm_cache, C2oo) + Cog_cache = map(construct_spmtm_cache, C2og) + Cgo_cache = map(construct_spmtm_cache, C1go) + Cgg_cache = map(construct_spmtm_cache, C1gg) + + assembled = true + C2 = PSparseMatrix(C2_values,partition(axes(A,2)),partition(axes(B,2)),assembled) + C1, assemblyCache = fetch(t) + C, mergeCache = add(C1, C2) + if val_parameter(reuse) - cache = (D,cacheC,cacheD) + sequential_caches = (Coo_cache,Cog_cache,Cgo_cache,Cgg_cache) + cache = (C1, C1_unassembled, assemblyCache, C2, mergeCache, sequential_caches) return C,cache end C end function spmtm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) - (D,cacheC,cacheD)= cache - map(spmtm!,partition(D),partition(A),partition(B),cacheD) - assemble!(C,D,cacheC) |> wait + C1, C1_unassembled, assemblyCache, C2, mergeCache, sequential_caches = cache + (Coo_cache,Cog_cache,Cgo_cache,Cgg_cache) = sequential_caches + + Aoo = own_own_values(A) + Aog = own_ghost_values(A) + Boo = own_own_values(B) + Bog = own_ghost_values(B) + + map((C,A,B,cache)->mul!(C,transpose(A),B,cache),ghost_own_values(C1_unassembled),Aog,Boo,Cgo_cache) + map((C,A,B,cache)->mul!(C,transpose(A),B,cache),ghost_ghost_values(C1_unassembled),Aog,Bog,Cgg_cache) + + t = assemble!(C1, C1_unassembled, assemblyCache) + map((C,A,B,cache)->mul!(C,transpose(A),B,cache),own_own_values(C2),Aoo,Boo,Coo_cache) + map((C,A,B,cache)->mul!(C,transpose(A),B,cache),own_ghost_values(C2),Aoo,Bog,Cog_cache) + wait(t) + add!(C, C1, C2, mergeCache) C end +### End NEW ### function Base.:*(A::PSparseMatrix,B::PSparseMatrix) C = spmm(A,B) @@ -2462,6 +2623,8 @@ function repartition(A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) end end +### NEW ### +# Repartition that follows local data layout of type T (some sparse matrix format) function repartition(::Type{T},A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) where T @assert A.assembled "repartition on a sub-assembled matrix not implemented yet" function 
prepare_triplets(A_own_own,A_own_ghost,A_rows,A_cols) @@ -2481,7 +2644,7 @@ function repartition(::Type{T},A::PSparseMatrix,new_rows,new_cols;reuse=Val(fals A_rows = partition(axes(A,1)) A_cols = partition(axes(A,2)) I,J,V = map(prepare_triplets,A_own_own,A_own_ghost,A_rows,A_cols) |> tuple_of_arrays - # TODO this one does not preserve the local storage layout of A + t = psparse(T,I,J,V,new_rows,new_cols;reuse=true) @fake_async begin B,cacheB = fetch(t) @@ -2494,6 +2657,8 @@ function repartition(::Type{T},A::PSparseMatrix,new_rows,new_cols;reuse=Val(fals end end +### NEW ### +# Repartition that follows local data layout by using sparse function "sparse" function repartition(sparse,A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) @assert A.assembled "repartition on a sub-assembled matrix not implemented yet" function prepare_triplets(A_own_own,A_own_ghost,A_rows,A_cols) @@ -2595,6 +2760,8 @@ function centralize(A::PSparseMatrix) own_own_values(a_in_main) |> multicast |> getany end +### NEW ### +# Centralize function with local storage layout of type T (some sparse matrix format) function centralize(::Type{T},A::PSparseMatrix) where T m,n = size(A) ranks = linear_indices(partition(A)) @@ -2604,6 +2771,8 @@ function centralize(::Type{T},A::PSparseMatrix) where T own_own_values(a_in_main) |> multicast |> getany end +### NEW ### +# Centralize function that follows local data layout resulting from "sparse" function centralize(sparse,A::PSparseMatrix) m,n = size(A) ranks = linear_indices(partition(A)) @@ -2849,3 +3018,317 @@ function laplace_matrix(nodes_per_dir,parts_per_dir,ranks) I,J,V = map(setup,node_partition) |> tuple_of_arrays A = psparse(sparse,I,J,V,node_partition,node_partition) |> fetch end + + +################ NEW ################ + +# Locally transpose SplitMatrix +function explicit_transpose(A::AbstractSplitMatrix) + own_own = halfperm(A.blocks.own_own) + own_ghost = halfperm(A.blocks.ghost_own) + ghost_own = halfperm(A.blocks.own_ghost) + ghost_ghost 
= halfperm(A.blocks.ghost_ghost) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + split_matrix(blocks,A.col_permutation,A.row_permutation) +end + +# Redistribute PSparseMatrix, returns unassembled transpose and a assmbly task when reuse is true, or only the assembly task otherwise +function explicit_transpose(A::PSparseMatrix;reuse=false) + mats = map(explicit_transpose,partition(A)) + rows, cols = axes(A) + B = PSparseMatrix(mats,partition(cols),partition(rows),false) + t = assemble(B,reuse=reuse) + if val_parameter(reuse) + B,t + else + t + end +end + +function explicit_transpose!(B::AbstractSplitMatrix,A::AbstractSplitMatrix) + halfperm!(B.blocks.own_own,A.blocks.own_own) + halfperm!(B.blocks.own_ghost,A.blocks.ghost_own) + halfperm!(B.blocks.ghost_own,A.blocks.own_ghost) + halfperm!(B.blocks.ghost_ghost,A.blocks.ghost_ghost) +end + +function explicit_transpose!(B::PSparseMatrix,B_local::PSparseMatrix,A::PSparseMatrix,cache) + map(explicit_transpose!,partition(B_local),partition(A)) + assemble!(B, B_local, cache) +end + +function add(A::PSparseMatrix,B::PSparseMatrix) + function add_own_own(A,B) + C = A+B + # reuse IA/IB for cache + KA = precompute_nzindex(C,A) + KB = precompute_nzindex(C,B) + C,(KA,KB) + end + function add_own_ghost(own_ghost_A, own_ghost_B, colsA, colsB, cols) + # Minimize allocated memory, but could be replaced with findnz(...) 
+ iA,jA = find_indices(own_ghost_A) # local nonzero + vA = nonzeros(own_ghost_A) + iB,jB = find_indices(own_ghost_B) # local nonzero + vB = nonzeros(own_ghost_B) + jC = zeros(eltype(jA), (length(jA) + length(jB))) + ghostA_to_global = ghost_to_global(colsA) + ghostB_to_global = ghost_to_global(colsB) + global_to_ghostC = global_to_ghost(cols) + l = zero(eltype(jA)) + for k in eachindex(jA) + l += 1 + j = jA[k] + jC[l] = global_to_ghostC[ghostA_to_global[j]] + jA[k] = jC[l] + end + for k in eachindex(jB) + l += 1 + j = jB[k] + jC[l] = global_to_ghostC[ghostB_to_global[j]] + jB[k] = jC[l] + end + own_ghost = compresscoo(typeof(own_ghost_A), vcat(iA, iB), jC, vcat(vA, vB), size(own_ghost_A, 1), ghost_length(cols)) + # reuse auxiliary iA, iB arrays as caches + precompute_nzindex!(iA,own_ghost,iA,jA) + precompute_nzindex!(iB,own_ghost,iB,jB) + own_ghost, (iA, iB) + end + function _add(A,B) + colsA = partition(axes(A,2)) + colsB = partition(axes(B,2)) + J = map(ghost_to_global, colsB) + J_owner = map(ghost_to_owner, colsB) + cols = map(union_ghost, colsA, J, J_owner) + rows = partition(axes(A,1)) + Coo, Koo = map(add_own_own, own_own_values(A), own_own_values(B)) |> tuple_of_arrays + Cog, Kog = map(add_own_ghost, own_ghost_values(A), own_ghost_values(B), colsA, colsB, cols) |> tuple_of_arrays + C_vals = map(Coo,Cog,rows,cols) do Coo, Cog, rows, cols + Cgo = similar(Coo, 0, size(Coo,2)) + Cgg = similar(Coo, 0, size(Cog,2)) + blocks = split_matrix_blocks(Coo, Cog, Cgo, Cgg) + split_matrix(blocks, local_permutation(rows), local_permutation(cols)) + end + assembled = true + K = (Koo, Kog) + PSparseMatrix(C_vals,rows,cols,assembled), K + end + _add(A,B) +end + +function add!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) + function add_blocks!(C, A, B, K) + K_A, K_B = K + sparse_matrix!(C, nonzeros(A), K_A) + sparse_matrix!(C, nonzeros(B), K_B, reset=false) + end + Koo, Kog = cache + map(add_blocks!, own_own_values(C), own_own_values(A), own_own_values(B), Koo) + 
map(add_blocks!, own_ghost_values(C), own_ghost_values(A), own_ghost_values(B), Kog) +end + +# Interpret A as if its transpose is needed +function spmtmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(false)) + consistency_task = consistent(C, partition(axes(B,2)),reuse=true) + + Aoo = own_own_values(A) + Boo = own_own_values(B) + Cog = own_own_values(C) + + Aog = own_ghost_values(A) + Bog = own_ghost_values(B) + + Doo1, Doo_cache = map((A,B,C)->RAP(transpose(A),B,C), Aoo,Boo,Cog) |> tuple_of_arrays + Dgo1, Dgo_cache = map((A,B,C)->RAP(transpose(A),B,C), Aog,Boo,Cog) |> tuple_of_arrays + + # Collect ghost rows from P before continuing + C2, consistencyCache = fetch(consistency_task) + + Cog2 = own_ghost_values(C2) + Cgo = ghost_own_values(C2) + Cgg = ghost_ghost_values(C2) + + Dgo2, Dgo_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aog,Bog,Cgo,Dgo_cache) |> tuple_of_arrays + Dog1, Dog_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aog,Boo,Cog2,Dgo_cache) |> tuple_of_arrays + Dog2, Dog_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aog,Bog,Cgg,Dog_cache) |> tuple_of_arrays + + Dgo = map(+,Dgo1,Dgo2) # different sparsity patterns so not in-place. 
+ Dog = map(+,Dog1,Dog2) + + D1_values = map(Dgo, Dog, partition(C), partition(C2)) do ghost_own, ghost_ghost, C_part, C2_part + own_own = similar(ghost_ghost, size(C_part.blocks.own_own, 2), size(C2_part.blocks.own_own, 2)) + own_ghost = similar(ghost_ghost, size(C_part.blocks.own_own, 2), size(C2_part.blocks.own_ghost, 2)) + blocks = split_matrix_blocks(own_own, own_ghost, ghost_own, ghost_ghost) + split_matrix(blocks, C_part.col_permutation, C2_part.col_permutation) + end + D1_unassembled = PSparseMatrix(D1_values, partition(axes(C,2)), partition(axes(C2,2)), false) + assembly_task = assemble(D1_unassembled, reuse=true) + + Dog1, Dog_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aoo,Boo,Cog2,Doo_cache) |> tuple_of_arrays + Doo2,Doo_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aoo,Bog,Cgo,Doo_cache) |> tuple_of_arrays + Dog2,Dog_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aoo,Bog,Cgg,Dog_cache) |> tuple_of_arrays + + Doo = map(+,Doo1,Doo2) + Dog = map(+,Dog1,Dog2) + + Doo_cache_final = map((cache,D)->reduce_spmtmm_cache(cache,typeof(D)),Doo_cache,Doo) + Dog_cache_final = map((cache,D)->reduce_spmtmm_cache(cache,typeof(D)),Dog_cache,Dog) + Dgo_cache_final = map((cache,D)->reduce_spmtmm_cache(cache,typeof(D)),Dgo_cache,Dgo) + Dog_cache_final = map((cache,D)->reduce_spmtmm_cache(cache,typeof(D)),Dog_cache,Dog) + + D2_values = map(Doo, Dog, partition(C2)) do own_own, own_ghost, C_part + ghost_own = similar(own_own,0,size(own_own, 2)) + ghost_ghost = similar(own_ghost,0,size(own_ghost, 2)) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + split_matrix(blocks, C_part.col_permutation, C_part.col_permutation) + end + + D1, assemblyCache = fetch(assembly_task) + D2 = PSparseMatrix(D2_values, partition(axes(D1,1)), partition(axes(C2,2)), true) + D, mergeCache = add(D1, D2) + sequential_caches = (Doo_cache_final, Dog_cache_final, Dgo_cache_final, Dog_cache_final) + if val_parameter(reuse) + cache = (C2, 
consistencyCache, D1, D1_unassembled, assemblyCache, D2, mergeCache, sequential_caches) + return D,cache + end + D +end + +function spmtmm(A::PSparseMatrix,P::PSparseMatrix;kwargs...) + spmtmm(transpose(P),A,P;kwargs...) +end + +function spmtmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix,cache) + C2, consistencyCache, D1, D1_unassembled, assemblyCache, D2, mergeCache, sequential_caches = cache + Doo_cache, Dog_cache, Dgo_cache, Dgg_cache = sequential_caches + C2, consistencyCache, D1, D1_unassembled, assemblyCache, D2, mergeCache = cache + + consistency_task = consistent!(C2, C, consistencyCache) + Doo = own_own_values(D2) + Dog = own_ghost_values(D2) + Dgo = ghost_own_values(D1_unassembled) + Dgg = ghost_ghost_values(D1_unassembled) + + Aoo = own_own_values(A) + Boo = own_own_values(B) + Coo = own_own_values(C) + + Aog = own_ghost_values(A) + Bog = own_ghost_values(B) + + map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,cache), Doo,Aoo,Boo,Coo,Doo_cache) + map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,cache), Dgo,Aog,Boo,Coo,Dgo_cache) + + # Collect ghost rows from P before continuing + wait(consistency_task) + Cog2 = own_ghost_values(C2) + Cgo = ghost_own_values(C2) + Cgg = ghost_ghost_values(C2) + + map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,cache), Dgg,Aog,Boo,Cog2,Dgg_cache) + + map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,1,1,cache), Dgo,Aog,Bog,Cgo,Dgo_cache) + map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,1,1,cache), Dgg,Aog,Bog,Cgg,Dgg_cache) + + assembly_task = assemble!(D1, D1_unassembled, assemblyCache) + + map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,1,1,cache), Doo,Aoo,Bog,Cgo,Doo_cache) + map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,cache), Dog,Aoo,Boo,Cog2,Dog_cache) + map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,1,1,cache), Dog,Aoo,Bog,Cgg,Dog_cache) + + wait(assembly_task) + add!(D, D1, D2, mergeCache) + D +end + +function spmtmm!(C::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix,cache) + spmtmm!(C,P,A,P,cache) +end + 
+function spmmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(false)) + B2_task = consistent(B,partition(axes(A,2)),reuse=true) + Aoo = own_own_values(A) + Aog = own_ghost_values(A) + Boo = own_own_values(B) + Coo = own_own_values(C) + + Doo1,Doo_cache = map(RAP,Aoo,Boo,Coo) |> tuple_of_arrays + B2, Bcache = fetch(B2_task) + C2_task = consistent(C,partition(axes(B2,2)),reuse=true) + + Bog = own_ghost_values(B2) + Bgo = ghost_own_values(B2) + Bgg = ghost_ghost_values(B2) + + Doo2,Doo_cache = map(RAP,Aog,Bgo,Coo,Doo_cache) |> tuple_of_arrays + Doo12 = map(+,Doo1,Doo2) + + C2, Ccache = fetch(C2_task) + + Cog = own_ghost_values(C2) + Cgo = ghost_own_values(C2) + Cgg = ghost_ghost_values(C2) + + Doo3,Doo_cache = map(RAP,Aoo,Bog,Cgo,Doo_cache) |> tuple_of_arrays + Doo4,Doo_cache = map(RAP,Aog,Bgg,Cgo,Doo_cache) |> tuple_of_arrays + + Doo34 = map(+,Doo3,Doo4) + Doo = map(+,Doo12,Doo34) + + Dog1,Dog_cache = map(RAP,Aoo,Boo,Cog) |> tuple_of_arrays + Dog2,Dog_cache = map(RAP,Aog,Bgo,Cog,Dog_cache) |> tuple_of_arrays + Dog3,Dog_cache = map(RAP,Aoo,Bog,Cgg,Dog_cache) |> tuple_of_arrays + Dog4,Dog_cache = map(RAP,Aog,Bgg,Cgg,Dog_cache) |> tuple_of_arrays + + Dog12 = map(+,Dog1,Dog2) + Dog34 = map(+,Dog3,Dog4) + Dog = map(+,Dog12,Dog34) + + D_values = map(Doo, Dog, partition(A),partition(C2)) do own_own, own_ghost, A_part,C_part + ghost_own = similar(own_own,0,size(own_own, 2)) + ghost_ghost = similar(own_ghost,0,size(own_ghost, 2)) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + split_matrix(blocks, A_part.row_permutation, C_part.col_permutation) + end + + D = PSparseMatrix(D_values, partition(axes(A,1)), partition(axes(C2,2)), true) + if val_parameter(reuse) + cache = B2,Bcache,C2,Ccache,(Doo_cache,Dog_cache) + return D,cache + end + D +end + +function spmmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix,cache) + B2,Bcache,C2,Ccache,sequential_caches = cache + Doo_cache, Dog_cache = sequential_caches + B2_task = 
consistent!(B2,B,Bcache) + + Doo = own_own_values(D) + Dog = own_ghost_values(D) + Aoo = own_own_values(A) + Aog = own_ghost_values(A) + Boo = own_own_values(B) + Coo = own_own_values(C) + map(RAP!,Doo,Aoo,Boo,Coo,Doo_cache) + wait(B2_task) + + C2_task = consistent!(C2,C,Ccache) + Bog = own_ghost_values(B2) + Bgo = ghost_own_values(B2) + Bgg = ghost_ghost_values(B2) + map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Doo,Aog,Bgo,Coo,Doo_cache) + + wait(C2_task) + Cog = own_ghost_values(C2) + Cgo = ghost_own_values(C2) + Cgg = ghost_ghost_values(C2) + + map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Doo,Aoo,Bog,Cgo,Doo_cache) + map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Doo,Aog,Bgg,Cgo,Doo_cache) + map(RAP!,Dog,Aoo,Boo,Cog,Dog_cache) + map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Dog,Aog,Bgo,Cog,Dog_cache) + map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Dog,Aoo,Bog,Cgg,Dog_cache) + map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Dog,Aog,Bgg,Cgg,Dog_cache) + D +end \ No newline at end of file diff --git a/src/sequential_implementations.jl b/src/sequential_implementations.jl new file mode 100644 index 00000000..94ab1d08 --- /dev/null +++ b/src/sequential_implementations.jl @@ -0,0 +1,1672 @@ +function Base.:*(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TiA,TvB,TiB} + C = ascsc(B)*ascsc(A) + ascsr(C) +end + +function Base.:*(At::Transpose{Tv, SparseMatrixCSR{Bi,Tv,Ti}},B::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} + C = ascsc(B)*transpose(ascsc(At.parent)) + ascsr(C) +end + +function Base.:*(A::SparseMatrixCSR{Bi,Tv,Ti},Bt::Transpose{Tv, SparseMatrixCSR{Bi,Tv,Ti}}) where {Bi,Tv,Ti} + C = transpose(ascsc(Bt.parent))*ascsc(A) + ascsr(C) +end + +function Base.:*(At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},Bt::Transpose{Tv, SparseMatrixCSR{Bi,Tv,Ti}}) where {Bi,Tv,Ti} + C = transpose(ascsc(Bt.parent))*transpose(ascsc(At.parent)) + ascsr(C) +end + +function Base.:*(x::Number,A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} + 
SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(A.colval), map(a -> x*a, A.nzval)) +end +function Base.:*(A::SparseMatrixCSR,x::Number) *(x,A) end + +function Base.:/(A::SparseMatrixCSR{Bi,Tv,Ti},x::Number) where {Bi,Tv,Ti} + SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(A.colval), map(a -> a/x, A.nzval)) +end + + +# Alternative to lazy csr to csc for matrix addition that does not drop structural zeros. +function Base.:+(A::SparseMatrixCSR{Bi,Tv,Ti},B::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} + if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end + p,q = size(A) + nnz_C_upperbound = nnz(A) + nnz(B) + IC = Vector{Ti}(undef, p+1) + JC = Vector{Ti}(undef, nnz_C_upperbound) + VC = Vector{Tv}(undef, nnz_C_upperbound) + + pC = 1 + JA = colvals(A) + VA = nonzeros(A) + JB = colvals(B) + VB = nonzeros(B) + for i in 1:p + IC[i] = pC + jpA_range = nzrange(A, i) + jpA, jpA_end = jpA_range.start, jpA_range.stop + jpB_range = nzrange(B, i) + jpB, jpB_end = jpB_range.start, jpB_range.stop + while jpA <= jpA_end && jpB <= jpB_end + jA = JA[jpA] + jB = JB[jpB] + if jA < jB + JC[pC] = jA + VC[pC] = VA[jpA] + jpA += 1 + elseif jB < jA + JC[pC] = jB + VC[pC] = VB[jpB] + jpB += 1 + else + JC[pC] = jA + VC[pC] = VA[jpA] + VB[jpB] + jpA += 1 + jpB += 1 + end + pC += 1 + end + while jpA <= jpA_end + JC[pC] = JA[jpA] + VC[pC] = VA[jpA] + jpA += 1 + pC += 1 + end + while jpB <= jpB_end + JC[pC] = JB[jpB] + VC[pC] = VB[jpB] + jpB += 1 + pC += 1 + end + end + IC[end] = pC + resize!(JC, (pC-1)) + resize!(VC, (pC-1)) + SparseMatrixCSR{Bi}(p,q,IC,JC,VC) # A += B +end + +# Alternative to lazy csr to csc for matrix subtraction that does not drop structural zeros. Subtracts B from A, i.e. A - B. 
# Sparse CSR subtraction `A - B` that keeps every structural entry of both operands:
# an entry stored in A or in B is stored in the result, even when the difference is zero.
# Each row is produced by merging the stored column indices of A and B pairwise, so this
# assumes the column indices inside a row are in ascending order (TODO confirm for all
# callers).
function Base.:-(A::SparseMatrixCSR{Bi,Tv,Ti}, B::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
    size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"))
    capacity = nnz(A) + nnz(B)  # upper bound, reached when no entries coincide
    nrows, ncols = size(A)
    rowptr = Vector{Ti}(undef, nrows + 1)
    colidx = Vector{Ti}(undef, capacity)
    vals = Vector{Tv}(undef, capacity)

    colsA, valsA = colvals(A), nonzeros(A)
    colsB, valsB = colvals(B), nonzeros(B)
    nstored = 0  # number of entries written so far
    for row in 1:nrows
        rowptr[row] = nstored + 1
        rngA = nzrange(A, row)
        rngB = nzrange(B, row)
        ka, ka_end = first(rngA), last(rngA)
        kb, kb_end = first(rngB), last(rngB)
        # Merge the two index lists while both still have entries in this row.
        while ka <= ka_end && kb <= kb_end
            ca = colsA[ka]
            cb = colsB[kb]
            nstored += 1
            if ca == cb
                colidx[nstored] = ca
                vals[nstored] = valsA[ka] - valsB[kb]
                ka += 1
                kb += 1
            elseif ca < cb
                colidx[nstored] = ca
                vals[nstored] = valsA[ka]
                ka += 1
            else
                # Entry present only in B contributes with flipped sign.
                colidx[nstored] = cb
                vals[nstored] = -valsB[kb]
                kb += 1
            end
        end
        # Whatever is left of A's row is copied unchanged.
        for k in ka:ka_end
            nstored += 1
            colidx[nstored] = colsA[k]
            vals[nstored] = valsA[k]
        end
        # Whatever is left of B's row is copied negated.
        for k in kb:kb_end
            nstored += 1
            colidx[nstored] = colsB[k]
            vals[nstored] = -valsB[k]
        end
    end
    rowptr[end] = nstored + 1
    # Shrink the index/value buffers to the number of entries actually stored.
    resize!(colidx, nstored)
    resize!(vals, nstored)
    return SparseMatrixCSR{Bi}(nrows, ncols, rowptr, colidx, vals)  # A - B
end

# Unary minus: same sparsity pattern (pointer and index arrays are copied), negated values.
function Base.:-(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
    negated = map(-, A.nzval)
    return SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(A.colval), negated)
end

# Alternative to lazy csr to csc for matrix addition that does not drop structural zeros.
# Sparse CSC addition `A + B` that keeps every structural entry of both operands:
# an entry stored in A or in B is stored in the result, even when the sum is zero.
# Each column is produced by merging the stored row indices of A and B pairwise, so this
# assumes the row indices inside a column are in ascending order.
# NOTE(review): this adds a `+` method for SparseArrays' own matrix type; confirm that
# taking precedence over the stock method is intended.
function Base.:+(A::SparseMatrixCSC{Tv,Ti}, B::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
    size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"))
    nrows, ncols = size(A)
    capacity = nnz(A) + nnz(B)  # upper bound, reached when no entries coincide
    colptr = Vector{Ti}(undef, ncols + 1)
    rowidx = Vector{Ti}(undef, capacity)
    vals = Vector{Tv}(undef, capacity)

    rowsA, valsA = rowvals(A), nonzeros(A)
    rowsB, valsB = rowvals(B), nonzeros(B)
    nstored = 0  # number of entries written so far
    for col in 1:ncols
        colptr[col] = nstored + 1
        rngA = nzrange(A, col)
        rngB = nzrange(B, col)
        ka, ka_end = first(rngA), last(rngA)
        kb, kb_end = first(rngB), last(rngB)
        # Merge the two index lists while both still have entries in this column.
        while ka <= ka_end && kb <= kb_end
            ra = rowsA[ka]
            rb = rowsB[kb]
            nstored += 1
            if ra == rb
                rowidx[nstored] = ra
                vals[nstored] = valsA[ka] + valsB[kb]
                ka += 1
                kb += 1
            elseif ra < rb
                rowidx[nstored] = ra
                vals[nstored] = valsA[ka]
                ka += 1
            else
                rowidx[nstored] = rb
                vals[nstored] = valsB[kb]
                kb += 1
            end
        end
        # Copy whatever is left of A's column, then whatever is left of B's.
        for k in ka:ka_end
            nstored += 1
            rowidx[nstored] = rowsA[k]
            vals[nstored] = valsA[k]
        end
        for k in kb:kb_end
            nstored += 1
            rowidx[nstored] = rowsB[k]
            vals[nstored] = valsB[k]
        end
    end
    colptr[end] = nstored + 1
    # Shrink the index/value buffers to the number of entries actually stored.
    resize!(rowidx, nstored)
    resize!(vals, nstored)
    return SparseMatrixCSC{Tv,Ti}(nrows, ncols, colptr, rowidx, vals)
end

# Alternative to lazy csr to csc for matrix subtraction that does not drop structural zeros. Subtracts B from A, i.e. A - B.
+function Base.:-(A::SparseMatrixCSC{Tv,Ti},B::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} + if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end + p,q = size(A) + nnz_C_upperbound = nnz(A) + nnz(B) + JC = Vector{Ti}(undef, q+1) + IC = Vector{Ti}(undef, nnz_C_upperbound) + VC = Vector{Tv}(undef, nnz_C_upperbound) + + pC = 1 + IA = rowvals(A) + VA = nonzeros(A) + IB = rowvals(B) + VB = nonzeros(B) + for j in 1:q + JC[j] = pC + ipA_range = nzrange(A, j) + ipA, ipA_end = ipA_range.start, ipA_range.stop + ipB_range = nzrange(B, j) + ipB, ipB_end = ipB_range.start, ipB_range.stop + while ipA <= ipA_end && ipB <= ipB_end + iA = IA[ipA] + iB = IB[ipB] + if iA < iB + IC[pC] = iA + VC[pC] = VA[ipA] + ipA += 1 + elseif iB < iA + IC[pC] = iB + VC[pC] = VB[ipB] + ipB += 1 + else + IC[pC] = iA + VC[pC] = VA[ipA] - VB[ipB] + ipA += 1 + ipB += 1 + end + pC += 1 + end + while ipA <= ipA_end + IC[pC] = IA[ipA] + VC[pC] = VA[ipA] + ipA += 1 + pC += 1 + end + while ipB <= ipB_end + IC[pC] = IB[ipB] + VC[pC] = -VB[ipB] + ipB += 1 + pC += 1 + end + end + JC[end] = pC + resize!(IC, (pC-1)) + resize!(VC, (pC-1)) + SparseMatrixCSC{Tv,Ti}(p,q,JC,IC,VC) +end + +function Base.:-(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} + SparseMatrixCSC{Tv,Ti}(size(A)..., copy(A.colptr), copy(A.rowval), map(a->-a, A.nzval)) +end + + +function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, + A::SparseMatrixCSC{Tv,Ti}, + B::SparseMatrixCSC{Tv,Ti}, + cache) where {Tv,Ti} + mul!(ascsr(C),ascsr(B),ascsr(A),cache) +end + + +function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, + A::SparseMatrixCSC{Tv,Ti}, + B::SparseMatrixCSC{Tv,Ti}, + α::Number, + β::Number, + cache) where {Tv,Ti} + mul!(ascsr(C),ascsr(B),ascsr(A),α,β,cache) +end + +function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, + At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, + B::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} + a,b = size(C) + p,q = size(At) + r,s = size(B) + if q != r && throw(DimensionMismatch("A 
has dimensions ($(p),$(q)) but B has dimensions ($(p),$(q))"));end + if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end + A = At.parent + VC = nonzeros(C) + VC .= 0 + IC = rowvals(C) + JA = rowvals(A) # When virtually transposed rowvals represent colvals. + VA = nonzeros(A) + IB = rowvals(B) + VB = nonzeros(B) + for j in 1:s + # loop over columns "j" in row i of A + Bj = nzrange(B, j) + ptrB_start = Bj.start + ptrB_stop = Bj.stop + for ip in nzrange(C, j) + i = IC[ip] + # loop over columns "k" in row j of B + Ai = nzrange(A, i) + ptrB = ptrB_start + ptrA = Ai.start + vC = 0 + while ptrA <= Ai.stop && ptrB <= ptrB_stop + jA = JA[ptrA] + iB = IB[ptrB] + if jA < iB + ptrA += 1 + elseif iB < jA + ptrB += 1 + else # jA == iB + vC += VA[ptrA]*VB[ptrB] + ptrA += 1 + ptrB += 1 + end + end + VC[ip] = vC + end + end + C +end + +function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, + At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, + B::SparseMatrixCSC{Tv,Ti}, + α::Number, + β::Number) where {Tv,Ti} + a,b = size(C) + p,q = size(At) + r,s = size(B) + if q != r && throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(p),$(q))"));end + if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end + A = At.parent + VC = nonzeros(C) + IC = rowvals(C) + VC .*= β + JA = rowvals(A) # When virtually transposed rowvals represent colvals. 
+ VA = nonzeros(A) + IB = rowvals(B) + VB = nonzeros(B) + for j in 1:s + # loop over columns "j" in row i of A + Bj = nzrange(B, j) + for jp in nzrange(C, j) + i = IC[jp] + # loop over columns "k" in row j of B + Ai = nzrange(A, i) + ptrB = Bj.start + ptrA = Ai.start + vC = 0 + while ptrA <= Ai.stop && ptrB <= Bj.stop + jA = JA[ptrA] + iB = IB[ptrB] + if jA == iB + vC += VA[ptrA]*VB[ptrB] + ptrA += 1 + ptrB += 1 + elseif jA < iB + ptrA += 1 + else + ptrB += 1 + end + end + VC[jp] += α*vC + end + end + C +end + +function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, + A::SparseMatrixCSC{Tv,Ti}, + Bt::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}) where {Tv,Ti} + mul!(ascsr(C),transpose(ascsr(B)),ascsr(A)) +end + +function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti}, + A::SparseMatrixCSR{Bi,Tv,Ti}, + B::SparseMatrixCSR{Bi,Tv,Ti}, + cache) where {Bi,Tv,Ti} + a,b = size(C) + p,q = size(A) + r,s = size(B) + if q != r && throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(p),$(q))"));end + if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end + JC = colvals(C) + VC = nonzeros(C) + VC .= zero(Tv) + JA = colvals(A) + VA = nonzeros(A) + JB = colvals(B) + VB = nonzeros(B) + # A cache here would remove need for allocating acumulating arrays + # xb = zeros(Ti, p) + xb,x = cache + xb .= 0 + # x = similar(xb, Tv) # sparse accumulator, can be zeros() to remove if statement in inner loop. + for i in 1:p # ! + # loop over rows Ai in col Bj + for jpa in nzrange(A, i) + ja = JA[jpa] + va = VA[jpa] + # loop over columns "k" in row j of B + for jpb in nzrange(B, ja) + jb = JB[jpb] + vb = VB[jpb] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
+ if xb[jb] != i + xb[jb] = i + x[jb] = va*vb + else + x[jb] += va*vb + end + end + end + for jpc in nzrange(C,i) + jc = JC[jpc] + # To support in-place products whose sparsity patterns are subsets of the sparsity of C, this check is required. + if xb[jc] == i + VC[jpc] = x[jc] + end + end + end + C +end + +function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, + A::SparseMatrixCSC{Tv,Ti}, + Bt::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, + cache) where {Tv,Ti} + mul!(ascsr(C),transpose(ascsr(B)),ascsr(A),cache) +end + +function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti}, + A::SparseMatrixCSR{Bi,Tv,Ti}, + B::SparseMatrixCSR{Bi,Tv,Ti}, + α::Number, + β::Number, + cache) where {Bi,Tv,Ti} + a,b = size(C) + p,q = size(A) + r,s = size(B) + if q != r && throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(p),$(q))"));end + if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end + JC = colvals(C) + VC = nonzeros(C) + VC .*= β + JA = colvals(A) + VA = nonzeros(A) + JB = colvals(B) + VB = nonzeros(B) + # A cache here would remove need for allocating acumulating arrays + # xb = zeros(Ti, p) + xb,x = cache + xb .= 0 + # x = similar(xb, Tv) # sparse accumulator, can be zeros() to remove if statement in inner loop. + for i in 1:p # ! + # loop over rows Ai in col Bj + for jpa in nzrange(A, i) + ja = JA[jpa] + va = VA[jpa] + # loop over columns "k" in row j of B + for jpb in nzrange(B, ja) + jb = JB[jpb] + vb = VB[jpb] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xb[jb] != i + xb[jb] = i + x[jb] = va*vb + else + x[jb] += va*vb + end + end + end + for jpc in nzrange(C,i) + jc = JC[jpc] + # To support in-place products whose sparsity patterns are subsets of the sparsity of C, this check is required. 
+ if xb[jc] == i + VC[jpc] += α * x[jc] + end + end + end + C +end + +function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, + A::SparseMatrixCSC{Tv,Ti}, + Bt::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, + α::Number, + β::Number, + cache) where {Tv,Ti} + mul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A),α,β,cache) +end + +function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, + At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, + B::SparseMatrixCSC{Tv,Ti}, + cache) where {Tv,Ti} + mul!(ascsr(C),ascsr(B),transpose(ascsr(At.parent))) +end + +function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, + At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, + B::SparseMatrixCSC{Tv,Ti}, + α::Number, + β::Number, + cache) where {Tv,Ti} + mul!(ascsr(C),ascsr(A),transpose(ascsr(At.parent)),α,β) +end + +# Workaround to supply in-place mul! with auxiliary array, as these are not returned by multiply function exported by SparseArrays +function construct_spmm_cache(A::SparseMatrixCSR{Bi,Tv,Ti} where Bi) where {Tv,Ti} + q = size(A,2) + xb = zeros(Ti,q) + x = similar(xb,Tv) + xb,x +end +function construct_spmm_cache(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} + construct_spmm_cache(ascsr(A)) +end + +function construct_spmtm_cache(A::SparseMatrixCSR{Bi,Tv,Ti} where Bi) where {Tv,Ti} + q = size(A,2) + xb = zeros(Ti,q) + x = similar(xb,Tv) + xb,x +end + +function construct_spmtm_cache(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} + construct_spmtm_cache(ascsr(A)) +end + +function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti}, + At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, + B::SparseMatrixCSR{Bi,Tv,Ti}, + cache) where {Bi,Tv,Ti} + a,b = size(C) + p,q = size(At) + r,s = size(B) + if q != r && throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(p),$(q))"));end + if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end + A = At.parent + VC = nonzeros(C) + VC .= zero(Tv) + JC = colvals(C) + JA = colvals(A) # When virtually transposed 
colvals represent rowvals. + VA = nonzeros(A) + JB = colvals(B) + VB = nonzeros(B) + xb,x = cache + xb .= 0 + for k in 1:q + # loop over columns "j" in row i of B + for jpb in nzrange(B,k) + jb = JB[jpb] + vb = VB[jpb] + xb[jb] = k + x[jb] = vb + end + for ipa in nzrange(A,k) + ia = JA[ipa] # interpret column index of A as row index of A^T. + va = VA[ipa] + for jpc in nzrange(C, ia) + jc = JC[jpc] + # This check is required, as the outerproduct might not contribute to to all nonzero entries in this row of C. + if xb[jc] == k + VC[jpc] += va*x[jc] + end + end + end + + end + C +end + +function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti}, + At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, + B::SparseMatrixCSR{Bi,Tv,Ti}, + α::Number, + β::Number, + cache) where {Bi,Tv,Ti} + a,b = size(C) + p,q = size(At) + r,s = size(B) + if q != r && throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(p),$(q))"));end + if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end + A = At.parent + VC = nonzeros(C) + VC .*= β + JC = colvals(C) + JA = colvals(A) # When virtually transposed colvals represent rowvals. + VA = nonzeros(A) + JB = colvals(B) + VB = nonzeros(B) + xb,x = cache + xb .= 0 + for k in 1:q + # loop over columns "j" in row i of B + for jpb in nzrange(B,k) + jb = JB[jpb] + vb = VB[jpb] + xb[jb] = k + x[jb] = α*vb + end + for ipa in nzrange(A,k) + ia = JA[ipa] # interpret column index of A as row index of A^T. + va = VA[ipa] + for jpc in nzrange(C, ia) + jc = JC[jpc] + # This check is required, as the outerproduct might not contribute to to all nonzero entries in this row of C. 
+ if xb[jc] == k + VC[jpc] += va*x[jc] + end + end + end + + end + C +end + +function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti}, + A::SparseMatrixCSR{Bi,Tv,Ti}, + Bt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}) where {Bi,Tv,Ti} + mul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A)) + C +end + +function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti}, + A::SparseMatrixCSR{Bi,Tv,Ti}, + Bt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, + α::Number, + β::Number) where {Bi,Tv,Ti} + mul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A), α, β) + C +end + +# PtAP variants +function RAP(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi,Tv,Ti}, Pr::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} + p,q = size(Plt) + m,r = size(A) + n,s = size(Pr) + if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end + if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end + function RAP_symbolic_count!(R,A,Pr) + JR = R.data + JA = colvals(A) + JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. + xbRA = zeros(Ti, r) + xbC = zeros(Ti, s) # this vector will also serve as as colptr array in halfperm + max_rR = find_max_row_length(R) + max_rA = find_max_row_length(A) + max_rPr = find_max_row_length(Pr) + + max_rC = max((max_rR*max_rA*max_rPr),(max_rA*max_rR)) + JRA = Vector{Ti}(undef,max_rC) + IC = Vector{Ti}(undef,p+1) + nnz_C = 1 + IC[1] = nnz_C + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in jagged_range(R, i) + j = JR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
+ if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + end + end + end + ccC = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(Pr,j) + k = JPr[kp] + if xbC[k] != i + xbC[k] = i + ccC += 1 + end + end + end + nnz_C += ccC + IC[i+1] = nnz_C + end + JC = Vector{Ti}(undef, nnz_C-1) + VC = zeros(Tv,nnz_C-1) + JAP = Vector{Ti}(undef,min(max_rA*max_rPr,s)) # upper bound estimate for length of virtual row of AP + xbRA .= 0 + xbC .= 0 + cache = (xbRA,JRA,xbC,JAP) + SparseMatrixCSR{Bi}(p,s,IC,JC,VC), cache # values not yet initialized + end + function RAP_symbolic_fill!(C,R,A,Pr,cache) + (xbRA,JRA,xbC,JAP) = cache + JC = colvals(C) + JR = R.data + JA = colvals(A) + JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. + pC = 0 + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in jagged_range(R, i) + j = JR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
+ if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + end + end + end + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(Pr,j) + k = JPr[kp] + if xbC[k] != i + pC += 1 + xbC[k] = i + JC[pC] = k + end + end + end + end + xbC .= 0 + outer_cache = (xbC,similar(xbC, Tv),JAP) + C, outer_cache # values not yet initialized + end + function _RAP(Plt,A,Pr) + R = symbolic_halfperm(Plt.parent) + C,symbolic_cache = RAP_symbolic_count!(R,A,Pr) # precompute nz structure with a symbolic transpose + _,outer_cache = RAP_symbolic_fill!(C,R,A,Pr,symbolic_cache) + Ct = symbolic_halfperm(C) + symbolic_halfperm!(C,Ct) + RAP!(C,Plt,A,Pr,outer_cache),(outer_cache...,R) + end + _RAP(Plt,A,Pr) +end + +function RAP(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, + A::SparseMatrixCSR{Bi,Tv,Ti}, + Pr::SparseMatrixCSR{Bi,Tv,Ti}, + cache) where {Bi,Tv,Ti} + p,q = size(Plt) + m,r = size(A) + n,s = size(Pr) + if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end + if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end + + function RAP_symbolic_count!(R,A,Pr) + JR = R.data + JA = colvals(A) + JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. + xbRA = zeros(Ti, r) + xbC = zeros(Ti, s) # this vector will also serve as as colptr array in halfperm + max_rR = find_max_row_length(R) + max_rA = find_max_row_length(A) + max_rPr = find_max_row_length(Pr) + + max_rC = max((max_rR*max_rA*max_rPr),(max_rA*max_rR)) + JRA = Vector{Ti}(undef,max_rC) + IC = Vector{Ti}(undef,p+1) + nnz_C = 1 + IC[1] = nnz_C + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in jagged_range(R, i) + j = JR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
+ if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + end + end + end + ccC = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(Pr,j) + k = JPr[kp] + if xbC[k] != i + xbC[k] = i + ccC += 1 + end + end + end + nnz_C += ccC + IC[i+1] = nnz_C + end + JC = Vector{Ti}(undef, nnz_C-1) + VC = zeros(Tv,nnz_C-1) + JAP = Vector{Ti}(undef,min(max_rA*max_rPr,s)) # upper bound estimate for length of virtual row of AP + xbRA .= 0 + xbC .= 0 + SparseMatrixCSR{Bi}(p,s,IC,JC,VC),(xbRA,JRA,xbC,JAP) # values in CSR matrix not yet initialized + end + function RAP_symbolic_fill!(C,R,A,Pr,cache) + (xbRA,JRA,xbC,JAP) = cache + JC = colvals(C) + JR = R.data + JA = colvals(A) + JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. + pC = 0 + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in jagged_range(R, i) + j = JR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + end + end + end + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(Pr,j) + k = JPr[kp] + if xbC[k] != i + pC += 1 + xbC[k] = i + JC[pC] = k + end + end + end + end + xbC .= 0 + C, (xbC,similar(xbC, Tv),JAP) # values not yet initialized + end + function _RAP(Plt,A,Pr,old_cache) + xb,x,JAP,R = old_cache + old_outer_cache = (xb,x,JAP) + C,symbolic_cache = RAP_symbolic_count!(R, A, Pr) + _,new_outer_cache = RAP_symbolic_fill!(C,R, A, Pr, symbolic_cache) + Ct = symbolic_halfperm(C) + symbolic_halfperm!(C,Ct) + outer_cache = map((c1,c2) -> length(c1) >= length(c2) ? 
c1 : c2, old_outer_cache,new_outer_cache) + RAP!(C,Plt,A,Pr,outer_cache),(outer_cache...,R) + end + _RAP(Plt,A,Pr,cache) +end + +function reduce_spmtmm_cache(cache,::Type{SparseMatrixCSR}) + (xb,x,JAP,_) = cache + (xb,x,JAP) +end + +function RAP!(C::SparseMatrixCSR{Bi,Tv,Ti}, + Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, + A::SparseMatrixCSR{Bi,Tv,Ti}, + Pr::SparseMatrixCSR{Bi,Tv,Ti}, + cache) where {Bi,Tv,Ti} + (a,b) = size(C) + p,q = size(Plt) + m,r = size(A) + n,s = size(Pr) + if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end + if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end + if (a,b) != (p,s) && throw(DimensionMismatch("Dimensions of C $(size(C)) don't match dimensions of R*A*P ($p,$q)*($m,$r)*($n,$s)."));end + Pl = Plt.parent + JC = colvals(C) + VC = nonzeros(C) + VC .= zero(Tv) + + JA = colvals(A) + VA = nonzeros(A) + JPr = colvals(Pr) + VPr = nonzeros(Pr) + xb, x, JAP = cache + xb .= 0 + # loop over rows in A + for i in 1:m + lp = 0 + # loop over columns "j" in row i of A + for jp in nzrange(A, i) + j = JA[jp] + va = VA[jp] + # loop over columns "k" in row j of B + for kp in nzrange(Pr, j) + k = JPr[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
+ if xb[k] != i + lp += 1 + JAP[lp] = k + xb[k] = i + x[k] = va * VPr[kp] + else + x[k] += va * VPr[kp] + end + end + end + for kp in nzrange(Pl, i) + k = colvals(Pl)[kp] # rowvals when transposed conceptually + v = nonzeros(Pl)[kp] + for jp in nzrange(C,k) + j = JC[jp] + if xb[j] == i + VC[jp] += v*x[j] + end + end + end + end + C +end + +function RAP!(C::SparseMatrixCSR{Bi,Tv,Ti}, + Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, + A::SparseMatrixCSR{Bi,Tv,Ti}, + Pr::SparseMatrixCSR{Bi,Tv,Ti}, + α::Number, + β::Number, + cache) where {Bi,Tv,Ti} + (a,b) = size(C) + p,q = size(Plt) + m,r = size(A) + n,s = size(Pr) + if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end + if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end + if (a,b) != (p,s) && throw(DimensionMismatch("Dimensions of C $(size(C)) don't match dimensions of R*A*P ($p,$q)*($m,$r)*($n,$s)."));end + Pl = Plt.parent + JC = colvals(C) + VC = nonzeros(C) + JA = colvals(A) + VA = nonzeros(A) + JPr = colvals(Pr) + VPr = nonzeros(Pr) + xb, x, JAP = cache + xb .= 0 + VC .*= β + # loop over rows in A + for i in 1:m + lp = 0 + # loop over columns "j" in row i of A + for jp in nzrange(A, i) + j = JA[jp] + va = α*VA[jp] + # loop over columns "k" in row j of B + for kp in nzrange(Pr, j) + k = JPr[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
+ if xb[k] != i + lp += 1 + JAP[lp] = k + xb[k] = i + x[k] = va*VPr[kp] + else + x[k] += va*VPr[kp] + end + end + end + for kp in nzrange(Pl, i) + k = colvals(Pl)[kp] # rowvals when transposed conceptually + vpl = nonzeros(Pl)[kp] + for jp in nzrange(C,k) + j = JC[jp] + if xb[j] == i + VC[jp] += vpl*x[j] + end + end + end + end + C +end + +# RAP variants +function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, + A::SparseMatrixCSR{Bi,Tv,TiA}, + Pr::SparseMatrixCSR{Bi,Tv,TiPr}) where {Bi,Tv,TiPl,TiA,TiPr} + p,q = size(Pl) + m,r = size(A) + n,s = size(Pr) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end + function RAP_symbolic!(Pl,A,Pr) + JPl = colvals(Pl) + JA = colvals(A) + JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. + xbRA = zeros(TiA, r) + xbC = zeros(TiA, s+1) # this vector will also serve as as colptr array in halfperm + xRA = similar(xbRA, Tv) # sparse accumulator + xC = similar(xbC, Tv) # sparse accumulator + max_rPl = find_max_row_length(Pl) + max_rA = find_max_row_length(A) + max_rPr = find_max_row_length(Pr) + + max_rC = max((max_rPl*max_rA*max_rPr),(max_rA*max_rPl)) + JRA = Vector{TiA}(undef,max_rC) + IC = Vector{TiA}(undef,p+1) + nnz_C = 1 + IC[1] = nnz_C + for i in 1:p + ccRA = 0 + # loop over columns "j" in row i of A + for jp in nzrange(Pl, i) + j = JPl[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
+ if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + end + end + end + ccC = 0 + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(Pr,j) + k = JPr[kp] + if xbC[k] != i + xbC[k] = i + ccC += 1 + end + end + end + nnz_C += ccC + IC[i+1] = nnz_C + end + JC = Vector{TiA}(undef, nnz_C-1) + VC = zeros(Tv,nnz_C-1) + cache = (xbRA,xRA,JRA,xbC,xC) + SparseMatrixCSR{Bi}(p,s,IC,JC,VC), cache # values not yet initialized + end + function RAP_numeric!(C,Pl,A,Pr,cache) + JPl = colvals(Pl) + VPl = nonzeros(Pl) + JA = colvals(A) + VA = nonzeros(A) + JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. + VPr = nonzeros(Pr) + JC = colvals(C) + VC = nonzeros(C) + (xbRA,xRA,JRA,xbC,xC) = cache + jpC = 1 + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in nzrange(Pl, i) + j = JPl[jp] + vpl = VPl[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + xRA[k] = vpl * VA[kp] + else + xRA[k] += vpl * VA[kp] + end + end + end + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(Pr,j) + k = JPr[kp] + if xbC[k] != i + xbC[k] = i + JC[jpC] = k + jpC += 1 + xC[k] = xRA[j]*VPr[kp] + else + xC[k] += xRA[j]*VPr[kp] + end + end + end + for ind in nzrange(C,i) + j = JC[ind] + VC[ind] = xC[j] + end + end + end + function _RAP(Pl,A,Pr) + C,(xbRA,xRA,JRA,xbC,xC) = RAP_symbolic!(Pl,A,Pr) + xbRA .= 0 + xbC .= 0 + cache = (xbRA,xRA,JRA,xbC,xC) + RAP_numeric!(C,Pl,A,Pr,cache) + Ct = halfperm!(xbC,similar(colvals(C)),similar(nonzeros(C)),C) + halfperm!(C,Ct) + C,cache + end + _RAP(Pl,A,Pr) +end + +# Reuses internal arrays of A!!! 
+function construct_spmmm_cache(C::SparseMatrixCSR,A::SparseMatrixCSR) + cache = JaggedArray(colvals(A), A.rowptr) +end + +function construct_spmmm_cache(C::SparseMatrixCSC,A::SparseMatrixCSC) + cache = JaggedArray(rowvals(A), A.colptr) +end + +function reduce_spmtmm_cache(cache,::Type{M} where M <: SparseMatrixCSR) + (xb,x,JAP,_) = cache + (xb,x,JAP) +end + +function reduce_spmtmm_cache(cache,::Type{M} where M <: SparseMatrixCSC) + reduce_spmmmt_cache(cache,SparseMatrixCSR) +end + +function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, + A::SparseMatrixCSR{Bi,Tv,TiA}, + Pr::SparseMatrixCSR{Bi,Tv,TiPr}, + cache) where {Bi,Tv,TiPl,TiA,TiPr} + p,q = size(Pl) + m,r = size(A) + n,s = size(Pr) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end + function RAP_symbolic!(Pl,A,Pr,cache) + JPl = colvals(Pl) + JA = colvals(A) + JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. + (xbRA,_,JRA,xbC,_) = cache + IC = Vector{TiA}(undef,p+1) + nnz_C = 1 + IC[1] = nnz_C + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in nzrange(Pl, i) + j = JPl[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
+ if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + end + end + end + ccC = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(Pr,j) + k = JPr[kp] + if xbC[k] != i + xbC[k] = i + ccC += 1 + end + end + end + nnz_C += ccC + IC[i+1] = nnz_C + end + JC = Vector{TiA}(undef, nnz_C-1) + VC = zeros(Tv,nnz_C-1) + SparseMatrixCSR{Bi}(p,s,IC,JC,VC) # values not yet initialized + end + function RAP_numeric!(C,Pl,A,Pr,cache) + JPl = colvals(Pl) + VPl = nonzeros(Pl) + JA = colvals(A) + VA = nonzeros(A) + JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. + VPr = nonzeros(Pr) + JC = colvals(C) + VC = nonzeros(C) + (xbRA,xRA,JRA,xbC,xC) = cache + jpC = 1 + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in nzrange(Pl, i) + j = JPl[jp] + vpl = VPl[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + xRA[k] = vpl * VA[kp] + else + xRA[k] += vpl * VA[kp] + end + end + end + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(Pr,j) + k = JPr[kp] + if xbC[k] != i + xbC[k] = i + JC[jpC] = k + jpC += 1 + xC[k] = xRA[j]*VPr[kp] + else + xC[k] += xRA[j]*VPr[kp] + end + end + end + for ind in nzrange(C,i) + j = JC[ind] + VC[ind] = xC[j] + end + end + end + function _RAP(Pl,A,Pr,old_cache) + max_rPl = find_max_row_length(Pl) + max_rA = find_max_row_length(A) + max_rPr = find_max_row_length(Pr) + (xbRA,xRA,JRA,xbC,xC) = old_cache + max_rC = max((max_rPl*max_rA*max_rPr),(max_rA*max_rPl)) + JRA2 = max_rC > length(JRA) ? 
similar(JRA,max_rC) : JRA + if r > length(xbRA) + xbRA2 = similar(xbRA,r) + xRA2 = similar(xRA,r) + else + xbRA2 = xbRA + xRA2 = xRA + end + + new_cache = (xbRA2,xRA2,JRA2,xbC,xC) + xbRA2 .= 0 + xbC .= 0 + C = RAP_symbolic!(Pl,A,Pr,new_cache) + xbRA2 .= 0 + xbC .= 0 + RAP_numeric!(C,Pl,A,Pr,new_cache) + Ct = halfperm!(xbC,similar(colvals(C)),similar(nonzeros(C)),C) + halfperm!(C,Ct) + C,new_cache + end + _RAP(Pl,A,Pr,cache) +end + +function reduce_spmmmt_cache(cache,::Type{M} where M <: SparseMatrixCSR) + (xbRA,xRA,JRA,_,_) = cache + (xbRA,xRA,JRA) +end + +function reduce_spmmmt_cache(cache,::Type{M} where M <: SparseMatrixCSC) + reduce_spmtmm_cache(cache,SparseMatrixCSR) +end + +function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC}, + Pl::SparseMatrixCSR{Bi,Tv,TiPl}, + A::SparseMatrixCSR{Bi,Tv,TiA}, + Pr::SparseMatrixCSR{Bi,Tv,TiPr}, + cache) where {Bi,Tv,TiC,TiPl,TiA,TiPr} + p,q = size(Pl) + m,r = size(A) + n,s = size(Pr) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end + JPl = colvals(Pl) + VPl = nonzeros(Pl) + JA = colvals(A) + VA = nonzeros(A) + JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. + VPr = nonzeros(Pr) + JC = colvals(C) + VC = nonzeros(C) + VC .= zero(Tv) + (xbRA,xRA,JRA,xbC,xC) = cache + xbRA .= 0 + xbC .= 0 + for i in 1:p + lp = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in nzrange(Pl, i) + j = JPl[jp] + vpl = VPl[jp] + + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + va = VA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
+ if xbRA[k] != i + lp += 1 + JRA[lp] = k + xbRA[k] = i + xRA[k] = vpl * va + else + xRA[k] += vpl * va + end + end + end + for jp in 1:lp + j = JRA[jp] + vra = xRA[j] + for kp in nzrange(Pr,j) + k = JPr[kp] + if xbC[k] != i + xbC[k] = i + xC[k] = vra*VPr[kp] + else + xC[k] += vra*VPr[kp] + end + end + end + for ind in nzrange(C,i) + j = JC[ind] + if xbC[j] == i + VC[ind] = xC[j] + end + end + end + C +end + +function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC}, + Pl::SparseMatrixCSR{Bi,Tv,TiPl}, + A::SparseMatrixCSR{Bi,Tv,TiA}, + Pr::SparseMatrixCSR{Bi,Tv,TiPr}, + α::Number, + β::Number, + cache) where {Bi,Tv,TiC,TiPl,TiA,TiPr} + p,q = size(Pl) + m,r = size(A) + n,s = size(Pr) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end + JPl = colvals(Pl) + VPl = nonzeros(Pl) + JA = colvals(A) + VA = nonzeros(A) + JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. + VPr = nonzeros(Pr) + JC = colvals(C) + VC = nonzeros(C) + VC .*= β + (xbRA,xRA,JRA,xbC,xC) = cache + xbRA .= 0 + xbC .= 0 + xC .= zero(Tv) + for i in 1:p + lp = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in nzrange(Pl, i) + j = JPl[jp] + vpl = VPl[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
+ if xbRA[k] != i + lp += 1 + JRA[lp] = k + xbRA[k] = i + xRA[k] = vpl * VA[kp] + else + xRA[k] += vpl * VA[kp] + end + end + end + for jp in 1:lp + j = JRA[jp] + for kp in nzrange(Pr,j) + k = JPr[kp] + if xbC[k] != i + xbC[k] = i + xC[k] = xRA[j]*VPr[kp] + else + xC[k] += xRA[j]*VPr[kp] + end + end + end + for ind in nzrange(C,i) + j = JC[ind] + if xbC[j] == i + VC[ind] += α*xC[j] + end + end + end + C +end + +# RARt variants +function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiA}, + A::SparseMatrixCSR{Bi,Tv,TiB}, + Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiC}}) where {Bi,Tv,TiA,TiB,TiC} + p,q = size(Pl) + m,r = size(A) + n,s = size(Prt) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions for RA*P: ($p,$r)*($n,$s)"));end + RAP(Pl,A,copy(Prt)) +end + +function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiA}, + A::SparseMatrixCSR{Bi,Tv,TiB}, + Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiC}},cache) where {Bi,Tv,TiA,TiB,TiC} + p,q = size(Pl) + m,r = size(A) + n,s = size(Prt) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions for RA*P: ($p,$r)*($n,$s)"));end + RAP(Pl,A,copy(Prt),cache) +end + +function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC}, + Pl::SparseMatrixCSR{Bi,Tv,TiPl}, + A::SparseMatrixCSR{Bi,Tv,TiA}, + Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiPr}}, + cache) where {Bi,Tv,TiC,TiPl,TiA,TiPr} + p,q = size(Pl) + m,r = size(A) + n,s = size(Prt) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end + Pr = Prt.parent + JPl = colvals(Pl) + VPl = nonzeros(Pl) + JA = colvals(A) + VA = nonzeros(A) + IPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. 
+ VPr = nonzeros(Pr) + JC = colvals(C) + VC = nonzeros(C) + # some cache items are present with the regular RAP product in mind, which is how the allocating verison is performed + xb,x = cache + xb .= 0 + for i in 1:p + # loop over columns "j" in row i of A + for jp in nzrange(Pl, i) + j = JPl[jp] + vpl = VPl[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xb[k] != i + xb[k] = i + x[k] = vpl * VA[kp] + else + x[k] += vpl * VA[kp] + end + end + end + for jpPr in nzrange(C,i) + jPr = JC[jpPr] + v = Tv(0) + for ip in nzrange(Pr,jPr) + iPr = IPr[ip] + if xb[iPr] == i + v += x[iPr]*VPr[ip] + end + end + VC[jpPr] = v + end + end + C +end + +function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC}, + Pl::SparseMatrixCSR{Bi,Tv,TiPl}, + A::SparseMatrixCSR{Bi,Tv,TiA}, + Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiPr}}, + α::Number, + β::Number, + cache) where {Bi,Tv,TiC,TiPl,TiA,TiPr} + p,q = size(Pl) + m,r = size(A) + n,s = size(Prt) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end + Pr = Prt.parent + JPl = colvals(Pl) + VPl = nonzeros(Pl) + JA = colvals(A) + VA = nonzeros(A) + IPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. + VPr = nonzeros(Pr) + JC = colvals(C) + VC = nonzeros(C) + VC .*= β + # some cache items are present with the regular RAP product in mind, which is how the allocating verison is performed + xb,x = cache + xb .= 0 + for i in 1:p + # loop over columns "j" in row i of A + for jp in nzrange(Pl, i) + j = JPl[jp] + vpl = VPl[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
+ if xb[k] != i + xb[k] = i + x[k] = vpl * VA[kp] + else + x[k] += vpl * VA[kp] + end + end + end + for jpPr in nzrange(C,i) + jPr = JC[jpPr] + v = Tv(0) + for ip in nzrange(Pr,jPr) + iPr = IPr[ip] + if xb[iPr] == i + v += x[iPr]*VPr[ip] + end + end + VC[jpPr] += α*v + end + end + C +end + +### CSC in terms of CSR +function RAP(A::SparseMatrixCSC{Tv,TiA}, + B::SparseMatrixCSC{Tv,TiB}, + C::SparseMatrixCSC{Tv,TiC}) where {Tv,TiA,TiB,TiC} + D,cache = RAP(ascsr(C),ascsr(B),ascsr(A)) + ascsc(D),cache +end + +function RAP(A::SparseMatrixCSC{Tv,TiA}, + B::SparseMatrixCSC{Tv,TiB}, + C::SparseMatrixCSC{Tv,TiC}, + cache) where {Tv,TiA,TiB,TiC} + D,new_cache = RAP(ascsr(C),ascsr(B),ascsr(A),cache) + ascsc(D),new_cache +end + +function RAP!(D::SparseMatrixCSC{Tv,TiD}, + A::SparseMatrixCSC{Tv,TiA}, + B::SparseMatrixCSC{Tv,TiB}, + C::SparseMatrixCSC{Tv,TiC}, + cache) where {Tv,TiD,TiA,TiB,TiC} + RAP!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),cache) + D +end + +function RAP!(D::SparseMatrixCSC{Tv,TiD}, + A::SparseMatrixCSC{Tv,TiA}, + B::SparseMatrixCSC{Tv,TiB}, + C::SparseMatrixCSC{Tv,TiC}, + cache::JaggedArray{X,Y} where {X<:Integer, Y<:Integer}, + acc) where {Tv,TiD,TiA,TiB,TiC} + RAP!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),cache,acc) + D +end + +function RAP!(D::SparseMatrixCSC{Tv,TiD}, + A::SparseMatrixCSC{Tv,TiA}, + B::SparseMatrixCSC{Tv,TiB}, + C::SparseMatrixCSC{Tv,TiC}, + α::Number, + β::Number, + cache) where {Tv,TiD,TiA,TiB,TiC} + RAP!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),α,β,cache) + D +end + +function RAP!(D::SparseMatrixCSC{Tv,TiD}, + A::SparseMatrixCSC{Tv,TiA}, + B::SparseMatrixCSC{Tv,TiB}, + C::SparseMatrixCSC{Tv,TiC}, + α::Number, + β::Number, + cache::JaggedArray{X,Y} where {X <: Integer, Y<:Integer}, + acc) where {Tv,TiD,TiA,TiB,TiC} + RAP!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),α,β,cache,acc) + D +end + +# PtAP +function RAP(A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}}, + B::SparseMatrixCSC{Tv,TiB}, + C::SparseMatrixCSC{Tv,TiC}) where {Tv,TiA,TiB,TiC} + D,cache = 
RAP(ascsr(C),ascsr(B),transpose(ascsr(A.parent))) + ascsc(D),cache +end + +function RAP(A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}}, + B::SparseMatrixCSC{Tv,TiB}, + C::SparseMatrixCSC{Tv,TiC}, + cache) where {Tv,TiA,TiB,TiC} + D,cache = RAP(ascsr(C),ascsr(B),transpose(ascsr(A.parent)),cache) + ascsc(D),cache +end + +function RAP!(D::SparseMatrixCSC{Tv,TiD}, + A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}}, + B::SparseMatrixCSC{Tv,TiB}, + C::SparseMatrixCSC{Tv,TiC}, + cache) where {Tv,TiD,TiA,TiB,TiC} + RAP!(ascsr(D),ascsr(C),ascsr(B),transpose(ascsr(A.parent)),cache) + D +end + +function RAP!(D::SparseMatrixCSC{Tv,TiD}, + A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}}, + B::SparseMatrixCSC{Tv,TiB}, + C::SparseMatrixCSC{Tv,TiC}, + α::Number, + β::Number, + cache) where {Tv,TiD,TiA,TiB,TiC} + RAP!(ascsr(D),ascsr(C),ascsr(B),transpose(ascsr(A.parent)),α,β,cache) + D +end + +# RARt +function RAP(A::SparseMatrixCSC{Tv,Ti}, + B::SparseMatrixCSC{Tv,Ti}, + C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}) where {Tv,Ti<:Integer} + D,new_cache = RAP(transpose(ascsr(C.parent)),ascsr(B),ascsr(A)) + ascsc(D),new_cache +end +function RAP(A::SparseMatrixCSC{Tv,Ti}, + B::SparseMatrixCSC{Tv,Ti}, + C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, + cache) where {Tv,Ti<:Integer} + D,new_cache = RAP(transpose(ascsr(C.parent)),ascsr(B),ascsr(A),cache) + ascsc(D),new_cache +end + +function RAP!(D::SparseMatrixCSC{Tv,Ti}, + A::SparseMatrixCSC{Tv,Ti}, + B::SparseMatrixCSC{Tv,Ti}, + C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, + cache) where {Tv,Ti<:Integer} + RAP!(ascsr(D),transpose(ascsr(C.parent)),ascsr(B),ascsr(A),cache) + D +end + +function RAP!(D::SparseMatrixCSC{Tv,Ti}, + A::SparseMatrixCSC{Tv,Ti}, + B::SparseMatrixCSC{Tv,Ti}, + C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, + α::Number, + β::Number, + cache) where {Tv,Ti<:Integer} + RAP!(ascsr(D),transpose(ascsr(C.parent)),ascsr(B),ascsr(A),α,β,cache) + D +end \ No newline at end of file diff --git a/src/sparse_utils.jl b/src/sparse_utils.jl index 0b1d6fa2..08e0bd11 100644 
--- a/src/sparse_utils.jl +++ b/src/sparse_utils.jl @@ -431,40 +431,6 @@ end # A #end -# Variants for findnz() that only allocates memory for the conversion of the pointer array to an index array. -function findnz_minimal(A::SparseMatrixCSC) - J = ptr_to_coo(A.colptr) - rowvals(A),J,nonzeros(A) -end -function findnz_minimal(A::SparseMatrixCSR) - I = ptr_to_coo(A.rowptr) - I,colvals(A),nonzeros(A) -end - -# Behaves like findnz, but without copying the values. -function find_indices(A::SparseMatrixCSC) - I,J,_ = findnz_minimal(A) - copy(I),J -end -function find_indices(A::SparseMatrixCSR) - I,J,_ = findnz_minimal(A) - I,copy(J) -end - -# Could be optimized by a two-way merge-like method when A is a guaranteed submatrix of C. -function precompute_nzindex(C::AbstractSparseArray,A::AbstractSparseArray) - I,J,_ = findnz_minimal(A) - K = similar(I) - K .= 0 - for (p,(i,j)) in enumerate(zip(I,J)) - if i < 1 || j < 1 - continue - end - K[p] = nzindex(C,i,j) - end - K -end - function precompute_nzindex(A,I,J) K = zeros(Int32,length(I)) for (p,(i,j)) in enumerate(zip(I,J)) @@ -476,25 +442,14 @@ function precompute_nzindex(A,I,J) K end -# Reuse I vector as K vector. -# function precompute_nzindex!(I,A,J) -# for (p,(i,j)) in enumerate(zip(I,J)) -# if i < 1 || j < 1 -# continue -# end -# I[p] = nzindex(A,i,j) -# end -# I -# end - function precompute_nzindex!(K, A, I, J) for (p, (i, j)) in enumerate(zip(I, J)) if i < 1 || j < 1 continue end K[p] = nzindex(A, i, j) - end - + end +end function sparse_matrix!(A,V,K;reset=true) if reset @@ -510,7 +465,6 @@ function sparse_matrix!(A,V,K;reset=true) A end - # Notation # csrr: csr with repeated and unsorted columns # csru: csr with unsorted columns @@ -734,6 +688,43 @@ function spmv_csc!(b,x,colptr_A,rowval_A,nzval_A) b end +################ NEW ################ + +# Variants for findnz() that only allocates memory for the conversion of the pointer array to an index array. +# Only use for read-only operations. 
+function findnz_minimal(A::SparseMatrixCSC) + J = ptr_to_coo(A.colptr) + rowvals(A),J,nonzeros(A) +end +function findnz_minimal(A::SparseMatrixCSR) + I = ptr_to_coo(A.rowptr) + I,colvals(A),nonzeros(A) +end + +# Behaves like findnz, but without the values. +function find_indices(A::SparseMatrixCSC) + I,J,_ = findnz_minimal(A) + copy(I),J +end +function find_indices(A::SparseMatrixCSR) + I,J,_ = findnz_minimal(A) + I,copy(J) +end + +# TODO Could be done without binary searches from nzindex(...), when it is known that A and C are ordered, and A is a guaranteed submatrix of C. +function precompute_nzindex(C::AbstractSparseArray,A::AbstractSparseArray) + I,J,_ = findnz_minimal(A) + K = similar(I) + K .= 0 + for (p,(i,j)) in enumerate(zip(I,J)) + if i < 1 || j < 1 + continue + end + K[p] = nzindex(C,i,j) + end + K +end + function expand_sparse_matrix_columns(A::SparseMatrixCSR{Bi,Tv,Ti} where {Tv, Ti}, n) where Bi p,q = size(A) @assert n >= q @@ -773,23 +764,6 @@ function Base.copy(At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}} where {Bi,Tv,Ti}) ascsr(Acsc_T) end -function SparseMatricesCSR.sparsecsr(A::SparseMatrixCSC) - sparsecsr(findnz(A)..., size(A)...) -end - -function SparseMatricesCSR.sparsecsr(At::Transpose) - transpose(sparsecsr(At.parent)) -end - -function SparseMatricesCSR.sparsecsr(A::SparseMatrixCSR) - A -end - -function SparseMatricesCSR.sparsecsr(T::Type, A::SparseMatrixCSC) - compresscoo(T,findnz(A)..., size(A)...) 
-end - - function pointer_array(A::SparseMatrixCSR) A.rowptr end diff --git a/test/debug_array/runtests.jl b/test/debug_array/runtests.jl index 2c1a61ab..a175b722 100644 --- a/test/debug_array/runtests.jl +++ b/test/debug_array/runtests.jl @@ -23,4 +23,6 @@ using PartitionedArrays @testset "fem_example" begin include("fem_example.jl") end +@testset "spmtmm_tests" begin include("spmtmm_tests.jl") end + end #module diff --git a/test/debug_array/spmtmm_tests.jl b/test/debug_array/spmtmm_tests.jl new file mode 100644 index 00000000..384aeb21 --- /dev/null +++ b/test/debug_array/spmtmm_tests.jl @@ -0,0 +1,14 @@ +module DebugArraySpMtMMTests + +using PartitionedArrays +using SparseArrays + +include(joinpath("..","primitives_tests.jl")) + +M = sparse(1:5,1:5,1:5) +@test nnz(M-M) == nnz(M) +display(M-M) + +with_debug(primitives_tests) + +end # module diff --git a/test/mpi_array/runtests.jl b/test/mpi_array/runtests.jl index 26a3a5d3..ffdc1f1e 100644 --- a/test/mpi_array/runtests.jl +++ b/test/mpi_array/runtests.jl @@ -13,5 +13,6 @@ using PartitionedArrays @testset "p_timer_tests" begin include("p_timer_tests.jl") end @testset "fdm_example" begin include("fdm_example.jl") end @testset "fem_example" begin include("fem_example.jl") end +@testset "spmtmm_tests" begin include("spmtmm_tests.jl") end end #module diff --git a/test/mpi_array/spmtmm_tests.jl b/test/mpi_array/spmtmm_tests.jl new file mode 100644 index 00000000..c9063604 --- /dev/null +++ b/test/mpi_array/spmtmm_tests.jl @@ -0,0 +1,4 @@ +using MPI +include("run_mpi_driver.jl") +file = joinpath(@__DIR__,"drivers","spmtmm_tests.jl") +run_mpi_driver(file;procs=4) diff --git a/test/spmtmm_tests.jl b/test/spmtmm_tests.jl new file mode 100644 index 00000000..8e162bd3 --- /dev/null +++ b/test/spmtmm_tests.jl @@ -0,0 +1,216 @@ +module SpMtMMTests + +using SparseArrays +using SparseMatricesCSR +using PartitionedArrays +using LinearAlgebra +using Test + +# Equality definition for SparseCSC and SparseCSR. 
If the size and lengths match, the CSR matrix is converted +function strictly_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSR) + if size(A) != size(B) && return false; end + if length(nonzeros(A)) != length(nonzeros(B)) && return false; end + Bcsc = sparse(findnz(B)...,size(B)...) # rebuild B in CSC form so both operands are compared in the same storage order + if (A.colptr != Bcsc.colptr || rowvals(A) != rowvals(Bcsc)) && return false; end # colptr must match too: equal rowvals alone do not fix the sparsity pattern + if nonzeros(A) != nonzeros(Bcsc) && return false; end + true +end + +function strictly_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSC) + if size(A) != size(B) && return false; end + if length(nonzeros(A)) != length(nonzeros(B)) && return false; end + if A.colptr != B.colptr && return false; end + if rowvals(A) != rowvals(B) && return false; end + if nonzeros(A) != nonzeros(B) && return false; end + true +end + +function strictly_equivalent(A::SparseMatrixCSR, B::SparseMatrixCSR) + if size(A) != size(B) && return false; end + if length(nonzeros(A)) != length(nonzeros(B)) && return false; end + if A.rowptr != B.rowptr && return false; end + if colvals(A) != colvals(B) && return false; end + if nonzeros(A) != nonzeros(B) && return false; end + true +end + +function strictly_equivalent(A::SparseMatrixCSR,B::SparseMatrixCSC) strictly_equivalent(B,A) end + +# Approximate equality for SparseCSC and SparseCSR: the structure must match exactly, the values only approximately. The CSR matrix is converted to CSC before comparing. +function approx_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSR,args...) + if size(A) != size(B) && return false; end + if length(nonzeros(A)) != length(nonzeros(B)) && return false; end + Bcsc = sparse(findnz(B)...,size(B)...) + if A.colptr != Bcsc.colptr && return false; end + if rowvals(A) != rowvals(Bcsc) && return false; end + if !isapprox(nonzeros(A),nonzeros(Bcsc),args...) && return false; end # compare against the CSC-ordered values, not B's row-major ones + true +end + +function approx_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSC,args...) 
+ if size(A) != size(B) && return false; end + if length(nonzeros(A)) != length(nonzeros(B)) && return false; end + if A.colptr != B.colptr && return false; end + if rowvals(A) != rowvals(B) && return false; end + if !isapprox(nonzeros(A),nonzeros(B),args...) && return false; end + true +end + +# Structurally A and B must be equal, but numerically the can be approximately equal +function approx_equivalent(A::SparseMatrixCSR, B::SparseMatrixCSR,args...) + if size(A) != size(B) && return false; end + if length(nonzeros(A)) != length(nonzeros(B)) && return false; end + if A.rowptr != B.rowptr && return false; end + if colvals(A) != colvals(B) && return false; end + if !isapprox(nonzeros(A),nonzeros(B),args...) && return false; end + true +end + +function approx_equivalent(A::SparseMatrixCSR,B::SparseMatrixCSC) strictly_equivalent(B,A) end + + +function parallel_tests(pA,pB,sparse_func) + A = centralize(sparse_func,pA) + B = centralize(sparse_func,pB) + # explicit parallel transpose + + pBt = explicit_transpose(pB) |> fetch + Bt = centralize(sparse_func,pBt) + @test Bt == copy(transpose(B)) + hp_B = halfperm(B) + B_struct = symbolic_halfperm(B) + @test pointer_array(hp_B) == B_struct.ptrs + @test index_array(hp_B) == B_struct.data + @test Bt == hp_B + + pBt_local,t = explicit_transpose(pB,reuse=true) + pBt, transpose_cache = fetch(t) + Bt = centralize(sparse_func,pBt) + @test Bt == copy(transpose(B)) + hp_B = halfperm(B) + @test Bt == hp_B + + t = explicit_transpose!(pBt,pBt_local,pB,transpose_cache) + wait(t) + Bt = centralize(sparse_func,pBt) + @test Bt == copy(transpose(B)) + hp_B = halfperm(B) + @test Bt == hp_B + + AB0 = A*B + C0 = transpose(B)*AB0 + # test basic sequential csr implementations to default csc sequential implementations. 
+ pAB,cacheAB = spmm(pA,pB,reuse=true) + AB = centralize(sparse_func,pAB) + @test approx_equivalent(AB,AB0) + + # pB will be transposed internally + pC,cacheC = spmtm(pB,pAB,reuse=true) + C = centralize(sparse_func,pC) + @test approx_equivalent(C,C0) + spmm!(pAB,pA,pB,cacheAB) + AB = centralize(sparse_func,pAB) + + @test approx_equivalent(AB,AB0) + spmtm!(pC,pB,pAB,cacheC) + C = centralize(sparse_func,pC) + @test approx_equivalent(C,C0) + + pC,cacheC = spmtmm(pB,pA,pB,reuse=true) + C = centralize(sparse_func,pC) + @test approx_equivalent(C,C0) + + spmtmm!(pC,pB,pA,pB,cacheC) + C = centralize(sparse_func,pC) + @test approx_equivalent(C,C0) + + # test basic sequential csr implementations to default csc sequential implementations. + pC,cacheC = spmm(pBt,pAB,reuse=true) + C = centralize(sparse_func,pC) + @test approx_equivalent(C,C0) + spmm!(pC,pBt,pAB,cacheC) + C = centralize(sparse_func,pC) + @test approx_equivalent(C,C0) + + # pB will be transposed internally + pC,cacheC = spmmm(pBt,pA,pB,reuse=true) + C = centralize(sparse_func,pC) + @test approx_equivalent(C,C0) + spmmm!(pC,pBt,pA,pB,cacheC) + C = centralize(sparse_func,pC) + @test approx_equivalent(C,C0) + + # unequal sizes backward (small to large) + if size(pA) != size(pB) + CB0 = C0*Bt + D0 = transpose(Bt)*CB0 + pCB,cacheCB = spmm(pC,pBt,reuse=true) + CB = centralize(sparse_func,pCB) + @test approx_equivalent(CB,CB0) + + pD,cacheD = spmtm(pBt,pCB,reuse=true) + D = centralize(sparse_func,pD) + @test approx_equivalent(D,D0) + spmm!(pCB,pC,pBt,cacheCB) + CB = centralize(sparse_func,pCB) + @test approx_equivalent(CB,CB0) + spmtm!(pD,pBt,pCB,cacheD) + D = centralize(sparse_func,pD) + @test approx_equivalent(D,D0) + + pD,cacheD = spmtmm(pBt,pC,pBt,reuse=true) + D = centralize(sparse_func,pD) + @test approx_equivalent(D,D0) + spmtmm!(pD,pBt,pC,pBt,cacheD) + D = centralize(sparse_func,pD) + @test approx_equivalent(D,D0) + + pD,cacheD = spmm(pB,pCB,reuse=true) + D = centralize(sparse_func,pD) + @test 
approx_equivalent(D,D0) + + pD,cacheD = spmmm(pB,pC,pBt,reuse=true) + D = centralize(sparse_func,pD) + @test approx_equivalent(D,D0) + spmmm!(pD,pB,pC,pBt,cacheD) + D = centralize(sparse_func,pD) + @test approx_equivalent(D,D0) + end +end + +function spmtmm_tests(distribute) + nodes_per_dir = (5,5,5) + parts_per_dir = (1,2,2) + np = prod(parts_per_dir) + ranks = distribute(LinearIndices((np,))) + Ti = Int32 + pA = psparse(sparsecsr,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = Ti)...) |> fetch + pB = pA + parallel_tests(pA,pB,sparsecsr) + + T = eltype(typeof(own_own_values(pA).items)) + + pB = prolongator(T,pA) + B = centralize(T,pB) + sequential_tests(pA,pB) + + #### CSC + do_CSC = true + if do_CSC + pA = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = Ti)...) |> fetch + T = eltype(typeof(own_own_values(pA).items)) + + pB = pA + parallel_tests(pA,pB,sparse) + + parallel_time(pA,pB,sparse) + T = eltype(typeof(own_own_values(pA).items)) + pB = prolongator(T,pA) + B = centralize(T,pB) + parallel_tests(pA,pB,sparse) + end +end + +end # module +; + diff --git a/times.txt b/times.txt index 53764f74..f7794112 100644 --- a/times.txt +++ b/times.txt @@ -1 +1 @@ -Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2068526, max = 0.2068526, avg = 0.2068526), "Phase 3" => (min = 0.0, max = 0.0, avg = 0.0), "Matrix Assembly" => (min = 0.4054487, max = 0.4054487, avg = 0.4054487), "Phase 1" => (min = 1.1e-6, max = 1.1e-6, avg = 1.1e-6)) +Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2174075, max = 0.2174075, avg = 0.2174075), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4207836, max = 0.4207836, avg = 0.4207836), "Phase 1" => (min = 1.4e-6, max = 1.4e-6, avg = 1.4e-6)) From 33a36ada74c23b6b4550a886bbc6b2519c4aadd9 Mon Sep 17 00:00:00 2001 From: jop611 Date: Mon, 27 Jan 2025 17:45:51 +0100 Subject: [PATCH 
23/34] fixed tests, add missing symbolic_halfperm methods. --- src/PartitionedArrays.jl | 7 +++ src/gallery.jl | 2 +- src/sparse_utils.jl | 73 ++++++++++++++++++++++++++- test/debug_array/spmtmm_tests.jl | 13 +++-- test/mpi_array/runtests.jl | 20 ++++---- test/spmtmm_tests.jl | 87 ++++++-------------------------- times.txt | 2 +- 7 files changed, 114 insertions(+), 90 deletions(-) diff --git a/src/PartitionedArrays.jl b/src/PartitionedArrays.jl index 8505d709..2d250e0a 100644 --- a/src/PartitionedArrays.jl +++ b/src/PartitionedArrays.jl @@ -25,6 +25,12 @@ export compresscoo export indextype export sparse_matrix export sparse_matrix! +export index_array +export pointer_array +export halfperm +export halfperm! +export symbolic_halfperm +export symbolic_halfperm! include("sparse_utils.jl") export linear_indices @@ -202,6 +208,7 @@ export node_coordinates_unit_cube export nullspace_linear_elasticity export nullspace_linear_elasticity! export near_nullspace_linear_elasticity +export prolongator include("gallery.jl") export RAP diff --git a/src/gallery.jl b/src/gallery.jl index 0757ad9d..06933725 100644 --- a/src/gallery.jl +++ b/src/gallery.jl @@ -586,4 +586,4 @@ function nullspace_linear_elasticity!(B,x) error("case not implemented") end B -end \ No newline at end of file +end diff --git a/src/sparse_utils.jl b/src/sparse_utils.jl index 08e0bd11..ea12f3f9 100644 --- a/src/sparse_utils.jl +++ b/src/sparse_utils.jl @@ -799,6 +799,15 @@ function find_max_row_length(A::SparseMatrixCSR) max_rA end +function find_max_row_length(A::JaggedArray) + max_rA = 0 + for i in 1:length(A.ptrs)-1 + l = length(jagged_range(A,i)) + max_rA = max_rA > l ? 
max_rA : l + end + max_rA +end + function find_max_col_length(A::SparseMatrixCSC) max_cA = 0 for j in 1:size(A,2) @@ -822,7 +831,6 @@ function ascsc(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} SparseMatrixCSC{Tv,Ti}(q,p,A.rowptr,colvals(A),nonzeros(A)) end - function halfperm(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} q = size(A,2) JA,VA = colvals(A),nonzeros(A) @@ -905,4 +913,65 @@ function counts_to_ptrs!(v) foreach(i->v[i]+=v[i-1],2:l) shift_by_one!(v) v[1] = 1 -end \ No newline at end of file +end + +function symbolic_halfperm(A::SparseMatrixCSR) + q = size(A,2) + JA = colvals(A) + IAt,JAt = similar(A.rowptr,q+1),similar(JA) + symbolic_halfperm!(IAt,JAt,A) +end + +# transpose A into At using vectors IAt,JAt, and VAt +function symbolic_halfperm!(IAt,JAt,A::SparseMatrixCSR) + JA= colvals(A) + p,q = size(A) + count_occurrences!(IAt,JA) + counts_to_ptrs!(IAt) + shift_by_one!(IAt) + for i in 1:p + for jp in nzrange(A,i) + j = JA[jp] + jpt = IAt[j+1] + JAt[jpt] = i + IAt[j+1] = jpt+1 + end + end + IAt[1] = 1 + JaggedArray(JAt,IAt) +end + +# transpose A into At using vectors IAt,JAt, and VAt +function symbolic_halfperm!(JAt,IAt,A::SparseMatrixCSC) + symbolic_halfperm!(JAt,IAt,ascsr(A)) +end + +function symbolic_halfperm(A::SparseMatrixCSC) + symbolic_halfperm(ascsr(A)) +end + +# retranspose At back into A +function symbolic_halfperm!(A::SparseMatrixCSR,At::JaggedArray) + IA,JA = pointer_array(A),index_array(A) + JAt = At.data + # p = size(A,1) + shift_by_one!(IA) # pointer to row 1 must be located at IA[2], row 2 at IA[3] etc. 
+ IA[1] = 1 + for i in 1:size(A,2) + for jpt in jagged_range(At,i) + j = JAt[jpt] + jp = IA[j+1] + JA[jp] = i + IA[j+1] = jp+1 + end + end + A +end + +# retranspose At back into A +function symbolic_halfperm!(A::SparseMatrixCSC,At::JaggedArray) + symbolic_halfperm!(ascsr(A),At) + A +end + + diff --git a/test/debug_array/spmtmm_tests.jl b/test/debug_array/spmtmm_tests.jl index 384aeb21..ba3cf431 100644 --- a/test/debug_array/spmtmm_tests.jl +++ b/test/debug_array/spmtmm_tests.jl @@ -1,14 +1,19 @@ module DebugArraySpMtMMTests using PartitionedArrays -using SparseArrays +using Test -include(joinpath("..","primitives_tests.jl")) +include(joinpath("..","spmtmm_tests.jl")) -M = sparse(1:5,1:5,1:5) +v = 1:5 +M = sparse(v,v,v) @test nnz(M-M) == nnz(M) display(M-M) -with_debug(primitives_tests) +M = sparsecsr(v,v,v) +@test nnz(M-M) == nnz(M) +display(M-M) + +with_debug(spmtmm_tests) end # module diff --git a/test/mpi_array/runtests.jl b/test/mpi_array/runtests.jl index ffdc1f1e..fc6f0aee 100644 --- a/test/mpi_array/runtests.jl +++ b/test/mpi_array/runtests.jl @@ -3,16 +3,16 @@ module MPIArrayRunTests using Test using PartitionedArrays -@testset "mpi_array" begin include("mpi_array_tests.jl") end -@testset "primitives" begin include("primitives_tests.jl") end -@testset "p_range_tests" begin include("p_range_tests.jl") end -@testset "p_vector_tests" begin include("p_vector_tests.jl") end -@testset "p_sparse_matrix_tests" begin include("p_sparse_matrix_tests.jl") end -@testset "gallery" begin include("gallery_tests.jl") end -@testset "block_arrays" begin include("block_arrays_tests.jl") end -@testset "p_timer_tests" begin include("p_timer_tests.jl") end -@testset "fdm_example" begin include("fdm_example.jl") end -@testset "fem_example" begin include("fem_example.jl") end +# @testset "mpi_array" begin include("mpi_array_tests.jl") end +# @testset "primitives" begin include("primitives_tests.jl") end +# @testset "p_range_tests" begin include("p_range_tests.jl") end +# @testset 
"p_vector_tests" begin include("p_vector_tests.jl") end +# @testset "p_sparse_matrix_tests" begin include("p_sparse_matrix_tests.jl") end +# @testset "gallery" begin include("gallery_tests.jl") end +# @testset "block_arrays" begin include("block_arrays_tests.jl") end +# @testset "p_timer_tests" begin include("p_timer_tests.jl") end +# @testset "fdm_example" begin include("fdm_example.jl") end +# @testset "fem_example" begin include("fem_example.jl") end @testset "spmtmm_tests" begin include("spmtmm_tests.jl") end end #module diff --git a/test/spmtmm_tests.jl b/test/spmtmm_tests.jl index 8e162bd3..d8d4e658 100644 --- a/test/spmtmm_tests.jl +++ b/test/spmtmm_tests.jl @@ -1,52 +1,9 @@ -module SpMtMMTests - using SparseArrays using SparseMatricesCSR using PartitionedArrays using LinearAlgebra using Test -# Equality definition for SparseCSC and SparseCSR. If the size and lengths match, the CSR matrix is converted -function strictly_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSR) - if size(A) != size(B) && return false; end - if length(nonzeros(A)) != length(nonzeros(B)) && return false; end - Bcsc = sparse(findnz(B)...,size(B)...) 
- if rowvals(A) != rowvals(Bcsc) && return false; end - if nonzeros(A) != nonzeros(Bcsc) && return false; end - true -end - -function strictly_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSC) - if size(A) != size(B) && return false; end - if length(nonzeros(A)) != length(nonzeros(B)) && return false; end - if A.colptr != B.colptr && return false; end - if rowvals(A) != rowvals(B) && return false; end - if nonzeros(A) != nonzeros(B) && return false; end - true -end - -function strictly_equivalent(A::SparseMatrixCSR, B::SparseMatrixCSR) - if size(A) != size(B) && return false; end - if length(nonzeros(A)) != length(nonzeros(B)) && return false; end - if A.rowptr != B.rowptr && return false; end - if colvals(A) != colvals(B) && return false; end - if nonzeros(A) != nonzeros(B) && return false; end - true -end - -function strictly_equivalent(A::SparseMatrixCSR,B::SparseMatrixCSC) strictly_equivalent(B,A) end - -# Equality definition for SparseCSC and SparseCSR. If the size and lengths match, the CSR matrix is converted -function approx_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSR,args...) - if size(A) != size(B) && return false; end - if length(nonzeros(A)) != length(nonzeros(B)) && return false; end - Bcsc = sparse(findnz(B)...,size(B)...) - if A.colptr != Bcsc.colptr && return false; end - if rowvals(A) != rowvals(Bcsc) && return false; end - if !isapprox(nonzeros(A),nonzeros(B),args...) && return false; end - true -end - function approx_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSC,args...) if size(A) != size(B) && return false; end if length(nonzeros(A)) != length(nonzeros(B)) && return false; end @@ -66,9 +23,6 @@ function approx_equivalent(A::SparseMatrixCSR, B::SparseMatrixCSR,args...) 
true end -function approx_equivalent(A::SparseMatrixCSR,B::SparseMatrixCSC) strictly_equivalent(B,A) end - - function parallel_tests(pA,pB,sparse_func) A = centralize(sparse_func,pA) B = centralize(sparse_func,pB) @@ -188,29 +142,18 @@ function spmtmm_tests(distribute) pB = pA parallel_tests(pA,pB,sparsecsr) - T = eltype(typeof(own_own_values(pA).items)) - - pB = prolongator(T,pA) - B = centralize(T,pB) - sequential_tests(pA,pB) - - #### CSC - do_CSC = true - if do_CSC - pA = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = Ti)...) |> fetch - T = eltype(typeof(own_own_values(pA).items)) - - pB = pA - parallel_tests(pA,pB,sparse) - - parallel_time(pA,pB,sparse) - T = eltype(typeof(own_own_values(pA).items)) - pB = prolongator(T,pA) - B = centralize(T,pB) - parallel_tests(pA,pB,sparse) - end -end - -end # module -; - + # Testing with a real prolongator requires PartitionedSolvers + # T = eltype(typeof(own_own_values(pA).items)) + # pB = prolongator(T,pA) + # parallel_tests(pA,pB,sparsecsr) + + #### CSC #### + pA = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = Ti)...) 
|> fetch + pB = pA + parallel_tests(pA,pB,sparse) + + # Testing with a real prolongator requires PartitionedSolvers + # T = eltype(typeof(own_own_values(pA).items)) + # pB = prolongator(T,pA) + # parallel_tests(pA,pB,sparse) +end \ No newline at end of file diff --git a/times.txt b/times.txt index f7794112..1f2c118d 100644 --- a/times.txt +++ b/times.txt @@ -1 +1 @@ -Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2174075, max = 0.2174075, avg = 0.2174075), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4207836, max = 0.4207836, avg = 0.4207836), "Phase 1" => (min = 1.4e-6, max = 1.4e-6, avg = 1.4e-6)) +Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2025824, max = 0.2025824, avg = 0.2025824), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4105116, max = 0.4105116, avg = 0.4105116), "Phase 1" => (min = 1.4e-6, max = 1.4e-6, avg = 1.4e-6)) From a1b0f8f1b6572ed0f8b148387137b96752bfd0fb Mon Sep 17 00:00:00 2001 From: jop611 Date: Mon, 27 Jan 2025 17:59:16 +0100 Subject: [PATCH 24/34] uncommented a test line. 
--- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index ed7aff49..92768453 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,6 +5,6 @@ using Test @testset "jagged_array" begin include("jagged_array_tests.jl") end @testset "sparse_utils" begin include("sparse_utils_tests.jl") end @testset "debug_array" begin include("debug_array/runtests.jl") end -# @testset "mpi_array" begin include("mpi_array/runtests.jl") end +@testset "mpi_array" begin include("mpi_array/runtests.jl") end end # module From c29b7d3246afb577c0f3ede3bf6833452501181a Mon Sep 17 00:00:00 2001 From: jop611 Date: Tue, 28 Jan 2025 13:10:18 +0100 Subject: [PATCH 25/34] Added spmtmm mpi driver to tests --- test/mpi_array/drivers/spmtmm_tests.jl | 10 ++++++++++ times.txt | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 test/mpi_array/drivers/spmtmm_tests.jl diff --git a/test/mpi_array/drivers/spmtmm_tests.jl b/test/mpi_array/drivers/spmtmm_tests.jl new file mode 100644 index 00000000..50c3668a --- /dev/null +++ b/test/mpi_array/drivers/spmtmm_tests.jl @@ -0,0 +1,10 @@ +module MPIArrayPrimitivesTests + +using PartitionedArrays + +include(joinpath("..","..","spmtmm_tests.jl")) + +with_mpi(spmtmm_tests) + +end # module + diff --git a/times.txt b/times.txt index 1f2c118d..2e3f93ca 100644 --- a/times.txt +++ b/times.txt @@ -1 +1 @@ -Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2025824, max = 0.2025824, avg = 0.2025824), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4105116, max = 0.4105116, avg = 0.4105116), "Phase 1" => (min = 1.4e-6, max = 1.4e-6, avg = 1.4e-6)) +Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2149521, max = 0.2149521, avg = 0.2149521), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4041403, max = 0.4041403, avg = 
0.4041403), "Phase 1" => (min = 4.0e-7, max = 4.0e-7, avg = 4.0e-7)) From 18b564e6b9fb372712b151443e6df5d1b1e5094b Mon Sep 17 00:00:00 2001 From: jop611 Date: Tue, 28 Jan 2025 13:19:53 +0100 Subject: [PATCH 26/34] uncommented old code for PartitionedSOlvers tests (rap(...) and rap!(...)). --- src/p_sparse_matrix.jl | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index e663b841..7b76a129 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -2263,13 +2263,13 @@ function sparse_diag_matrix(::Type{T},d::PVector,shape) where T end ### OLD ### -# function rap(R,A,P;reuse=Val(false)) -# Ac = R*A*P -# if val_parameter(reuse) -# return Ac, nothing -# end -# Ac -# end +function rap(R,A,P;reuse=Val(false)) + Ac = R*A*P + if val_parameter(reuse) + return Ac, nothing + end + Ac +end ### NEW ### function rap(R::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix;reuse=Val(false)) @@ -2281,12 +2281,12 @@ function rap(R::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix;reuse=Val(false) end ### OLD ### -# function rap!(Ac,R,A,P,cache) -# # TODO improve performance -# tmp = R*A*P -# copyto!(Ac,tmp) -# Ac -# end +function rap!(Ac,R,A,P,cache) + # TODO improve performance + tmp = R*A*P + copyto!(Ac,tmp) + Ac +end ### NEW ### function rap!(Ac::PSparseMatrix,R::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix,cache) From cddd0ad52ffd68d84a8b641737264428c2d15355 Mon Sep 17 00:00:00 2001 From: jop611 Date: Tue, 28 Jan 2025 13:22:49 +0100 Subject: [PATCH 27/34] changed RAP function names to rap for consistency --- src/p_sparse_matrix.jl | 64 +++++++------- src/sequential_implementations.jl | 142 +++++++++++++++--------------- 2 files changed, 103 insertions(+), 103 deletions(-) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index 7b76a129..e8948c54 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -3138,8 +3138,8 @@ function 
spmtmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fal Aog = own_ghost_values(A) Bog = own_ghost_values(B) - Doo1, Doo_cache = map((A,B,C)->RAP(transpose(A),B,C), Aoo,Boo,Cog) |> tuple_of_arrays - Dgo1, Dgo_cache = map((A,B,C)->RAP(transpose(A),B,C), Aog,Boo,Cog) |> tuple_of_arrays + Doo1, Doo_cache = map((A,B,C)->rap(transpose(A),B,C), Aoo,Boo,Cog) |> tuple_of_arrays + Dgo1, Dgo_cache = map((A,B,C)->rap(transpose(A),B,C), Aog,Boo,Cog) |> tuple_of_arrays # Collect ghost rows from P before continuing C2, consistencyCache = fetch(consistency_task) @@ -3148,9 +3148,9 @@ function spmtmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fal Cgo = ghost_own_values(C2) Cgg = ghost_ghost_values(C2) - Dgo2, Dgo_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aog,Bog,Cgo,Dgo_cache) |> tuple_of_arrays - Dog1, Dog_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aog,Boo,Cog2,Dgo_cache) |> tuple_of_arrays - Dog2, Dog_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aog,Bog,Cgg,Dog_cache) |> tuple_of_arrays + Dgo2, Dgo_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aog,Bog,Cgo,Dgo_cache) |> tuple_of_arrays + Dog1, Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aog,Boo,Cog2,Dgo_cache) |> tuple_of_arrays + Dog2, Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aog,Bog,Cgg,Dog_cache) |> tuple_of_arrays Dgo = map(+,Dgo1,Dgo2) # different sparsity patterns so not in-place. 
Dog = map(+,Dog1,Dog2) @@ -3164,9 +3164,9 @@ function spmtmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fal D1_unassembled = PSparseMatrix(D1_values, partition(axes(C,2)), partition(axes(C2,2)), false) assembly_task = assemble(D1_unassembled, reuse=true) - Dog1, Dog_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aoo,Boo,Cog2,Doo_cache) |> tuple_of_arrays - Doo2,Doo_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aoo,Bog,Cgo,Doo_cache) |> tuple_of_arrays - Dog2,Dog_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aoo,Bog,Cgg,Dog_cache) |> tuple_of_arrays + Dog1, Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aoo,Boo,Cog2,Doo_cache) |> tuple_of_arrays + Doo2,Doo_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aoo,Bog,Cgo,Doo_cache) |> tuple_of_arrays + Dog2,Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aoo,Bog,Cgg,Dog_cache) |> tuple_of_arrays Doo = map(+,Doo1,Doo2) Dog = map(+,Dog1,Dog2) @@ -3216,8 +3216,8 @@ function spmtmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,C::PSparseMa Aog = own_ghost_values(A) Bog = own_ghost_values(B) - map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,cache), Doo,Aoo,Boo,Coo,Doo_cache) - map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,cache), Dgo,Aog,Boo,Coo,Dgo_cache) + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,cache), Doo,Aoo,Boo,Coo,Doo_cache) + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,cache), Dgo,Aog,Boo,Coo,Dgo_cache) # Collect ghost rows from P before continuing wait(consistency_task) @@ -3225,16 +3225,16 @@ function spmtmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,C::PSparseMa Cgo = ghost_own_values(C2) Cgg = ghost_ghost_values(C2) - map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,cache), Dgg,Aog,Boo,Cog2,Dgg_cache) + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,cache), Dgg,Aog,Boo,Cog2,Dgg_cache) - map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,1,1,cache), Dgo,Aog,Bog,Cgo,Dgo_cache) - 
map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,1,1,cache), Dgg,Aog,Bog,Cgg,Dgg_cache) + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,1,1,cache), Dgo,Aog,Bog,Cgo,Dgo_cache) + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,1,1,cache), Dgg,Aog,Bog,Cgg,Dgg_cache) assembly_task = assemble!(D1, D1_unassembled, assemblyCache) - map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,1,1,cache), Doo,Aoo,Bog,Cgo,Doo_cache) - map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,cache), Dog,Aoo,Boo,Cog2,Dog_cache) - map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,1,1,cache), Dog,Aoo,Bog,Cgg,Dog_cache) + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,1,1,cache), Doo,Aoo,Bog,Cgo,Doo_cache) + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,cache), Dog,Aoo,Boo,Cog2,Dog_cache) + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,1,1,cache), Dog,Aoo,Bog,Cgg,Dog_cache) wait(assembly_task) add!(D, D1, D2, mergeCache) @@ -3252,7 +3252,7 @@ function spmmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fals Boo = own_own_values(B) Coo = own_own_values(C) - Doo1,Doo_cache = map(RAP,Aoo,Boo,Coo) |> tuple_of_arrays + Doo1,Doo_cache = map(rap,Aoo,Boo,Coo) |> tuple_of_arrays B2, Bcache = fetch(B2_task) C2_task = consistent(C,partition(axes(B2,2)),reuse=true) @@ -3260,7 +3260,7 @@ function spmmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fals Bgo = ghost_own_values(B2) Bgg = ghost_ghost_values(B2) - Doo2,Doo_cache = map(RAP,Aog,Bgo,Coo,Doo_cache) |> tuple_of_arrays + Doo2,Doo_cache = map(rap,Aog,Bgo,Coo,Doo_cache) |> tuple_of_arrays Doo12 = map(+,Doo1,Doo2) C2, Ccache = fetch(C2_task) @@ -3269,16 +3269,16 @@ function spmmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fals Cgo = ghost_own_values(C2) Cgg = ghost_ghost_values(C2) - Doo3,Doo_cache = map(RAP,Aoo,Bog,Cgo,Doo_cache) |> tuple_of_arrays - Doo4,Doo_cache = map(RAP,Aog,Bgg,Cgo,Doo_cache) |> tuple_of_arrays + Doo3,Doo_cache = map(rap,Aoo,Bog,Cgo,Doo_cache) |> tuple_of_arrays + Doo4,Doo_cache = map(rap,Aog,Bgg,Cgo,Doo_cache) |> 
tuple_of_arrays Doo34 = map(+,Doo3,Doo4) Doo = map(+,Doo12,Doo34) - Dog1,Dog_cache = map(RAP,Aoo,Boo,Cog) |> tuple_of_arrays - Dog2,Dog_cache = map(RAP,Aog,Bgo,Cog,Dog_cache) |> tuple_of_arrays - Dog3,Dog_cache = map(RAP,Aoo,Bog,Cgg,Dog_cache) |> tuple_of_arrays - Dog4,Dog_cache = map(RAP,Aog,Bgg,Cgg,Dog_cache) |> tuple_of_arrays + Dog1,Dog_cache = map(rap,Aoo,Boo,Cog) |> tuple_of_arrays + Dog2,Dog_cache = map(rap,Aog,Bgo,Cog,Dog_cache) |> tuple_of_arrays + Dog3,Dog_cache = map(rap,Aoo,Bog,Cgg,Dog_cache) |> tuple_of_arrays + Dog4,Dog_cache = map(rap,Aog,Bgg,Cgg,Dog_cache) |> tuple_of_arrays Dog12 = map(+,Dog1,Dog2) Dog34 = map(+,Dog3,Dog4) @@ -3310,25 +3310,25 @@ function spmmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,C::PSparseMat Aog = own_ghost_values(A) Boo = own_own_values(B) Coo = own_own_values(C) - map(RAP!,Doo,Aoo,Boo,Coo,Doo_cache) + map(rap!,Doo,Aoo,Boo,Coo,Doo_cache) wait(B2_task) C2_task = consistent!(C2,C,Ccache) Bog = own_ghost_values(B2) Bgo = ghost_own_values(B2) Bgg = ghost_ghost_values(B2) - map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Doo,Aog,Bgo,Coo,Doo_cache) + map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Doo,Aog,Bgo,Coo,Doo_cache) wait(C2_task) Cog = own_ghost_values(C2) Cgo = ghost_own_values(C2) Cgg = ghost_ghost_values(C2) - map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Doo,Aoo,Bog,Cgo,Doo_cache) - map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Doo,Aog,Bgg,Cgo,Doo_cache) - map(RAP!,Dog,Aoo,Boo,Cog,Dog_cache) - map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Dog,Aog,Bgo,Cog,Dog_cache) - map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Dog,Aoo,Bog,Cgg,Dog_cache) - map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Dog,Aog,Bgg,Cgg,Dog_cache) + map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Doo,Aoo,Bog,Cgo,Doo_cache) + map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Doo,Aog,Bgg,Cgo,Doo_cache) + map(rap!,Dog,Aoo,Boo,Cog,Dog_cache) + map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Dog,Aog,Bgo,Cog,Dog_cache) + 
map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Dog,Aoo,Bog,Cgg,Dog_cache) + map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Dog,Aog,Bgg,Cgg,Dog_cache) D end \ No newline at end of file diff --git a/src/sequential_implementations.jl b/src/sequential_implementations.jl index 94ab1d08..b628a3d9 100644 --- a/src/sequential_implementations.jl +++ b/src/sequential_implementations.jl @@ -647,13 +647,13 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti}, end # PtAP variants -function RAP(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi,Tv,Ti}, Pr::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} +function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi,Tv,Ti}, Pr::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} p,q = size(Plt) m,r = size(A) n,s = size(Pr) if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end - function RAP_symbolic_count!(R,A,Pr) + function rap_symbolic_count!(R,A,Pr) JR = R.data JA = colvals(A) JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. 
@@ -706,7 +706,7 @@ function RAP(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi cache = (xbRA,JRA,xbC,JAP) SparseMatrixCSR{Bi}(p,s,IC,JC,VC), cache # values not yet initialized end - function RAP_symbolic_fill!(C,R,A,Pr,cache) + function rap_symbolic_fill!(C,R,A,Pr,cache) (xbRA,JRA,xbC,JAP) = cache JC = colvals(C) JR = R.data @@ -745,18 +745,18 @@ function RAP(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi outer_cache = (xbC,similar(xbC, Tv),JAP) C, outer_cache # values not yet initialized end - function _RAP(Plt,A,Pr) + function _rap(Plt,A,Pr) R = symbolic_halfperm(Plt.parent) - C,symbolic_cache = RAP_symbolic_count!(R,A,Pr) # precompute nz structure with a symbolic transpose - _,outer_cache = RAP_symbolic_fill!(C,R,A,Pr,symbolic_cache) + C,symbolic_cache = rap_symbolic_count!(R,A,Pr) # precompute nz structure with a symbolic transpose + _,outer_cache = rap_symbolic_fill!(C,R,A,Pr,symbolic_cache) Ct = symbolic_halfperm(C) symbolic_halfperm!(C,Ct) - RAP!(C,Plt,A,Pr,outer_cache),(outer_cache...,R) + rap!(C,Plt,A,Pr,outer_cache),(outer_cache...,R) end - _RAP(Plt,A,Pr) + _rap(Plt,A,Pr) end -function RAP(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, +function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi,Tv,Ti}, Pr::SparseMatrixCSR{Bi,Tv,Ti}, cache) where {Bi,Tv,Ti} @@ -766,7 +766,7 @@ function RAP(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end - function RAP_symbolic_count!(R,A,Pr) + function rap_symbolic_count!(R,A,Pr) JR = R.data JA = colvals(A) JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. 
@@ -818,7 +818,7 @@ function RAP(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, xbC .= 0 SparseMatrixCSR{Bi}(p,s,IC,JC,VC),(xbRA,JRA,xbC,JAP) # values in CSR matrix not yet initialized end - function RAP_symbolic_fill!(C,R,A,Pr,cache) + function rap_symbolic_fill!(C,R,A,Pr,cache) (xbRA,JRA,xbC,JAP) = cache JC = colvals(C) JR = R.data @@ -856,17 +856,17 @@ function RAP(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, xbC .= 0 C, (xbC,similar(xbC, Tv),JAP) # values not yet initialized end - function _RAP(Plt,A,Pr,old_cache) + function _rap(Plt,A,Pr,old_cache) xb,x,JAP,R = old_cache old_outer_cache = (xb,x,JAP) - C,symbolic_cache = RAP_symbolic_count!(R, A, Pr) - _,new_outer_cache = RAP_symbolic_fill!(C,R, A, Pr, symbolic_cache) + C,symbolic_cache = rap_symbolic_count!(R, A, Pr) + _,new_outer_cache = rap_symbolic_fill!(C,R, A, Pr, symbolic_cache) Ct = symbolic_halfperm(C) symbolic_halfperm!(C,Ct) outer_cache = map((c1,c2) -> length(c1) >= length(c2) ? c1 : c2, old_outer_cache,new_outer_cache) - RAP!(C,Plt,A,Pr,outer_cache),(outer_cache...,R) + rap!(C,Plt,A,Pr,outer_cache),(outer_cache...,R) end - _RAP(Plt,A,Pr,cache) + _rap(Plt,A,Pr,cache) end function reduce_spmtmm_cache(cache,::Type{SparseMatrixCSR}) @@ -874,7 +874,7 @@ function reduce_spmtmm_cache(cache,::Type{SparseMatrixCSR}) (xb,x,JAP) end -function RAP!(C::SparseMatrixCSR{Bi,Tv,Ti}, +function rap!(C::SparseMatrixCSR{Bi,Tv,Ti}, Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi,Tv,Ti}, Pr::SparseMatrixCSR{Bi,Tv,Ti}, @@ -932,7 +932,7 @@ function RAP!(C::SparseMatrixCSR{Bi,Tv,Ti}, C end -function RAP!(C::SparseMatrixCSR{Bi,Tv,Ti}, +function rap!(C::SparseMatrixCSR{Bi,Tv,Ti}, Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi,Tv,Ti}, Pr::SparseMatrixCSR{Bi,Tv,Ti}, @@ -991,8 +991,8 @@ function RAP!(C::SparseMatrixCSR{Bi,Tv,Ti}, C end -# RAP variants -function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, +# rap variants +function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, 
A::SparseMatrixCSR{Bi,Tv,TiA}, Pr::SparseMatrixCSR{Bi,Tv,TiPr}) where {Bi,Tv,TiPl,TiA,TiPr} p,q = size(Pl) @@ -1000,7 +1000,7 @@ function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, n,s = size(Pr) if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end - function RAP_symbolic!(Pl,A,Pr) + function rap_symbolic!(Pl,A,Pr) JPl = colvals(Pl) JA = colvals(A) JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. @@ -1052,7 +1052,7 @@ function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, cache = (xbRA,xRA,JRA,xbC,xC) SparseMatrixCSR{Bi}(p,s,IC,JC,VC), cache # values not yet initialized end - function RAP_numeric!(C,Pl,A,Pr,cache) + function rap_numeric!(C,Pl,A,Pr,cache) JPl = colvals(Pl) VPl = nonzeros(Pl) JA = colvals(A) @@ -1103,17 +1103,17 @@ function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, end end end - function _RAP(Pl,A,Pr) - C,(xbRA,xRA,JRA,xbC,xC) = RAP_symbolic!(Pl,A,Pr) + function _rap(Pl,A,Pr) + C,(xbRA,xRA,JRA,xbC,xC) = rap_symbolic!(Pl,A,Pr) xbRA .= 0 xbC .= 0 cache = (xbRA,xRA,JRA,xbC,xC) - RAP_numeric!(C,Pl,A,Pr,cache) + rap_numeric!(C,Pl,A,Pr,cache) Ct = halfperm!(xbC,similar(colvals(C)),similar(nonzeros(C)),C) halfperm!(C,Ct) C,cache end - _RAP(Pl,A,Pr) + _rap(Pl,A,Pr) end # Reuses internal arrays of A!!! 
@@ -1134,7 +1134,7 @@ function reduce_spmtmm_cache(cache,::Type{M} where M <: SparseMatrixCSC) reduce_spmmmt_cache(cache,SparseMatrixCSR) end -function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, +function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, A::SparseMatrixCSR{Bi,Tv,TiA}, Pr::SparseMatrixCSR{Bi,Tv,TiPr}, cache) where {Bi,Tv,TiPl,TiA,TiPr} @@ -1143,7 +1143,7 @@ function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, n,s = size(Pr) if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end - function RAP_symbolic!(Pl,A,Pr,cache) + function rap_symbolic!(Pl,A,Pr,cache) JPl = colvals(Pl) JA = colvals(A) JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. @@ -1185,7 +1185,7 @@ function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, VC = zeros(Tv,nnz_C-1) SparseMatrixCSR{Bi}(p,s,IC,JC,VC) # values not yet initialized end - function RAP_numeric!(C,Pl,A,Pr,cache) + function rap_numeric!(C,Pl,A,Pr,cache) JPl = colvals(Pl) VPl = nonzeros(Pl) JA = colvals(A) @@ -1236,7 +1236,7 @@ function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, end end end - function _RAP(Pl,A,Pr,old_cache) + function _rap(Pl,A,Pr,old_cache) max_rPl = find_max_row_length(Pl) max_rA = find_max_row_length(A) max_rPr = find_max_row_length(Pr) @@ -1254,15 +1254,15 @@ function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, new_cache = (xbRA2,xRA2,JRA2,xbC,xC) xbRA2 .= 0 xbC .= 0 - C = RAP_symbolic!(Pl,A,Pr,new_cache) + C = rap_symbolic!(Pl,A,Pr,new_cache) xbRA2 .= 0 xbC .= 0 - RAP_numeric!(C,Pl,A,Pr,new_cache) + rap_numeric!(C,Pl,A,Pr,new_cache) Ct = halfperm!(xbC,similar(colvals(C)),similar(nonzeros(C)),C) halfperm!(C,Ct) C,new_cache end - _RAP(Pl,A,Pr,cache) + _rap(Pl,A,Pr,cache) end function reduce_spmmmt_cache(cache,::Type{M} where M <: SparseMatrixCSR) @@ -1274,7 +1274,7 @@ function reduce_spmmmt_cache(cache,::Type{M} where M <: SparseMatrixCSC) reduce_spmtmm_cache(cache,SparseMatrixCSR) end 
-function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC}, +function rap!(C::SparseMatrixCSR{Bi,Tv,TiC}, Pl::SparseMatrixCSR{Bi,Tv,TiPl}, A::SparseMatrixCSR{Bi,Tv,TiA}, Pr::SparseMatrixCSR{Bi,Tv,TiPr}, @@ -1341,7 +1341,7 @@ function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC}, C end -function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC}, +function rap!(C::SparseMatrixCSR{Bi,Tv,TiC}, Pl::SparseMatrixCSR{Bi,Tv,TiPl}, A::SparseMatrixCSR{Bi,Tv,TiA}, Pr::SparseMatrixCSR{Bi,Tv,TiPr}, @@ -1409,7 +1409,7 @@ function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC}, end # RARt variants -function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiA}, +function rap(Pl::SparseMatrixCSR{Bi,Tv,TiA}, A::SparseMatrixCSR{Bi,Tv,TiB}, Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiC}}) where {Bi,Tv,TiA,TiB,TiC} p,q = size(Pl) @@ -1417,10 +1417,10 @@ function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiA}, n,s = size(Prt) if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end if r == n || throw(DimensionMismatch("Invalid dimensions for RA*P: ($p,$r)*($n,$s)"));end - RAP(Pl,A,copy(Prt)) + rap(Pl,A,copy(Prt)) end -function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiA}, +function rap(Pl::SparseMatrixCSR{Bi,Tv,TiA}, A::SparseMatrixCSR{Bi,Tv,TiB}, Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiC}},cache) where {Bi,Tv,TiA,TiB,TiC} p,q = size(Pl) @@ -1428,10 +1428,10 @@ function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiA}, n,s = size(Prt) if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end if r == n || throw(DimensionMismatch("Invalid dimensions for RA*P: ($p,$r)*($n,$s)"));end - RAP(Pl,A,copy(Prt),cache) + rap(Pl,A,copy(Prt),cache) end -function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC}, +function rap!(C::SparseMatrixCSR{Bi,Tv,TiC}, Pl::SparseMatrixCSR{Bi,Tv,TiPl}, A::SparseMatrixCSR{Bi,Tv,TiA}, Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiPr}}, @@ -1450,7 +1450,7 @@ function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC}, VPr = nonzeros(Pr) JC = colvals(C) VC = nonzeros(C) - # some cache items are present with the regular RAP product in mind, 
which is how the allocating verison is performed + # some cache items are present with the regular rap product in mind, which is how the allocating verison is performed xb,x = cache xb .= 0 for i in 1:p @@ -1485,7 +1485,7 @@ function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC}, C end -function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC}, +function rap!(C::SparseMatrixCSR{Bi,Tv,TiC}, Pl::SparseMatrixCSR{Bi,Tv,TiPl}, A::SparseMatrixCSR{Bi,Tv,TiA}, Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiPr}}, @@ -1507,7 +1507,7 @@ function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC}, JC = colvals(C) VC = nonzeros(C) VC .*= β - # some cache items are present with the regular RAP product in mind, which is how the allocating verison is performed + # some cache items are present with the regular rap product in mind, which is how the allocating verison is performed xb,x = cache xb .= 0 for i in 1:p @@ -1543,52 +1543,52 @@ function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC}, end ### CSC in terms of CSR -function RAP(A::SparseMatrixCSC{Tv,TiA}, +function rap(A::SparseMatrixCSC{Tv,TiA}, B::SparseMatrixCSC{Tv,TiB}, C::SparseMatrixCSC{Tv,TiC}) where {Tv,TiA,TiB,TiC} - D,cache = RAP(ascsr(C),ascsr(B),ascsr(A)) + D,cache = rap(ascsr(C),ascsr(B),ascsr(A)) ascsc(D),cache end -function RAP(A::SparseMatrixCSC{Tv,TiA}, +function rap(A::SparseMatrixCSC{Tv,TiA}, B::SparseMatrixCSC{Tv,TiB}, C::SparseMatrixCSC{Tv,TiC}, cache) where {Tv,TiA,TiB,TiC} - D,new_cache = RAP(ascsr(C),ascsr(B),ascsr(A),cache) + D,new_cache = rap(ascsr(C),ascsr(B),ascsr(A),cache) ascsc(D),new_cache end -function RAP!(D::SparseMatrixCSC{Tv,TiD}, +function rap!(D::SparseMatrixCSC{Tv,TiD}, A::SparseMatrixCSC{Tv,TiA}, B::SparseMatrixCSC{Tv,TiB}, C::SparseMatrixCSC{Tv,TiC}, cache) where {Tv,TiD,TiA,TiB,TiC} - RAP!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),cache) + rap!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),cache) D end -function RAP!(D::SparseMatrixCSC{Tv,TiD}, +function rap!(D::SparseMatrixCSC{Tv,TiD}, A::SparseMatrixCSC{Tv,TiA}, B::SparseMatrixCSC{Tv,TiB}, 
C::SparseMatrixCSC{Tv,TiC}, cache::JaggedArray{X,Y} where {X<:Integer, Y<:Integer}, acc) where {Tv,TiD,TiA,TiB,TiC} - RAP!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),cache,acc) + rap!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),cache,acc) D end -function RAP!(D::SparseMatrixCSC{Tv,TiD}, +function rap!(D::SparseMatrixCSC{Tv,TiD}, A::SparseMatrixCSC{Tv,TiA}, B::SparseMatrixCSC{Tv,TiB}, C::SparseMatrixCSC{Tv,TiC}, α::Number, β::Number, cache) where {Tv,TiD,TiA,TiB,TiC} - RAP!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),α,β,cache) + rap!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),α,β,cache) D end -function RAP!(D::SparseMatrixCSC{Tv,TiD}, +function rap!(D::SparseMatrixCSC{Tv,TiD}, A::SparseMatrixCSC{Tv,TiA}, B::SparseMatrixCSC{Tv,TiB}, C::SparseMatrixCSC{Tv,TiC}, @@ -1596,77 +1596,77 @@ function RAP!(D::SparseMatrixCSC{Tv,TiD}, β::Number, cache::JaggedArray{X,Y} where {X <: Integer, Y<:Integer}, acc) where {Tv,TiD,TiA,TiB,TiC} - RAP!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),α,β,cache,acc) + rap!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),α,β,cache,acc) D end # PtAP -function RAP(A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}}, +function rap(A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}}, B::SparseMatrixCSC{Tv,TiB}, C::SparseMatrixCSC{Tv,TiC}) where {Tv,TiA,TiB,TiC} - D,cache = RAP(ascsr(C),ascsr(B),transpose(ascsr(A.parent))) + D,cache = rap(ascsr(C),ascsr(B),transpose(ascsr(A.parent))) ascsc(D),cache end -function RAP(A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}}, +function rap(A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}}, B::SparseMatrixCSC{Tv,TiB}, C::SparseMatrixCSC{Tv,TiC}, cache) where {Tv,TiA,TiB,TiC} - D,cache = RAP(ascsr(C),ascsr(B),transpose(ascsr(A.parent)),cache) + D,cache = rap(ascsr(C),ascsr(B),transpose(ascsr(A.parent)),cache) ascsc(D),cache end -function RAP!(D::SparseMatrixCSC{Tv,TiD}, +function rap!(D::SparseMatrixCSC{Tv,TiD}, A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}}, B::SparseMatrixCSC{Tv,TiB}, C::SparseMatrixCSC{Tv,TiC}, cache) where {Tv,TiD,TiA,TiB,TiC} - 
RAP!(ascsr(D),ascsr(C),ascsr(B),transpose(ascsr(A.parent)),cache) + rap!(ascsr(D),ascsr(C),ascsr(B),transpose(ascsr(A.parent)),cache) D end -function RAP!(D::SparseMatrixCSC{Tv,TiD}, +function rap!(D::SparseMatrixCSC{Tv,TiD}, A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}}, B::SparseMatrixCSC{Tv,TiB}, C::SparseMatrixCSC{Tv,TiC}, α::Number, β::Number, cache) where {Tv,TiD,TiA,TiB,TiC} - RAP!(ascsr(D),ascsr(C),ascsr(B),transpose(ascsr(A.parent)),α,β,cache) + rap!(ascsr(D),ascsr(C),ascsr(B),transpose(ascsr(A.parent)),α,β,cache) D end # RARt -function RAP(A::SparseMatrixCSC{Tv,Ti}, +function rap(A::SparseMatrixCSC{Tv,Ti}, B::SparseMatrixCSC{Tv,Ti}, C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}) where {Tv,Ti<:Integer} - D,new_cache = RAP(transpose(ascsr(C.parent)),ascsr(B),ascsr(A)) + D,new_cache = rap(transpose(ascsr(C.parent)),ascsr(B),ascsr(A)) ascsc(D),new_cache end -function RAP(A::SparseMatrixCSC{Tv,Ti}, +function rap(A::SparseMatrixCSC{Tv,Ti}, B::SparseMatrixCSC{Tv,Ti}, C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, cache) where {Tv,Ti<:Integer} - D,new_cache = RAP(transpose(ascsr(C.parent)),ascsr(B),ascsr(A),cache) + D,new_cache = rap(transpose(ascsr(C.parent)),ascsr(B),ascsr(A),cache) ascsc(D),new_cache end -function RAP!(D::SparseMatrixCSC{Tv,Ti}, +function rap!(D::SparseMatrixCSC{Tv,Ti}, A::SparseMatrixCSC{Tv,Ti}, B::SparseMatrixCSC{Tv,Ti}, C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, cache) where {Tv,Ti<:Integer} - RAP!(ascsr(D),transpose(ascsr(C.parent)),ascsr(B),ascsr(A),cache) + rap!(ascsr(D),transpose(ascsr(C.parent)),ascsr(B),ascsr(A),cache) D end -function RAP!(D::SparseMatrixCSC{Tv,Ti}, +function rap!(D::SparseMatrixCSC{Tv,Ti}, A::SparseMatrixCSC{Tv,Ti}, B::SparseMatrixCSC{Tv,Ti}, C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, α::Number, β::Number, cache) where {Tv,Ti<:Integer} - RAP!(ascsr(D),transpose(ascsr(C.parent)),ascsr(B),ascsr(A),α,β,cache) + rap!(ascsr(D),transpose(ascsr(C.parent)),ascsr(B),ascsr(A),α,β,cache) D end \ No newline at end of file From 
3a03304b56f8cff5e5e85cdd8f79aa608a2a08cd Mon Sep 17 00:00:00 2001 From: jop611 Date: Tue, 28 Jan 2025 13:29:20 +0100 Subject: [PATCH 28/34] changed RAP function names to rap for consistency --- times.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/times.txt b/times.txt index 2e3f93ca..4e9217e8 100644 --- a/times.txt +++ b/times.txt @@ -1 +1 @@ -Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2149521, max = 0.2149521, avg = 0.2149521), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4041403, max = 0.4041403, avg = 0.4041403), "Phase 1" => (min = 4.0e-7, max = 4.0e-7, avg = 4.0e-7)) +Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2073056, max = 0.2073056, avg = 0.2073056), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4125464, max = 0.4125464, avg = 0.4125464), "Phase 1" => (min = 1.0e-6, max = 1.0e-6, avg = 1.0e-6)) From 7d37364b73138a481b9d84cf089ee619a166c12c Mon Sep 17 00:00:00 2001 From: jop611 Date: Wed, 29 Jan 2025 11:50:17 +0100 Subject: [PATCH 29/34] Simplified dispatch, included automatic type promotion when required. 
--- src/p_sparse_matrix.jl | 2 +- src/sequential_implementations.jl | 778 +++++++++++++++--------------- test/spmtmm_tests.jl | 109 ++++- times.txt | 2 +- 4 files changed, 484 insertions(+), 407 deletions(-) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index e8948c54..560ba66a 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -2273,7 +2273,7 @@ end ### NEW ### function rap(R::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix;reuse=Val(false)) - Ac, cache = spmmm(R,A,P) + Ac, cache = spmmm(R,A,P;reuse=true) if val_parameter(reuse) return Ac, cache end diff --git a/src/sequential_implementations.jl b/src/sequential_implementations.jl index b628a3d9..3585e38b 100644 --- a/src/sequential_implementations.jl +++ b/src/sequential_implementations.jl @@ -1,19 +1,19 @@ -function Base.:*(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TiA,TvB,TiB} +function Base.:*(A::SparseMatrixCSR,B::SparseMatrixCSR) C = ascsc(B)*ascsc(A) ascsr(C) end -function Base.:*(At::Transpose{Tv, SparseMatrixCSR{Bi,Tv,Ti}},B::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} +function Base.:*(At::Transpose{Tv,<:SparseMatrixCSR} where Tv,B::SparseMatrixCSR) C = ascsc(B)*transpose(ascsc(At.parent)) ascsr(C) end -function Base.:*(A::SparseMatrixCSR{Bi,Tv,Ti},Bt::Transpose{Tv, SparseMatrixCSR{Bi,Tv,Ti}}) where {Bi,Tv,Ti} +function Base.:*(A::SparseMatrixCSR,Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv) C = transpose(ascsc(Bt.parent))*ascsc(A) ascsr(C) end -function Base.:*(At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},Bt::Transpose{Tv, SparseMatrixCSR{Bi,Tv,Ti}}) where {Bi,Tv,Ti} +function Base.:*(At::Transpose{TvA,<:SparseMatrixCSR} where TvA,Bt::Transpose{TvB,<:SparseMatrixCSR} where TvB) C = transpose(ascsc(Bt.parent))*transpose(ascsc(At.parent)) ascsr(C) end @@ -27,10 +27,11 @@ function Base.:/(A::SparseMatrixCSR{Bi,Tv,Ti},x::Number) where {Bi,Tv,Ti} SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(A.colval), map(a -> a/x, A.nzval)) end - # 
Alternative to lazy csr to csc for matrix addition that does not drop structural zeros. -function Base.:+(A::SparseMatrixCSR{Bi,Tv,Ti},B::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} +function Base.:+(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TvB,TiA,TiB} if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end + Ti = promote_type(TiA,TiB) + Tv = promote_type(TvA,TvB) p,q = size(A) nnz_C_upperbound = nnz(A) + nnz(B) IC = Vector{Ti}(undef, p+1) @@ -87,8 +88,10 @@ function Base.:+(A::SparseMatrixCSR{Bi,Tv,Ti},B::SparseMatrixCSR{Bi,Tv,Ti}) wher end # Alternative to lazy csr to csc for matrix subtraction that does not drop structural zeros. Subtracts B from A, i.e. A - B. -function Base.:-(A::SparseMatrixCSR{Bi,Tv,Ti},B::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} +function Base.:-(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TvB,TiA,TiB} if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end + Ti = promote_type(TiA,TiB) + Tv = promote_type(TvA,TvB) nnz_C_upperbound = nnz(A) + nnz(B) p,r = size(A) IC = Vector{Ti}(undef, p+1) @@ -149,8 +152,10 @@ function Base.:-(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} end # Alternative to lazy csr to csc for matrix addition that does not drop structural zeros. 
-function Base.:+(A::SparseMatrixCSC{Tv,Ti},B::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} +function Base.:+(A::SparseMatrixCSC{TvA,TiA},B::SparseMatrixCSC{TvB,TiB}) where {TvA,TvB,TiA,TiB} if size(A) != size(B) && throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end + Ti = promote_type(TiA,TiB) + Tv = promote_type(TvA,TvB) p,q = size(A) nnz_C_upperbound = nnz(A) + nnz(B) JC = Vector{Ti}(undef, q+1) @@ -207,8 +212,10 @@ function Base.:+(A::SparseMatrixCSC{Tv,Ti},B::SparseMatrixCSC{Tv,Ti}) where {Tv, end # Alternative to lazy csr to csc for matrix subtraction that does not drop structural zeros. Subtracts B from A, i.e. A - B. -function Base.:-(A::SparseMatrixCSC{Tv,Ti},B::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} +function Base.:-(A::SparseMatrixCSC{TvA,TiA},B::SparseMatrixCSC{TvB,TiB}) where {TvA,TvB,TiA,TiB} if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end + Ti = promote_type(TiA,TiB) + Tv = promote_type(TvA,TvB) p,q = size(A) nnz_C_upperbound = nnz(A) + nnz(B) JC = Vector{Ti}(undef, q+1) @@ -269,26 +276,28 @@ function Base.:-(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} end -function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, - A::SparseMatrixCSC{Tv,Ti}, - B::SparseMatrixCSC{Tv,Ti}, - cache) where {Tv,Ti} +function LinearAlgebra.mul!(C::SparseMatrixCSC, + A::SparseMatrixCSC, + B::SparseMatrixCSC, + cache) mul!(ascsr(C),ascsr(B),ascsr(A),cache) + C end -function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, - A::SparseMatrixCSC{Tv,Ti}, - B::SparseMatrixCSC{Tv,Ti}, - α::Number, - β::Number, - cache) where {Tv,Ti} +function LinearAlgebra.mul!(C::SparseMatrixCSC, + A::SparseMatrixCSC, + B::SparseMatrixCSC, + α::Number, + β::Number, + cache) mul!(ascsr(C),ascsr(B),ascsr(A),α,β,cache) + C end -function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, - At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, - B::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} +function LinearAlgebra.mul!(C::SparseMatrixCSC, + 
At::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC) a,b = size(C) p,q = size(At) r,s = size(B) @@ -380,16 +389,17 @@ function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, C end -function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, - A::SparseMatrixCSC{Tv,Ti}, - Bt::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}) where {Tv,Ti} +function LinearAlgebra.mul!(C::SparseMatrixCSC, + A::SparseMatrixCSC, + Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv) mul!(ascsr(C),transpose(ascsr(B)),ascsr(A)) + C end -function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti}, - A::SparseMatrixCSR{Bi,Tv,Ti}, - B::SparseMatrixCSR{Bi,Tv,Ti}, - cache) where {Bi,Tv,Ti} +function LinearAlgebra.mul!(C::SparseMatrixCSR, + A::SparseMatrixCSR, + B::SparseMatrixCSR, + cache) a,b = size(C) p,q = size(A) r,s = size(B) @@ -397,7 +407,7 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti}, if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end JC = colvals(C) VC = nonzeros(C) - VC .= zero(Tv) + VC .= zero(eltype(C)) JA = colvals(A) VA = nonzeros(A) JB = colvals(B) @@ -436,19 +446,20 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti}, C end -function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, - A::SparseMatrixCSC{Tv,Ti}, - Bt::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, - cache) where {Tv,Ti} +function LinearAlgebra.mul!(C::SparseMatrixCSC, + A::SparseMatrixCSC, + Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv, + cache) mul!(ascsr(C),transpose(ascsr(B)),ascsr(A),cache) + C end -function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti}, - A::SparseMatrixCSR{Bi,Tv,Ti}, - B::SparseMatrixCSR{Bi,Tv,Ti}, +function LinearAlgebra.mul!(C::SparseMatrixCSR, + A::SparseMatrixCSR, + B::SparseMatrixCSR, α::Number, β::Number, - cache) where {Bi,Tv,Ti} + cache) a,b = size(C) p,q = size(A) r,s = size(B) @@ -495,29 +506,32 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti}, C end -function 
LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, - A::SparseMatrixCSC{Tv,Ti}, - Bt::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, +function LinearAlgebra.mul!(C::SparseMatrixCSC, + A::SparseMatrixCSC, + Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv, α::Number, β::Number, - cache) where {Tv,Ti} + cache) mul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A),α,β,cache) + C end -function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, - At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, - B::SparseMatrixCSC{Tv,Ti}, - cache) where {Tv,Ti} +function LinearAlgebra.mul!(C::SparseMatrixCSC, + At::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC, + cache) mul!(ascsr(C),ascsr(B),transpose(ascsr(At.parent))) + C end -function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, - At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, - B::SparseMatrixCSC{Tv,Ti}, +function LinearAlgebra.mul!(C::SparseMatrixCSC, + At::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC, α::Number, β::Number, - cache) where {Tv,Ti} + cache) mul!(ascsr(C),ascsr(A),transpose(ascsr(At.parent)),α,β) + C end # Workaround to supply in-place mul! 
with auxiliary array, as these are not returned by multiply function exported by SparseArrays @@ -542,10 +556,10 @@ function construct_spmtm_cache(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} construct_spmtm_cache(ascsr(A)) end -function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti}, - At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, - B::SparseMatrixCSR{Bi,Tv,Ti}, - cache) where {Bi,Tv,Ti} +function LinearAlgebra.mul!(C::SparseMatrixCSR, + At::Transpose{Tv,<:SparseMatrixCSR} where Tv, + B::SparseMatrixCSR, + cache) a,b = size(C) p,q = size(At) r,s = size(B) @@ -553,7 +567,7 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti}, if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end A = At.parent VC = nonzeros(C) - VC .= zero(Tv) + VC .= zero((eltype(C))) JC = colvals(C) JA = colvals(A) # When virtually transposed colvals represent rowvals. VA = nonzeros(A) @@ -585,12 +599,12 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti}, C end -function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti}, - At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, - B::SparseMatrixCSR{Bi,Tv,Ti}, +function LinearAlgebra.mul!(C::SparseMatrixCSR, + At::Transpose{Tv,<:SparseMatrixCSR} where Tv, + B::SparseMatrixCSR, α::Number, β::Number, - cache) where {Bi,Tv,Ti} + cache) a,b = size(C) p,q = size(At) r,s = size(B) @@ -630,40 +644,44 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti}, C end -function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti}, - A::SparseMatrixCSR{Bi,Tv,Ti}, - Bt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}) where {Bi,Tv,Ti} +function LinearAlgebra.mul!(C::SparseMatrixCSR, + A::SparseMatrixCSR, + Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv) mul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A)) C end -function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti}, - A::SparseMatrixCSR{Bi,Tv,Ti}, - Bt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, +function LinearAlgebra.mul!(C::SparseMatrixCSR, + 
A::SparseMatrixCSR, + Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv, α::Number, - β::Number) where {Bi,Tv,Ti} + β::Number) mul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A), α, β) C end # PtAP variants -function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi,Tv,Ti}, Pr::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} - p,q = size(Plt) +function rap(Rt::Transpose{TvR,SparseMatrixCSR{Bi,TvR,TiR}}, + A::SparseMatrixCSR{Bi,TvA,TiA}, + P::SparseMatrixCSR{Bi,TvP,TiP}) where {Bi,TvR,TvA,TvP,TiR,TiA,TiP} + p,q = size(Rt) m,r = size(A) - n,s = size(Pr) + n,s = size(P) if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end - function rap_symbolic_count!(R,A,Pr) + function rap_symbolic_count!(R,A,P) + Ti = promote_type(TiR,TiA,TiP) + Tv = promote_type(TvR,TvA,TvP) JR = R.data JA = colvals(A) - JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. 
xbRA = zeros(Ti, r) xbC = zeros(Ti, s) # this vector will also serve as as colptr array in halfperm max_rR = find_max_row_length(R) max_rA = find_max_row_length(A) - max_rPr = find_max_row_length(Pr) + max_rP = find_max_row_length(P) - max_rC = max((max_rR*max_rA*max_rPr),(max_rA*max_rR)) + max_rC = max((max_rR*max_rA*max_rP),(max_rA*max_rR)) JRA = Vector{Ti}(undef,max_rC) IC = Vector{Ti}(undef,p+1) nnz_C = 1 @@ -687,8 +705,8 @@ function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi ccC = 0 # local column pointer, refresh every row, start at 0 to allow empty rows for jp in 1:ccRA j = JRA[jp] - for kp in nzrange(Pr,j) - k = JPr[kp] + for kp in nzrange(P,j) + k = JP[kp] if xbC[k] != i xbC[k] = i ccC += 1 @@ -700,18 +718,18 @@ function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi end JC = Vector{Ti}(undef, nnz_C-1) VC = zeros(Tv,nnz_C-1) - JAP = Vector{Ti}(undef,min(max_rA*max_rPr,s)) # upper bound estimate for length of virtual row of AP + JAP = Vector{Ti}(undef,min(max_rA*max_rP,s)) # upper bound estimate for length of virtual row of AP xbRA .= 0 xbC .= 0 cache = (xbRA,JRA,xbC,JAP) SparseMatrixCSR{Bi}(p,s,IC,JC,VC), cache # values not yet initialized end - function rap_symbolic_fill!(C,R,A,Pr,cache) + function rap_symbolic_fill!(C,R,A,P,cache) (xbRA,JRA,xbC,JAP) = cache JC = colvals(C) JR = R.data JA = colvals(A) - JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. 
pC = 0 for i in 1:p ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows @@ -731,8 +749,8 @@ function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi end for jp in 1:ccRA j = JRA[jp] - for kp in nzrange(Pr,j) - k = JPr[kp] + for kp in nzrange(P,j) + k = JP[kp] if xbC[k] != i pC += 1 xbC[k] = i @@ -742,41 +760,43 @@ function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi end end xbC .= 0 - outer_cache = (xbC,similar(xbC, Tv),JAP) + outer_cache = (xbC,similar(xbC, eltype(C)),JAP) C, outer_cache # values not yet initialized end - function _rap(Plt,A,Pr) - R = symbolic_halfperm(Plt.parent) - C,symbolic_cache = rap_symbolic_count!(R,A,Pr) # precompute nz structure with a symbolic transpose - _,outer_cache = rap_symbolic_fill!(C,R,A,Pr,symbolic_cache) + function _rap(Rt,A,P) + R = symbolic_halfperm(Rt.parent) + C,symbolic_cache = rap_symbolic_count!(R,A,P) # precompute nz structure with a symbolic transpose + _,outer_cache = rap_symbolic_fill!(C,R,A,P,symbolic_cache) Ct = symbolic_halfperm(C) symbolic_halfperm!(C,Ct) - rap!(C,Plt,A,Pr,outer_cache),(outer_cache...,R) + rap!(C,Rt,A,P,outer_cache),(outer_cache...,R) end - _rap(Plt,A,Pr) + _rap(Rt,A,P) end -function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, - A::SparseMatrixCSR{Bi,Tv,Ti}, - Pr::SparseMatrixCSR{Bi,Tv,Ti}, - cache) where {Bi,Tv,Ti} - p,q = size(Plt) +function rap(Rt::Transpose{TvR,SparseMatrixCSR{Bi,TvR,TiR}}, + A::SparseMatrixCSR{Bi,TvA,TiA}, + P::SparseMatrixCSR{Bi,TvP,TiP}, + cache) where {Bi,TvR,TvA,TvP,TiR,TiA,TiP} + p,q = size(Rt) m,r = size(A) - n,s = size(Pr) + n,s = size(P) if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end - - function rap_symbolic_count!(R,A,Pr) + + function rap_symbolic_count(R,A,P) + Ti = promote_type(TiR,TiA,TiP) + Tv = promote_type(TvR,TvA,TvP) JR = R.data JA = 
colvals(A) - JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. xbRA = zeros(Ti, r) xbC = zeros(Ti, s) # this vector will also serve as as colptr array in halfperm max_rR = find_max_row_length(R) max_rA = find_max_row_length(A) - max_rPr = find_max_row_length(Pr) + max_rP = find_max_row_length(P) - max_rC = max((max_rR*max_rA*max_rPr),(max_rA*max_rR)) + max_rC = max((max_rR*max_rA*max_rP),(max_rA*max_rR)) JRA = Vector{Ti}(undef,max_rC) IC = Vector{Ti}(undef,p+1) nnz_C = 1 @@ -800,8 +820,8 @@ function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, ccC = 0 # local column pointer, refresh every row, start at 0 to allow empty rows for jp in 1:ccRA j = JRA[jp] - for kp in nzrange(Pr,j) - k = JPr[kp] + for kp in nzrange(P,j) + k = JP[kp] if xbC[k] != i xbC[k] = i ccC += 1 @@ -813,17 +833,17 @@ function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, end JC = Vector{Ti}(undef, nnz_C-1) VC = zeros(Tv,nnz_C-1) - JAP = Vector{Ti}(undef,min(max_rA*max_rPr,s)) # upper bound estimate for length of virtual row of AP + JAP = Vector{Ti}(undef,min(max_rA*max_rP,s)) # upper bound estimate for length of virtual row of AP xbRA .= 0 xbC .= 0 SparseMatrixCSR{Bi}(p,s,IC,JC,VC),(xbRA,JRA,xbC,JAP) # values in CSR matrix not yet initialized end - function rap_symbolic_fill!(C,R,A,Pr,cache) + function rap_symbolic_fill!(C,R,A,P,cache) (xbRA,JRA,xbC,JAP) = cache JC = colvals(C) JR = R.data JA = colvals(A) - JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. 
pC = 0 for i in 1:p ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows @@ -843,8 +863,8 @@ function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, end for jp in 1:ccRA j = JRA[jp] - for kp in nzrange(Pr,j) - k = JPr[kp] + for kp in nzrange(P,j) + k = JP[kp] if xbC[k] != i pC += 1 xbC[k] = i @@ -854,19 +874,19 @@ function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, end end xbC .= 0 - C, (xbC,similar(xbC, Tv),JAP) # values not yet initialized + C, (xbC,similar(xbC, eltype(C)),JAP) # values not yet initialized end - function _rap(Plt,A,Pr,old_cache) + function _rap(Rt,A,P,old_cache) xb,x,JAP,R = old_cache old_outer_cache = (xb,x,JAP) - C,symbolic_cache = rap_symbolic_count!(R, A, Pr) - _,new_outer_cache = rap_symbolic_fill!(C,R, A, Pr, symbolic_cache) + C,symbolic_cache = rap_symbolic_count(R, A, P) + _,new_outer_cache = rap_symbolic_fill!(C,R, A, P, symbolic_cache) Ct = symbolic_halfperm(C) symbolic_halfperm!(C,Ct) outer_cache = map((c1,c2) -> length(c1) >= length(c2) ? 
c1 : c2, old_outer_cache,new_outer_cache) - rap!(C,Plt,A,Pr,outer_cache),(outer_cache...,R) + rap!(C,Rt,A,P,outer_cache),(outer_cache...,R) end - _rap(Plt,A,Pr,cache) + _rap(Rt,A,P,cache) end function reduce_spmtmm_cache(cache,::Type{SparseMatrixCSR}) @@ -874,27 +894,27 @@ function reduce_spmtmm_cache(cache,::Type{SparseMatrixCSR}) (xb,x,JAP) end -function rap!(C::SparseMatrixCSR{Bi,Tv,Ti}, - Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, - A::SparseMatrixCSR{Bi,Tv,Ti}, - Pr::SparseMatrixCSR{Bi,Tv,Ti}, - cache) where {Bi,Tv,Ti} +function rap!(C::SparseMatrixCSR, + Rt::Transpose{Tv,<:SparseMatrixCSR} where Tv, + A::SparseMatrixCSR, + P::SparseMatrixCSR, + cache) (a,b) = size(C) - p,q = size(Plt) + p,q = size(Rt) m,r = size(A) - n,s = size(Pr) + n,s = size(P) if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end if (a,b) != (p,s) && throw(DimensionMismatch("Dimensions of C $(size(C)) don't match dimensions of R*A*P ($p,$q)*($m,$r)*($n,$s)."));end - Pl = Plt.parent + R = Rt.parent JC = colvals(C) VC = nonzeros(C) - VC .= zero(Tv) + VC .= zero(eltype(C)) JA = colvals(A) VA = nonzeros(A) - JPr = colvals(Pr) - VPr = nonzeros(Pr) + JP = colvals(P) + VP = nonzeros(P) xb, x, JAP = cache xb .= 0 # loop over rows in A @@ -905,22 +925,22 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,Ti}, j = JA[jp] va = VA[jp] # loop over columns "k" in row j of B - for kp in nzrange(Pr, j) - k = JPr[kp] + for kp in nzrange(P, j) + k = JP[kp] # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
if xb[k] != i lp += 1 JAP[lp] = k xb[k] = i - x[k] = va * VPr[kp] + x[k] = va * VP[kp] else - x[k] += va * VPr[kp] + x[k] += va * VP[kp] end end end - for kp in nzrange(Pl, i) - k = colvals(Pl)[kp] # rowvals when transposed conceptually - v = nonzeros(Pl)[kp] + for kp in nzrange(R, i) + k = colvals(R)[kp] # rowvals when transposed conceptually + v = nonzeros(R)[kp] for jp in nzrange(C,k) j = JC[jp] if xb[j] == i @@ -932,27 +952,27 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,Ti}, C end -function rap!(C::SparseMatrixCSR{Bi,Tv,Ti}, - Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, - A::SparseMatrixCSR{Bi,Tv,Ti}, - Pr::SparseMatrixCSR{Bi,Tv,Ti}, +function rap!(C::SparseMatrixCSR, + Rt::Transpose{Tv,<:SparseMatrixCSR} where Tv, + A::SparseMatrixCSR, + P::SparseMatrixCSR, α::Number, β::Number, - cache) where {Bi,Tv,Ti} + cache) (a,b) = size(C) - p,q = size(Plt) + p,q = size(Rt) m,r = size(A) - n,s = size(Pr) + n,s = size(P) if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end if (a,b) != (p,s) && throw(DimensionMismatch("Dimensions of C $(size(C)) don't match dimensions of R*A*P ($p,$q)*($m,$r)*($n,$s)."));end - Pl = Plt.parent + R = Rt.parent JC = colvals(C) VC = nonzeros(C) JA = colvals(A) VA = nonzeros(A) - JPr = colvals(Pr) - VPr = nonzeros(Pr) + JP = colvals(P) + VP = nonzeros(P) xb, x, JAP = cache xb .= 0 VC .*= β @@ -964,22 +984,22 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,Ti}, j = JA[jp] va = α*VA[jp] # loop over columns "k" in row j of B - for kp in nzrange(Pr, j) - k = JPr[kp] + for kp in nzrange(P, j) + k = JP[kp] # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
if xb[k] != i lp += 1 JAP[lp] = k xb[k] = i - x[k] = va*VPr[kp] + x[k] = va*VP[kp] else - x[k] += va*VPr[kp] + x[k] += va*VP[kp] end end end - for kp in nzrange(Pl, i) - k = colvals(Pl)[kp] # rowvals when transposed conceptually - vpl = nonzeros(Pl)[kp] + for kp in nzrange(R, i) + k = colvals(R)[kp] # rowvals when transposed conceptually + vpl = nonzeros(R)[kp] for jp in nzrange(C,k) j = JC[jp] if xb[j] == i @@ -991,37 +1011,41 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,Ti}, C end -# rap variants -function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, - A::SparseMatrixCSR{Bi,Tv,TiA}, - Pr::SparseMatrixCSR{Bi,Tv,TiPr}) where {Bi,Tv,TiPl,TiA,TiPr} - p,q = size(Pl) +# RAP variants +function rap(R::SparseMatrixCSR{Bi,TvR,TiR}, + A::SparseMatrixCSR{Bi,TvA,TiA}, + P::SparseMatrixCSR{Bi,TvP,TiP}) where {Bi,TvR,TvA,TvP,TiR,TiA,TiP} + p,q = size(R) m,r = size(A) - n,s = size(Pr) + n,s = size(P) if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end - function rap_symbolic!(Pl,A,Pr) - JPl = colvals(Pl) + + function rap_symbolic!(R,A,P) + Ti = promote_type(TiR,TiA,TiP) + Tv = promote_type(TvR,TvA,TvP) + + JR = colvals(R) JA = colvals(A) - JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. - xbRA = zeros(TiA, r) - xbC = zeros(TiA, s+1) # this vector will also serve as as colptr array in halfperm + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. 
+ xbRA = zeros(Ti, r) + xbC = zeros(Ti, s+1) # this vector will also serve as as colptr array in halfperm xRA = similar(xbRA, Tv) # sparse accumulator xC = similar(xbC, Tv) # sparse accumulator - max_rPl = find_max_row_length(Pl) + max_rR = find_max_row_length(R) max_rA = find_max_row_length(A) - max_rPr = find_max_row_length(Pr) + max_rP = find_max_row_length(P) + max_rC = max((max_rR*max_rA*max_rP),(max_rA*max_rR)) - max_rC = max((max_rPl*max_rA*max_rPr),(max_rA*max_rPl)) - JRA = Vector{TiA}(undef,max_rC) - IC = Vector{TiA}(undef,p+1) + JRA = Vector{Ti}(undef,max_rC) + IC = Vector{Ti}(undef,p+1) nnz_C = 1 IC[1] = nnz_C for i in 1:p ccRA = 0 # loop over columns "j" in row i of A - for jp in nzrange(Pl, i) - j = JPl[jp] + for jp in nzrange(R, i) + j = JR[jp] # loop over columns "k" in row j of B for kp in nzrange(A, j) k = JA[kp] @@ -1036,8 +1060,8 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, ccC = 0 for jp in 1:ccRA j = JRA[jp] - for kp in nzrange(Pr,j) - k = JPr[kp] + for kp in nzrange(P,j) + k = JP[kp] if xbC[k] != i xbC[k] = i ccC += 1 @@ -1047,18 +1071,18 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, nnz_C += ccC IC[i+1] = nnz_C end - JC = Vector{TiA}(undef, nnz_C-1) + JC = Vector{Ti}(undef, nnz_C-1) VC = zeros(Tv,nnz_C-1) cache = (xbRA,xRA,JRA,xbC,xC) SparseMatrixCSR{Bi}(p,s,IC,JC,VC), cache # values not yet initialized end - function rap_numeric!(C,Pl,A,Pr,cache) - JPl = colvals(Pl) - VPl = nonzeros(Pl) + function rap_numeric!(C,R,A,P,cache) + JR = colvals(R) + VR = nonzeros(R) JA = colvals(A) VA = nonzeros(A) - JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. - VPr = nonzeros(Pr) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. 
+ VP = nonzeros(P) JC = colvals(C) VC = nonzeros(C) (xbRA,xRA,JRA,xbC,xC) = cache @@ -1066,9 +1090,9 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, for i in 1:p ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows # loop over columns "j" in row i of A - for jp in nzrange(Pl, i) - j = JPl[jp] - vpl = VPl[jp] + for jp in nzrange(R, i) + j = JR[jp] + vpl = VR[jp] # loop over columns "k" in row j of B for kp in nzrange(A, j) k = JA[kp] @@ -1085,15 +1109,15 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, end for jp in 1:ccRA j = JRA[jp] - for kp in nzrange(Pr,j) - k = JPr[kp] + for kp in nzrange(P,j) + k = JP[kp] if xbC[k] != i xbC[k] = i JC[jpC] = k jpC += 1 - xC[k] = xRA[j]*VPr[kp] + xC[k] = xRA[j]*VP[kp] else - xC[k] += xRA[j]*VPr[kp] + xC[k] += xRA[j]*VP[kp] end end end @@ -1103,17 +1127,17 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, end end end - function _rap(Pl,A,Pr) - C,(xbRA,xRA,JRA,xbC,xC) = rap_symbolic!(Pl,A,Pr) + function _rap(R,A,P) + C,(xbRA,xRA,JRA,xbC,xC) = rap_symbolic!(R,A,P) xbRA .= 0 xbC .= 0 cache = (xbRA,xRA,JRA,xbC,xC) - rap_numeric!(C,Pl,A,Pr,cache) + rap_numeric!(C,R,A,P,cache) Ct = halfperm!(xbC,similar(colvals(C)),similar(nonzeros(C)),C) halfperm!(C,Ct) C,cache end - _rap(Pl,A,Pr) + _rap(R,A,P) end # Reuses internal arrays of A!!! 
@@ -1134,28 +1158,31 @@ function reduce_spmtmm_cache(cache,::Type{M} where M <: SparseMatrixCSC) reduce_spmmmt_cache(cache,SparseMatrixCSR) end -function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, - A::SparseMatrixCSR{Bi,Tv,TiA}, - Pr::SparseMatrixCSR{Bi,Tv,TiPr}, - cache) where {Bi,Tv,TiPl,TiA,TiPr} - p,q = size(Pl) +function rap(R::SparseMatrixCSR{Bi,TvR,TiR}, + A::SparseMatrixCSR{Bi,TvA,TiA}, + P::SparseMatrixCSR{Bi,TvP,TiP}, + cache) where {Bi,TvR,TvA,TvP,TiR,TiA,TiP} + p,q = size(R) m,r = size(A) - n,s = size(Pr) + n,s = size(P) if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end - function rap_symbolic!(Pl,A,Pr,cache) - JPl = colvals(Pl) + + function rap_symbolic!(R,A,P,cache) + Ti = promote_type(TiR,TiA,TiP) + Tv = promote_type(TvR,TvA,TvP) + JR = colvals(R) JA = colvals(A) - JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. 
(xbRA,_,JRA,xbC,_) = cache - IC = Vector{TiA}(undef,p+1) + IC = Vector{Ti}(undef,p+1) nnz_C = 1 IC[1] = nnz_C for i in 1:p ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows # loop over columns "j" in row i of A - for jp in nzrange(Pl, i) - j = JPl[jp] + for jp in nzrange(R, i) + j = JR[jp] # loop over columns "k" in row j of B for kp in nzrange(A, j) k = JA[kp] @@ -1170,8 +1197,8 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, ccC = 0 # local column pointer, refresh every row, start at 0 to allow empty rows for jp in 1:ccRA j = JRA[jp] - for kp in nzrange(Pr,j) - k = JPr[kp] + for kp in nzrange(P,j) + k = JP[kp] if xbC[k] != i xbC[k] = i ccC += 1 @@ -1181,17 +1208,17 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, nnz_C += ccC IC[i+1] = nnz_C end - JC = Vector{TiA}(undef, nnz_C-1) + JC = Vector{Ti}(undef, nnz_C-1) VC = zeros(Tv,nnz_C-1) SparseMatrixCSR{Bi}(p,s,IC,JC,VC) # values not yet initialized end - function rap_numeric!(C,Pl,A,Pr,cache) - JPl = colvals(Pl) - VPl = nonzeros(Pl) + function rap_numeric!(C,R,A,P,cache) + JR = colvals(R) + VR = nonzeros(R) JA = colvals(A) VA = nonzeros(A) - JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. - VPr = nonzeros(Pr) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. 
+ VP = nonzeros(P) JC = colvals(C) VC = nonzeros(C) (xbRA,xRA,JRA,xbC,xC) = cache @@ -1199,9 +1226,9 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, for i in 1:p ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows # loop over columns "j" in row i of A - for jp in nzrange(Pl, i) - j = JPl[jp] - vpl = VPl[jp] + for jp in nzrange(R, i) + j = JR[jp] + vpl = VR[jp] # loop over columns "k" in row j of B for kp in nzrange(A, j) k = JA[kp] @@ -1218,15 +1245,15 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, end for jp in 1:ccRA j = JRA[jp] - for kp in nzrange(Pr,j) - k = JPr[kp] + for kp in nzrange(P,j) + k = JP[kp] if xbC[k] != i xbC[k] = i JC[jpC] = k jpC += 1 - xC[k] = xRA[j]*VPr[kp] + xC[k] = xRA[j]*VP[kp] else - xC[k] += xRA[j]*VPr[kp] + xC[k] += xRA[j]*VP[kp] end end end @@ -1236,12 +1263,12 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, end end end - function _rap(Pl,A,Pr,old_cache) - max_rPl = find_max_row_length(Pl) + function _rap(R,A,P,old_cache) + max_rR = find_max_row_length(R) max_rA = find_max_row_length(A) - max_rPr = find_max_row_length(Pr) + max_rP = find_max_row_length(P) (xbRA,xRA,JRA,xbC,xC) = old_cache - max_rC = max((max_rPl*max_rA*max_rPr),(max_rA*max_rPl)) + max_rC = max((max_rR*max_rA*max_rP),(max_rA*max_rR)) JRA2 = max_rC > length(JRA) ? 
similar(JRA,max_rC) : JRA if r > length(xbRA) xbRA2 = similar(xbRA,r) @@ -1254,15 +1281,15 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl}, new_cache = (xbRA2,xRA2,JRA2,xbC,xC) xbRA2 .= 0 xbC .= 0 - C = rap_symbolic!(Pl,A,Pr,new_cache) + C = rap_symbolic!(R,A,P,new_cache) xbRA2 .= 0 xbC .= 0 - rap_numeric!(C,Pl,A,Pr,new_cache) + rap_numeric!(C,R,A,P,new_cache) Ct = halfperm!(xbC,similar(colvals(C)),similar(nonzeros(C)),C) halfperm!(C,Ct) C,new_cache end - _rap(Pl,A,Pr,cache) + _rap(R,A,P,cache) end function reduce_spmmmt_cache(cache,::Type{M} where M <: SparseMatrixCSR) @@ -1274,34 +1301,34 @@ function reduce_spmmmt_cache(cache,::Type{M} where M <: SparseMatrixCSC) reduce_spmtmm_cache(cache,SparseMatrixCSR) end -function rap!(C::SparseMatrixCSR{Bi,Tv,TiC}, - Pl::SparseMatrixCSR{Bi,Tv,TiPl}, - A::SparseMatrixCSR{Bi,Tv,TiA}, - Pr::SparseMatrixCSR{Bi,Tv,TiPr}, - cache) where {Bi,Tv,TiC,TiPl,TiA,TiPr} - p,q = size(Pl) +function rap!(C::SparseMatrixCSR, + R::SparseMatrixCSR, + A::SparseMatrixCSR, + P::SparseMatrixCSR, + cache) + p,q = size(R) m,r = size(A) - n,s = size(Pr) + n,s = size(P) if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end - JPl = colvals(Pl) - VPl = nonzeros(Pl) + JR = colvals(R) + VR = nonzeros(R) JA = colvals(A) VA = nonzeros(A) - JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. - VPr = nonzeros(Pr) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. 
+ VP = nonzeros(P) JC = colvals(C) VC = nonzeros(C) - VC .= zero(Tv) + VC .= zero(eltype(C)) (xbRA,xRA,JRA,xbC,xC) = cache xbRA .= 0 xbC .= 0 for i in 1:p lp = 0 # local column pointer, refresh every row, start at 0 to allow empty rows # loop over columns "j" in row i of A - for jp in nzrange(Pl, i) - j = JPl[jp] - vpl = VPl[jp] + for jp in nzrange(R, i) + j = JR[jp] + vpl = VR[jp] # loop over columns "k" in row j of B for kp in nzrange(A, j) @@ -1321,13 +1348,13 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,TiC}, for jp in 1:lp j = JRA[jp] vra = xRA[j] - for kp in nzrange(Pr,j) - k = JPr[kp] + for kp in nzrange(P,j) + k = JP[kp] if xbC[k] != i xbC[k] = i - xC[k] = vra*VPr[kp] + xC[k] = vra*VP[kp] else - xC[k] += vra*VPr[kp] + xC[k] += vra*VP[kp] end end end @@ -1341,37 +1368,37 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,TiC}, C end -function rap!(C::SparseMatrixCSR{Bi,Tv,TiC}, - Pl::SparseMatrixCSR{Bi,Tv,TiPl}, - A::SparseMatrixCSR{Bi,Tv,TiA}, - Pr::SparseMatrixCSR{Bi,Tv,TiPr}, +function rap!(C::SparseMatrixCSR, + R::SparseMatrixCSR, + A::SparseMatrixCSR, + P::SparseMatrixCSR, α::Number, β::Number, - cache) where {Bi,Tv,TiC,TiPl,TiA,TiPr} - p,q = size(Pl) + cache) + p,q = size(R) m,r = size(A) - n,s = size(Pr) + n,s = size(P) if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end - JPl = colvals(Pl) - VPl = nonzeros(Pl) + JR = colvals(R) + VR = nonzeros(R) JA = colvals(A) VA = nonzeros(A) - JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. - VPr = nonzeros(Pr) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. 
+ VP = nonzeros(P) JC = colvals(C) VC = nonzeros(C) VC .*= β (xbRA,xRA,JRA,xbC,xC) = cache xbRA .= 0 xbC .= 0 - xC .= zero(Tv) + # xC .= zero(Tv) for i in 1:p lp = 0 # local column pointer, refresh every row, start at 0 to allow empty rows # loop over columns "j" in row i of A - for jp in nzrange(Pl, i) - j = JPl[jp] - vpl = VPl[jp] + for jp in nzrange(R, i) + j = JR[jp] + vpl = VR[jp] # loop over columns "k" in row j of B for kp in nzrange(A, j) k = JA[kp] @@ -1388,13 +1415,13 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,TiC}, end for jp in 1:lp j = JRA[jp] - for kp in nzrange(Pr,j) - k = JPr[kp] + for kp in nzrange(P,j) + k = JP[kp] if xbC[k] != i xbC[k] = i - xC[k] = xRA[j]*VPr[kp] + xC[k] = xRA[j]*VP[kp] else - xC[k] += xRA[j]*VPr[kp] + xC[k] += xRA[j]*VP[kp] end end end @@ -1409,45 +1436,46 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,TiC}, end # RARt variants -function rap(Pl::SparseMatrixCSR{Bi,Tv,TiA}, - A::SparseMatrixCSR{Bi,Tv,TiB}, - Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiC}}) where {Bi,Tv,TiA,TiB,TiC} - p,q = size(Pl) +function rap(R::SparseMatrixCSR, + A::SparseMatrixCSR, + Pt::Transpose{Tv,<:SparseMatrixCSR} where Tv) + p,q = size(R) m,r = size(A) - n,s = size(Prt) + n,s = size(Pt) if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end if r == n || throw(DimensionMismatch("Invalid dimensions for RA*P: ($p,$r)*($n,$s)"));end - rap(Pl,A,copy(Prt)) + rap(R,A,copy(Pt)) end -function rap(Pl::SparseMatrixCSR{Bi,Tv,TiA}, - A::SparseMatrixCSR{Bi,Tv,TiB}, - Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiC}},cache) where {Bi,Tv,TiA,TiB,TiC} - p,q = size(Pl) +function rap(R::SparseMatrixCSR, + A::SparseMatrixCSR, + Pt::Transpose{Tv,<:SparseMatrixCSR} where Tv, + cache) + p,q = size(R) m,r = size(A) - n,s = size(Prt) + n,s = size(Pt) if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end if r == n || throw(DimensionMismatch("Invalid dimensions for RA*P: ($p,$r)*($n,$s)"));end - 
rap(Pl,A,copy(Prt),cache) + rap(R,A,copy(Pt),cache) end -function rap!(C::SparseMatrixCSR{Bi,Tv,TiC}, - Pl::SparseMatrixCSR{Bi,Tv,TiPl}, - A::SparseMatrixCSR{Bi,Tv,TiA}, - Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiPr}}, - cache) where {Bi,Tv,TiC,TiPl,TiA,TiPr} - p,q = size(Pl) +function rap!(C::SparseMatrixCSR, + R::SparseMatrixCSR, + A::SparseMatrixCSR, + Pt::Transpose{Tv,<:SparseMatrixCSR} where Tv, + cache) + p,q = size(R) m,r = size(A) - n,s = size(Prt) + n,s = size(Pt) if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end - Pr = Prt.parent - JPl = colvals(Pl) - VPl = nonzeros(Pl) + P = Pt.parent + JR = colvals(R) + VR = nonzeros(R) JA = colvals(A) VA = nonzeros(A) - IPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. - VPr = nonzeros(Pr) + IP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. 
+ VP = nonzeros(P) JC = colvals(C) VC = nonzeros(C) # some cache items are present with the regular rap product in mind, which is how the allocating verison is performed @@ -1455,9 +1483,9 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,TiC}, xb .= 0 for i in 1:p # loop over columns "j" in row i of A - for jp in nzrange(Pl, i) - j = JPl[jp] - vpl = VPl[jp] + for jp in nzrange(R, i) + j = JR[jp] + vpl = VR[jp] # loop over columns "k" in row j of B for kp in nzrange(A, j) k = JA[kp] @@ -1470,40 +1498,40 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,TiC}, end end end - for jpPr in nzrange(C,i) - jPr = JC[jpPr] - v = Tv(0) - for ip in nzrange(Pr,jPr) - iPr = IPr[ip] - if xb[iPr] == i - v += x[iPr]*VPr[ip] + for jpP in nzrange(C,i) + jP = JC[jpP] + v = zero(eltype(C)) + for ip in nzrange(P,jP) + iP = IP[ip] + if xb[iP] == i + v += x[iP]*VP[ip] end end - VC[jpPr] = v + VC[jpP] = v end end C end -function rap!(C::SparseMatrixCSR{Bi,Tv,TiC}, - Pl::SparseMatrixCSR{Bi,Tv,TiPl}, - A::SparseMatrixCSR{Bi,Tv,TiA}, - Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiPr}}, +function rap!(C::SparseMatrixCSR, + R::SparseMatrixCSR, + A::SparseMatrixCSR, + Pt::Transpose{Tv,<:SparseMatrixCSR} where Tv, α::Number, β::Number, - cache) where {Bi,Tv,TiC,TiPl,TiA,TiPr} - p,q = size(Pl) + cache) + p,q = size(R) m,r = size(A) - n,s = size(Prt) + n,s = size(Pt) if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end - Pr = Prt.parent - JPl = colvals(Pl) - VPl = nonzeros(Pl) + P = Pt.parent + JR = colvals(R) + VR = nonzeros(R) JA = colvals(A) VA = nonzeros(A) - IPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed. - VPr = nonzeros(Pr) + IP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. 
+ VP = nonzeros(P) JC = colvals(C) VC = nonzeros(C) VC .*= β @@ -1512,9 +1540,9 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,TiC}, xb .= 0 for i in 1:p # loop over columns "j" in row i of A - for jp in nzrange(Pl, i) - j = JPl[jp] - vpl = VPl[jp] + for jp in nzrange(R, i) + j = JR[jp] + vpl = VR[jp] # loop over columns "k" in row j of B for kp in nzrange(A, j) k = JA[kp] @@ -1527,146 +1555,124 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,TiC}, end end end - for jpPr in nzrange(C,i) - jPr = JC[jpPr] - v = Tv(0) - for ip in nzrange(Pr,jPr) - iPr = IPr[ip] - if xb[iPr] == i - v += x[iPr]*VPr[ip] + for jpP in nzrange(C,i) + jP = JC[jpP] + v = zero(eltype(C)) + for ip in nzrange(P,jP) + iP = IP[ip] + if xb[iP] == i + v += x[iP]*VP[ip] end end - VC[jpPr] += α*v + VC[jpP] += α*v end end C end ### CSC in terms of CSR -function rap(A::SparseMatrixCSC{Tv,TiA}, - B::SparseMatrixCSC{Tv,TiB}, - C::SparseMatrixCSC{Tv,TiC}) where {Tv,TiA,TiB,TiC} +function rap(A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::SparseMatrixCSC) D,cache = rap(ascsr(C),ascsr(B),ascsr(A)) ascsc(D),cache end -function rap(A::SparseMatrixCSC{Tv,TiA}, - B::SparseMatrixCSC{Tv,TiB}, - C::SparseMatrixCSC{Tv,TiC}, - cache) where {Tv,TiA,TiB,TiC} +function rap(A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::SparseMatrixCSC, + cache) D,new_cache = rap(ascsr(C),ascsr(B),ascsr(A),cache) ascsc(D),new_cache end -function rap!(D::SparseMatrixCSC{Tv,TiD}, - A::SparseMatrixCSC{Tv,TiA}, - B::SparseMatrixCSC{Tv,TiB}, - C::SparseMatrixCSC{Tv,TiC}, - cache) where {Tv,TiD,TiA,TiB,TiC} +function rap!(D::SparseMatrixCSC, + A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::SparseMatrixCSC, + cache) rap!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),cache) D end -function rap!(D::SparseMatrixCSC{Tv,TiD}, - A::SparseMatrixCSC{Tv,TiA}, - B::SparseMatrixCSC{Tv,TiB}, - C::SparseMatrixCSC{Tv,TiC}, - cache::JaggedArray{X,Y} where {X<:Integer, Y<:Integer}, - acc) where {Tv,TiD,TiA,TiB,TiC} - rap!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),cache,acc) - D -end - 
-function rap!(D::SparseMatrixCSC{Tv,TiD}, - A::SparseMatrixCSC{Tv,TiA}, - B::SparseMatrixCSC{Tv,TiB}, - C::SparseMatrixCSC{Tv,TiC}, +function rap!(D::SparseMatrixCSC, + A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::SparseMatrixCSC, α::Number, β::Number, - cache) where {Tv,TiD,TiA,TiB,TiC} + cache) rap!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),α,β,cache) D end -function rap!(D::SparseMatrixCSC{Tv,TiD}, - A::SparseMatrixCSC{Tv,TiA}, - B::SparseMatrixCSC{Tv,TiB}, - C::SparseMatrixCSC{Tv,TiC}, - α::Number, - β::Number, - cache::JaggedArray{X,Y} where {X <: Integer, Y<:Integer}, - acc) where {Tv,TiD,TiA,TiB,TiC} - rap!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),α,β,cache,acc) - D -end - # PtAP -function rap(A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}}, - B::SparseMatrixCSC{Tv,TiB}, - C::SparseMatrixCSC{Tv,TiC}) where {Tv,TiA,TiB,TiC} +function rap(A::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC, + C::SparseMatrixCSC) D,cache = rap(ascsr(C),ascsr(B),transpose(ascsr(A.parent))) ascsc(D),cache end -function rap(A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}}, - B::SparseMatrixCSC{Tv,TiB}, - C::SparseMatrixCSC{Tv,TiC}, - cache) where {Tv,TiA,TiB,TiC} +function rap(A::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC, + C::SparseMatrixCSC, + cache) D,cache = rap(ascsr(C),ascsr(B),transpose(ascsr(A.parent)),cache) ascsc(D),cache end -function rap!(D::SparseMatrixCSC{Tv,TiD}, - A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}}, - B::SparseMatrixCSC{Tv,TiB}, - C::SparseMatrixCSC{Tv,TiC}, - cache) where {Tv,TiD,TiA,TiB,TiC} +function rap!(D::SparseMatrixCSC, + A::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC, + C::SparseMatrixCSC, + cache) rap!(ascsr(D),ascsr(C),ascsr(B),transpose(ascsr(A.parent)),cache) D end -function rap!(D::SparseMatrixCSC{Tv,TiD}, - A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}}, - B::SparseMatrixCSC{Tv,TiB}, - C::SparseMatrixCSC{Tv,TiC}, +function rap!(D::SparseMatrixCSC, + A::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC, + 
C::SparseMatrixCSC, α::Number, β::Number, - cache) where {Tv,TiD,TiA,TiB,TiC} + cache) rap!(ascsr(D),ascsr(C),ascsr(B),transpose(ascsr(A.parent)),α,β,cache) D end # RARt -function rap(A::SparseMatrixCSC{Tv,Ti}, - B::SparseMatrixCSC{Tv,Ti}, - C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}) where {Tv,Ti<:Integer} +function rap(A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::Transpose{Tv,<:SparseMatrixCSC} where Tv) D,new_cache = rap(transpose(ascsr(C.parent)),ascsr(B),ascsr(A)) ascsc(D),new_cache end -function rap(A::SparseMatrixCSC{Tv,Ti}, - B::SparseMatrixCSC{Tv,Ti}, - C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, - cache) where {Tv,Ti<:Integer} +function rap(A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::Transpose{Tv,<:SparseMatrixCSC} where Tv, + cache) D,new_cache = rap(transpose(ascsr(C.parent)),ascsr(B),ascsr(A),cache) ascsc(D),new_cache end -function rap!(D::SparseMatrixCSC{Tv,Ti}, - A::SparseMatrixCSC{Tv,Ti}, - B::SparseMatrixCSC{Tv,Ti}, - C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, - cache) where {Tv,Ti<:Integer} +function rap!(D::SparseMatrixCSC, + A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::Transpose{Tv,<:SparseMatrixCSC} where Tv, + cache) rap!(ascsr(D),transpose(ascsr(C.parent)),ascsr(B),ascsr(A),cache) D end -function rap!(D::SparseMatrixCSC{Tv,Ti}, - A::SparseMatrixCSC{Tv,Ti}, - B::SparseMatrixCSC{Tv,Ti}, - C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, +function rap!(D::SparseMatrixCSC, + A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::Transpose{Tv,<:SparseMatrixCSC} where Tv, α::Number, β::Number, - cache) where {Tv,Ti<:Integer} + cache) rap!(ascsr(D),transpose(ascsr(C.parent)),ascsr(B),ascsr(A),α,β,cache) D end \ No newline at end of file diff --git a/test/spmtmm_tests.jl b/test/spmtmm_tests.jl index d8d4e658..6d8ab9b0 100644 --- a/test/spmtmm_tests.jl +++ b/test/spmtmm_tests.jl @@ -3,6 +3,7 @@ using SparseMatricesCSR using PartitionedArrays using LinearAlgebra using Test +using InteractiveUtils function approx_equivalent(A::SparseMatrixCSC, 
B::SparseMatrixCSC,args...) if size(A) != size(B) && return false; end @@ -132,28 +133,98 @@ function parallel_tests(pA,pB,sparse_func) end end +function parallel_time(pA,pB,sparse_func) + A = centralize(sparse_func,pA) + B = centralize(sparse_func,pB) + # explicit parallel transpose + pBt = explicit_transpose(pB) |> fetch + Bt = centralize(sparse_func,pBt) + @test Bt == copy(transpose(B)) + hp_B = halfperm(B) + @test Bt == hp_B + + AB0 = A*B + C0 = transpose(B)*AB0 + # test basic sequential csr implementations to default csc sequential implementations. + pAB,cacheAB = spmm(pA,pB,reuse=true) + print("spmm:\t") + @time spmm(pA,pB,reuse=true) + + # pB will be transposed internally + pC,cacheC = spmtm(pB,pAB,reuse=true) + print("spmtm:\t") + @time spmtm(pB,pAB,reuse=true) + spmm!(pAB,pA,pB,cacheAB) + print("spmm!:\t") + @time spmm!(pAB,pA,pB,cacheAB) + spmtm!(pC,pB,pAB,cacheC) + print("spmtm!:\t") + @time spmtm!(pC,pB,pAB,cacheC) + # pC,cacheC = spmtmm(pA,pB) + pC,cacheC = spmtmm(pB,pA,pB,reuse=true) + print("spmtmm:\t") + # @time spmtmm(pA,pB) + @time spmtmm(pB,pA,pB,reuse=true) + # spmtmm!(pC,pA,pB,cacheC) + spmtmm!(pC,pB,pA,pB,cacheC) + print("spmtmm!:") + # @time spmtmm!(pC,pA,pB,cacheC) + @time spmtmm!(pC,pB,pA,pB,cacheC) + pC,cacheC = spmm(pBt,pAB,reuse=true) + print("spmm:\t") + @time spmm(pBt,pAB,reuse=true) + spmm!(pC,pBt,pAB,cacheC) + print("spmm!:\t") + @time spmm!(pC,pBt,pAB,cacheC) + + # pB will be transposed internally + pC,cacheC = spmmm(pBt,pA,pB,reuse=true) + print("spmmm: ") + @time spmmm(pBt,pA,pB,reuse=true) + spmmm!(pC,pBt,pA,pB,cacheC) + print("spmmm!:") + @time spmmm!(pC,pBt,pA,pB,cacheC) + + # @code_warntype spmmm!(pC,pBt,pA,pB,cacheC) + print("Local SpMM:\t") + C = A*B + @time C = A*B + X,cache = rap(Bt,A,B) + print("RAP:\t") + @time rap(Bt,A,B) + rap!(X,Bt,A,B,cache) + print("RAP!:\t") + @time rap!(X,Bt,A,B,cache) +end + +function Base.display(A::SparseMatrixCSR) + display(halfperm(A) |> PartitionedArrays.ascsc) +end + function 
spmtmm_tests(distribute) nodes_per_dir = (5,5,5) parts_per_dir = (1,2,2) np = prod(parts_per_dir) ranks = distribute(LinearIndices((np,))) - Ti = Int32 - pA = psparse(sparsecsr,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = Ti)...) |> fetch - pB = pA - parallel_tests(pA,pB,sparsecsr) - - # Testing with a real prolongator requires PartitionedSolvers - # T = eltype(typeof(own_own_values(pA).items)) - # pB = prolongator(T,pA) - # parallel_tests(pA,pB,sparsecsr) - - #### CSC #### - pA = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = Ti)...) |> fetch - pB = pA - parallel_tests(pA,pB,sparse) - - # Testing with a real prolongator requires PartitionedSolvers - # T = eltype(typeof(own_own_values(pA).items)) - # pB = prolongator(T,pA) - # parallel_tests(pA,pB,sparse) + for (TiA,TiB,TvA,TvB) in [(Int32,Int32,Float32,Float32),(Int32,Int64,Float32,Float32),(Int32,Int32,Float32,Float64),(Int32,Int64,Float32,Float64),(Int32,Int64,Int64,Int64),(Int32,Int64,Int64,Float32),(Int32,Int64,Float64,Int32)] + pA = psparse(sparsecsr,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks;index_type=TiA,value_type=TvA)...) |> fetch + pB = psparse(sparsecsr,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks;index_type=TiB,value_type=TvB)...) |> fetch + + parallel_tests(pA,pB,sparsecsr) + # Testing with a real prolongator requires PartitionedSolvers + # T = eltype(typeof(own_own_values(pA).items)) + # pB = prolongator(T,pA) + # parallel_tests(pA,pB,sparsecsr) + + #### CSC #### + pA = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = TiA, value_type=TvA)...) |> fetch + pB = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = TiB, value_type=TvB)...) 
|> fetch + + parallel_tests(pA,pB,sparse) + # Testing with a real prolongator requires PartitionedSolvers + # T = eltype(typeof(own_own_values(pA).items)) + # pB = prolongator(T,pA) + # parallel_tests(pA,pB,sparse) + # break + end end \ No newline at end of file diff --git a/times.txt b/times.txt index 4e9217e8..835d3320 100644 --- a/times.txt +++ b/times.txt @@ -1 +1 @@ -Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2073056, max = 0.2073056, avg = 0.2073056), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4125464, max = 0.4125464, avg = 0.4125464), "Phase 1" => (min = 1.0e-6, max = 1.0e-6, avg = 1.0e-6)) +Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2139673, max = 0.2139673, avg = 0.2139673), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4185178, max = 0.4185178, avg = 0.4185178), "Phase 1" => (min = 1.0e-6, max = 1.0e-6, avg = 1.0e-6)) From 1e48b64f1f54aed68482355e035cacab5e00d0a5 Mon Sep 17 00:00:00 2001 From: jop611 Date: Wed, 29 Jan 2025 12:36:01 +0100 Subject: [PATCH 30/34] fixed exported function name (rap vs RAP). --- src/PartitionedArrays.jl | 4 ++-- test/debug_array/runtests.jl | 20 ++++++++++---------- test/spmtmm_tests.jl | 5 ++--- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/PartitionedArrays.jl b/src/PartitionedArrays.jl index 2d250e0a..eaa88a17 100644 --- a/src/PartitionedArrays.jl +++ b/src/PartitionedArrays.jl @@ -211,8 +211,8 @@ export near_nullspace_linear_elasticity export prolongator include("gallery.jl") -export RAP -export RAP! +export rap +export rap! 
export -,+ include("sequential_implementations.jl") diff --git a/test/debug_array/runtests.jl b/test/debug_array/runtests.jl index a175b722..88abfdeb 100644 --- a/test/debug_array/runtests.jl +++ b/test/debug_array/runtests.jl @@ -3,25 +3,25 @@ module DebugArrayRunTests using Test using PartitionedArrays -@testset "debug_array" begin include("debug_array_tests.jl") end +# @testset "debug_array" begin include("debug_array_tests.jl") end -@testset "primitives" begin include("primitives_tests.jl") end +# @testset "primitives" begin include("primitives_tests.jl") end -@testset "p_range" begin include("p_range_tests.jl") end +# @testset "p_range" begin include("p_range_tests.jl") end -@testset "p_vector" begin include("p_vector_tests.jl") end +# @testset "p_vector" begin include("p_vector_tests.jl") end -@testset "p_sparse_matrix" begin include("p_sparse_matrix_tests.jl") end +# @testset "p_sparse_matrix" begin include("p_sparse_matrix_tests.jl") end -@testset "block_arrays" begin include("block_arrays_tests.jl") end +# @testset "block_arrays" begin include("block_arrays_tests.jl") end -@testset "gallery" begin include("gallery_tests.jl") end +# @testset "gallery" begin include("gallery_tests.jl") end -@testset "p_timer" begin include("p_timer_tests.jl") end +# @testset "p_timer" begin include("p_timer_tests.jl") end -@testset "fdm_example" begin include("fdm_example.jl") end +# @testset "fdm_example" begin include("fdm_example.jl") end -@testset "fem_example" begin include("fem_example.jl") end +# @testset "fem_example" begin include("fem_example.jl") end @testset "spmtmm_tests" begin include("spmtmm_tests.jl") end diff --git a/test/spmtmm_tests.jl b/test/spmtmm_tests.jl index 6d8ab9b0..1ceceeea 100644 --- a/test/spmtmm_tests.jl +++ b/test/spmtmm_tests.jl @@ -3,7 +3,6 @@ using SparseMatricesCSR using PartitionedArrays using LinearAlgebra using Test -using InteractiveUtils function approx_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSC,args...) 
if size(A) != size(B) && return false; end @@ -209,8 +208,8 @@ function spmtmm_tests(distribute) for (TiA,TiB,TvA,TvB) in [(Int32,Int32,Float32,Float32),(Int32,Int64,Float32,Float32),(Int32,Int32,Float32,Float64),(Int32,Int64,Float32,Float64),(Int32,Int64,Int64,Int64),(Int32,Int64,Int64,Float32),(Int32,Int64,Float64,Int32)] pA = psparse(sparsecsr,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks;index_type=TiA,value_type=TvA)...) |> fetch pB = psparse(sparsecsr,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks;index_type=TiB,value_type=TvB)...) |> fetch - parallel_tests(pA,pB,sparsecsr) + # Testing with a real prolongator requires PartitionedSolvers # T = eltype(typeof(own_own_values(pA).items)) # pB = prolongator(T,pA) @@ -219,8 +218,8 @@ function spmtmm_tests(distribute) #### CSC #### pA = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = TiA, value_type=TvA)...) |> fetch pB = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = TiB, value_type=TvB)...) 
|> fetch - parallel_tests(pA,pB,sparse) + # Testing with a real prolongator requires PartitionedSolvers # T = eltype(typeof(own_own_values(pA).items)) # pB = prolongator(T,pA) From 33840d8b0b5f87b44451033b291d1795ed23d5db Mon Sep 17 00:00:00 2001 From: jop611 Date: Wed, 29 Jan 2025 13:29:12 +0100 Subject: [PATCH 31/34] workaround for absence of 'reuse' kwargs in local sparse matrix multiplications algorithms --- src/sequential_implementations.jl | 11 +++ test/mpi_array/runtests.jl | 20 ++--- test/spmtmm_tests.jl | 136 +++++++++++++++--------------- 3 files changed, 89 insertions(+), 78 deletions(-) diff --git a/src/sequential_implementations.jl b/src/sequential_implementations.jl index 3585e38b..49540046 100644 --- a/src/sequential_implementations.jl +++ b/src/sequential_implementations.jl @@ -660,6 +660,17 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR, C end +function rap(A::Union{Transpose{TA,<:AbstractSparseMatrix},<:AbstractSparseMatrix} where TA, + B::M where M<:AbstractSparseMatrix, + C::Union{Transpose{TC,<:AbstractSparseMatrix},<:AbstractSparseMatrix} where TC + ;reuse=Val(true)) + D,cache = rap(A,B,C) + if val_parameter(reuse) + return D,cache + end + D +end + # PtAP variants function rap(Rt::Transpose{TvR,SparseMatrixCSR{Bi,TvR,TiR}}, A::SparseMatrixCSR{Bi,TvA,TiA}, diff --git a/test/mpi_array/runtests.jl b/test/mpi_array/runtests.jl index fc6f0aee..ffdc1f1e 100644 --- a/test/mpi_array/runtests.jl +++ b/test/mpi_array/runtests.jl @@ -3,16 +3,16 @@ module MPIArrayRunTests using Test using PartitionedArrays -# @testset "mpi_array" begin include("mpi_array_tests.jl") end -# @testset "primitives" begin include("primitives_tests.jl") end -# @testset "p_range_tests" begin include("p_range_tests.jl") end -# @testset "p_vector_tests" begin include("p_vector_tests.jl") end -# @testset "p_sparse_matrix_tests" begin include("p_sparse_matrix_tests.jl") end -# @testset "gallery" begin include("gallery_tests.jl") end -# @testset "block_arrays" begin 
include("block_arrays_tests.jl") end -# @testset "p_timer_tests" begin include("p_timer_tests.jl") end -# @testset "fdm_example" begin include("fdm_example.jl") end -# @testset "fem_example" begin include("fem_example.jl") end +@testset "mpi_array" begin include("mpi_array_tests.jl") end +@testset "primitives" begin include("primitives_tests.jl") end +@testset "p_range_tests" begin include("p_range_tests.jl") end +@testset "p_vector_tests" begin include("p_vector_tests.jl") end +@testset "p_sparse_matrix_tests" begin include("p_sparse_matrix_tests.jl") end +@testset "gallery" begin include("gallery_tests.jl") end +@testset "block_arrays" begin include("block_arrays_tests.jl") end +@testset "p_timer_tests" begin include("p_timer_tests.jl") end +@testset "fdm_example" begin include("fdm_example.jl") end +@testset "fem_example" begin include("fem_example.jl") end @testset "spmtmm_tests" begin include("spmtmm_tests.jl") end end #module diff --git a/test/spmtmm_tests.jl b/test/spmtmm_tests.jl index 1ceceeea..aacfdcad 100644 --- a/test/spmtmm_tests.jl +++ b/test/spmtmm_tests.jl @@ -132,73 +132,73 @@ function parallel_tests(pA,pB,sparse_func) end end -function parallel_time(pA,pB,sparse_func) - A = centralize(sparse_func,pA) - B = centralize(sparse_func,pB) - # explicit parallel transpose - pBt = explicit_transpose(pB) |> fetch - Bt = centralize(sparse_func,pBt) - @test Bt == copy(transpose(B)) - hp_B = halfperm(B) - @test Bt == hp_B - - AB0 = A*B - C0 = transpose(B)*AB0 - # test basic sequential csr implementations to default csc sequential implementations. 
- pAB,cacheAB = spmm(pA,pB,reuse=true) - print("spmm:\t") - @time spmm(pA,pB,reuse=true) +# function parallel_time(pA,pB,sparse_func) +# A = centralize(sparse_func,pA) +# B = centralize(sparse_func,pB) +# # explicit parallel transpose +# pBt = explicit_transpose(pB) |> fetch +# Bt = centralize(sparse_func,pBt) +# @test Bt == copy(transpose(B)) +# hp_B = halfperm(B) +# @test Bt == hp_B + +# AB0 = A*B +# C0 = transpose(B)*AB0 +# # test basic sequential csr implementations to default csc sequential implementations. +# pAB,cacheAB = spmm(pA,pB,reuse=true) +# print("spmm:\t") +# @time spmm(pA,pB,reuse=true) - # pB will be transposed internally - pC,cacheC = spmtm(pB,pAB,reuse=true) - print("spmtm:\t") - @time spmtm(pB,pAB,reuse=true) - spmm!(pAB,pA,pB,cacheAB) - print("spmm!:\t") - @time spmm!(pAB,pA,pB,cacheAB) - spmtm!(pC,pB,pAB,cacheC) - print("spmtm!:\t") - @time spmtm!(pC,pB,pAB,cacheC) - # pC,cacheC = spmtmm(pA,pB) - pC,cacheC = spmtmm(pB,pA,pB,reuse=true) - print("spmtmm:\t") - # @time spmtmm(pA,pB) - @time spmtmm(pB,pA,pB,reuse=true) - # spmtmm!(pC,pA,pB,cacheC) - spmtmm!(pC,pB,pA,pB,cacheC) - print("spmtmm!:") - # @time spmtmm!(pC,pA,pB,cacheC) - @time spmtmm!(pC,pB,pA,pB,cacheC) - pC,cacheC = spmm(pBt,pAB,reuse=true) - print("spmm:\t") - @time spmm(pBt,pAB,reuse=true) - spmm!(pC,pBt,pAB,cacheC) - print("spmm!:\t") - @time spmm!(pC,pBt,pAB,cacheC) - - # pB will be transposed internally - pC,cacheC = spmmm(pBt,pA,pB,reuse=true) - print("spmmm: ") - @time spmmm(pBt,pA,pB,reuse=true) - spmmm!(pC,pBt,pA,pB,cacheC) - print("spmmm!:") - @time spmmm!(pC,pBt,pA,pB,cacheC) - - # @code_warntype spmmm!(pC,pBt,pA,pB,cacheC) - print("Local SpMM:\t") - C = A*B - @time C = A*B - X,cache = rap(Bt,A,B) - print("RAP:\t") - @time rap(Bt,A,B) - rap!(X,Bt,A,B,cache) - print("RAP!:\t") - @time rap!(X,Bt,A,B,cache) -end - -function Base.display(A::SparseMatrixCSR) - display(halfperm(A) |> PartitionedArrays.ascsc) -end +# # pB will be transposed internally +# pC,cacheC = 
spmtm(pB,pAB,reuse=true) +# print("spmtm:\t") +# @time spmtm(pB,pAB,reuse=true) +# spmm!(pAB,pA,pB,cacheAB) +# print("spmm!:\t") +# @time spmm!(pAB,pA,pB,cacheAB) +# spmtm!(pC,pB,pAB,cacheC) +# print("spmtm!:\t") +# @time spmtm!(pC,pB,pAB,cacheC) +# # pC,cacheC = spmtmm(pA,pB) +# pC,cacheC = spmtmm(pB,pA,pB,reuse=true) +# print("spmtmm:\t") +# # @time spmtmm(pA,pB) +# @time spmtmm(pB,pA,pB,reuse=true) +# # spmtmm!(pC,pA,pB,cacheC) +# spmtmm!(pC,pB,pA,pB,cacheC) +# print("spmtmm!:") +# # @time spmtmm!(pC,pA,pB,cacheC) +# @time spmtmm!(pC,pB,pA,pB,cacheC) +# pC,cacheC = spmm(pBt,pAB,reuse=true) +# print("spmm:\t") +# @time spmm(pBt,pAB,reuse=true) +# spmm!(pC,pBt,pAB,cacheC) +# print("spmm!:\t") +# @time spmm!(pC,pBt,pAB,cacheC) + +# # pB will be transposed internally +# pC,cacheC = spmmm(pBt,pA,pB,reuse=true) +# print("spmmm: ") +# @time spmmm(pBt,pA,pB,reuse=true) +# spmmm!(pC,pBt,pA,pB,cacheC) +# print("spmmm!:") +# @time spmmm!(pC,pBt,pA,pB,cacheC) + +# # @code_warntype spmmm!(pC,pBt,pA,pB,cacheC) +# print("Local SpMM:\t") +# C = A*B +# @time C = A*B +# X,cache = rap(Bt,A,B) +# print("RAP:\t") +# @time rap(Bt,A,B) +# rap!(X,Bt,A,B,cache) +# print("RAP!:\t") +# @time rap!(X,Bt,A,B,cache) +# end + +# function Base.display(A::SparseMatrixCSR) +# display(halfperm(A) |> PartitionedArrays.ascsc) +# end function spmtmm_tests(distribute) nodes_per_dir = (5,5,5) @@ -209,7 +209,7 @@ function spmtmm_tests(distribute) pA = psparse(sparsecsr,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks;index_type=TiA,value_type=TvA)...) |> fetch pB = psparse(sparsecsr,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks;index_type=TiB,value_type=TvB)...) |> fetch parallel_tests(pA,pB,sparsecsr) - + # Testing with a real prolongator requires PartitionedSolvers # T = eltype(typeof(own_own_values(pA).items)) # pB = prolongator(T,pA) @@ -219,7 +219,7 @@ function spmtmm_tests(distribute) pA = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = TiA, value_type=TvA)...) 
|> fetch pB = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = TiB, value_type=TvB)...) |> fetch parallel_tests(pA,pB,sparse) - + # Testing with a real prolongator requires PartitionedSolvers # T = eltype(typeof(own_own_values(pA).items)) # pB = prolongator(T,pA) From 1816fd739002fa378f8c5025b266c7a0c084d8b9 Mon Sep 17 00:00:00 2001 From: jop611 Date: Wed, 29 Jan 2025 15:41:52 +0100 Subject: [PATCH 32/34] fixed type piracy --- src/PartitionedArrays.jl | 6 +- src/p_sparse_matrix.jl | 71 +++++++++++--------- src/sequential_implementations.jl | 106 +++++++++++++++++------------- test/debug_array/runtests.jl | 20 +++--- test/debug_array/spmtmm_tests.jl | 10 +-- test/spmtmm_tests.jl | 8 +-- times.txt | 2 +- 7 files changed, 129 insertions(+), 94 deletions(-) diff --git a/src/PartitionedArrays.jl b/src/PartitionedArrays.jl index eaa88a17..05746ce7 100644 --- a/src/PartitionedArrays.jl +++ b/src/PartitionedArrays.jl @@ -211,9 +211,13 @@ export near_nullspace_linear_elasticity export prolongator include("gallery.jl") +export add +export subtract +export mul +export matmul +export matmul! export rap export rap! 
-export -,+ include("sequential_implementations.jl") end # module diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index 560ba66a..196bdb90 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -2351,20 +2351,22 @@ end ### NEW ### function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) + @assert A.assembled + @assert B.assembled t = consistent(B,partition(axes(A,2)),reuse=true) A_own_own = own_own_values(A) A_own_ghost = own_ghost_values(A) - C_own_own_1 = map(*,A_own_own,own_own_values(B)) + C_own_own_1 = map(matmul,A_own_own,own_own_values(B)) # Wait for consistent B2, cacheB2 = fetch(t) - C_own_ghost_1 = map(*,A_own_own,own_ghost_values(B2)) - C_own_own_2 = map(*,A_own_ghost,ghost_own_values(B2)) - C_own_ghost_2 = map(*,A_own_ghost,ghost_ghost_values(B2)) + C_own_ghost_1 = map(matmul,A_own_own,own_ghost_values(B2)) + C_own_own_2 = map(matmul,A_own_ghost,ghost_own_values(B2)) + C_own_ghost_2 = map(matmul,A_own_ghost,ghost_ghost_values(B2)) - C_own_own = map(+, C_own_own_1, C_own_own_2) - C_own_ghost = map(+, C_own_ghost_1, C_own_ghost_2) + C_own_own = map(add, C_own_own_1, C_own_own_2) + C_own_ghost = map(add, C_own_ghost_1, C_own_ghost_2) Coo_cache = map(construct_spmm_cache, C_own_own) Cog_cache = map(construct_spmm_cache, C_own_ghost) @@ -2392,12 +2394,12 @@ function spmm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) C_own_own = own_own_values(C) C_own_ghost = own_ghost_values(C) - map(mul!, C_own_own, A_own_own, own_own_values(B),Coo_cache) + map(matmul!, C_own_own, A_own_own, own_own_values(B),Coo_cache) wait(t) - map(mul!, C_own_ghost, A_own_own, own_ghost_values(B2),Cog_cache) + map(matmul!, C_own_ghost, A_own_own, own_ghost_values(B2),Cog_cache) - map((C,A,B,cache) -> mul!(C,A,B,1,1,cache), C_own_own,A_own_ghost,ghost_own_values(B2),Coo_cache) - map((C,A,B,cache) -> mul!(C,A,B,1,1,cache), C_own_ghost,A_own_ghost,ghost_ghost_values(B2),Cog_cache) + map((C,A,B,cache) -> matmul!(C,A,B,1,1,cache), 
C_own_own,A_own_ghost,ghost_own_values(B2),Coo_cache) + map((C,A,B,cache) -> matmul!(C,A,B,1,1,cache), C_own_ghost,A_own_ghost,ghost_ghost_values(B2),Cog_cache) C end ### End NEW ### @@ -2440,13 +2442,15 @@ end ### NEW ### function spmtm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) + @assert A.assembled + @assert B.assembled Aoo = own_own_values(A) Aog = own_ghost_values(A) Boo = own_own_values(B) Bog = own_ghost_values(B) - C1go = map((A,B)->transpose(A)*B,Aog,Boo) - C1gg = map((A,B)->transpose(A)*B,Aog,Bog) + C1go = map((A,B)->matmul(transpose(A),B),Aog,Boo) + C1gg = map((A,B)->matmul(transpose(A),B),Aog,Bog) C1_values = map(C1go, C1gg, partition(A), partition(B)) do ghost_own, ghost_ghost, A_part, B_part own_own = similar(ghost_ghost, size(A_part.blocks.own_own, 2), size(B_part.blocks.own_own, 2)) @@ -2459,8 +2463,8 @@ function spmtm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) C1_unassembled = PSparseMatrix(C1_values,partition(axes(A,2)),partition(axes(B,2)),assembled) t = assemble(C1_unassembled,reuse=true) - C2oo = map((A,B)->transpose(A)*B,Aoo,Boo) - C2og = map((A,B)->transpose(A)*B,Aoo,Bog) + C2oo = map((A,B)->matmul(transpose(A),B),Aoo,Boo) + C2og = map((A,B)->matmul(transpose(A),B),Aoo,Bog) C2_values = map(C2oo, C2og, partition(A), partition(B)) do own_own, own_ghost, A_part, B_part ghost_own = similar(own_own,0,size(own_own,2)) @@ -2497,12 +2501,12 @@ function spmtm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) Boo = own_own_values(B) Bog = own_ghost_values(B) - map((C,A,B,cache)->mul!(C,transpose(A),B,cache),ghost_own_values(C1_unassembled),Aog,Boo,Cgo_cache) - map((C,A,B,cache)->mul!(C,transpose(A),B,cache),ghost_ghost_values(C1_unassembled),Aog,Bog,Cgg_cache) + map((C,A,B,cache)->matmul!(C,transpose(A),B,cache),ghost_own_values(C1_unassembled),Aog,Boo,Cgo_cache) + map((C,A,B,cache)->matmul!(C,transpose(A),B,cache),ghost_ghost_values(C1_unassembled),Aog,Bog,Cgg_cache) t = assemble!(C1, C1_unassembled, assemblyCache) - 
map((C,A,B,cache)->mul!(C,transpose(A),B,cache),own_own_values(C2),Aoo,Boo,Coo_cache) - map((C,A,B,cache)->mul!(C,transpose(A),B,cache),own_ghost_values(C2),Aoo,Bog,Cog_cache) + map((C,A,B,cache)->matmul!(C,transpose(A),B,cache),own_own_values(C2),Aoo,Boo,Coo_cache) + map((C,A,B,cache)->matmul!(C,transpose(A),B,cache),own_ghost_values(C2),Aoo,Bog,Cog_cache) wait(t) add!(C, C1, C2, mergeCache) C @@ -3059,7 +3063,7 @@ end function add(A::PSparseMatrix,B::PSparseMatrix) function add_own_own(A,B) - C = A+B + C = add(A,B) # reuse IA/IB for cache KA = precompute_nzindex(C,A) KB = precompute_nzindex(C,B) @@ -3129,6 +3133,9 @@ end # Interpret A as if its transpose is needed function spmtmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(false)) + @assert A.assembled + @assert B.assembled + @assert C.assembled consistency_task = consistent(C, partition(axes(B,2)),reuse=true) Aoo = own_own_values(A) @@ -3152,8 +3159,8 @@ function spmtmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fal Dog1, Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aog,Boo,Cog2,Dgo_cache) |> tuple_of_arrays Dog2, Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aog,Bog,Cgg,Dog_cache) |> tuple_of_arrays - Dgo = map(+,Dgo1,Dgo2) # different sparsity patterns so not in-place. - Dog = map(+,Dog1,Dog2) + Dgo = map(add,Dgo1,Dgo2) # different sparsity patterns so not in-place. 
+ Dog = map(add,Dog1,Dog2) D1_values = map(Dgo, Dog, partition(C), partition(C2)) do ghost_own, ghost_ghost, C_part, C2_part own_own = similar(ghost_ghost, size(C_part.blocks.own_own, 2), size(C2_part.blocks.own_own, 2)) @@ -3168,8 +3175,8 @@ function spmtmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fal Doo2,Doo_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aoo,Bog,Cgo,Doo_cache) |> tuple_of_arrays Dog2,Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aoo,Bog,Cgg,Dog_cache) |> tuple_of_arrays - Doo = map(+,Doo1,Doo2) - Dog = map(+,Dog1,Dog2) + Doo = map(add,Doo1,Doo2) + Dog = map(add,Dog1,Dog2) Doo_cache_final = map((cache,D)->reduce_spmtmm_cache(cache,typeof(D)),Doo_cache,Doo) Dog_cache_final = map((cache,D)->reduce_spmtmm_cache(cache,typeof(D)),Dog_cache,Dog) @@ -3195,6 +3202,8 @@ function spmtmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fal end function spmtmm(A::PSparseMatrix,P::PSparseMatrix;kwargs...) + @assert A.assembled + @assert P.assembled spmtmm(transpose(P),A,P;kwargs...) 
end @@ -3226,7 +3235,6 @@ function spmtmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,C::PSparseMa Cgg = ghost_ghost_values(C2) map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,cache), Dgg,Aog,Boo,Cog2,Dgg_cache) - map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,1,1,cache), Dgo,Aog,Bog,Cgo,Dgo_cache) map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,1,1,cache), Dgg,Aog,Bog,Cgg,Dgg_cache) @@ -3246,6 +3254,9 @@ function spmtmm!(C::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix,cache) end function spmmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(false)) + @assert A.assembled + @assert B.assembled + @assert C.assembled B2_task = consistent(B,partition(axes(A,2)),reuse=true) Aoo = own_own_values(A) Aog = own_ghost_values(A) @@ -3261,7 +3272,7 @@ function spmmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fals Bgg = ghost_ghost_values(B2) Doo2,Doo_cache = map(rap,Aog,Bgo,Coo,Doo_cache) |> tuple_of_arrays - Doo12 = map(+,Doo1,Doo2) + Doo12 = map(add,Doo1,Doo2) C2, Ccache = fetch(C2_task) @@ -3272,17 +3283,17 @@ function spmmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fals Doo3,Doo_cache = map(rap,Aoo,Bog,Cgo,Doo_cache) |> tuple_of_arrays Doo4,Doo_cache = map(rap,Aog,Bgg,Cgo,Doo_cache) |> tuple_of_arrays - Doo34 = map(+,Doo3,Doo4) - Doo = map(+,Doo12,Doo34) + Doo34 = map(add,Doo3,Doo4) + Doo = map(add,Doo12,Doo34) Dog1,Dog_cache = map(rap,Aoo,Boo,Cog) |> tuple_of_arrays Dog2,Dog_cache = map(rap,Aog,Bgo,Cog,Dog_cache) |> tuple_of_arrays Dog3,Dog_cache = map(rap,Aoo,Bog,Cgg,Dog_cache) |> tuple_of_arrays Dog4,Dog_cache = map(rap,Aog,Bgg,Cgg,Dog_cache) |> tuple_of_arrays - Dog12 = map(+,Dog1,Dog2) - Dog34 = map(+,Dog3,Dog4) - Dog = map(+,Dog12,Dog34) + Dog12 = map(add,Dog1,Dog2) + Dog34 = map(add,Dog3,Dog4) + Dog = map(add,Dog12,Dog34) D_values = map(Doo, Dog, partition(A),partition(C2)) do own_own, own_ghost, A_part,C_part ghost_own = similar(own_own,0,size(own_own, 2)) diff --git a/src/sequential_implementations.jl 
b/src/sequential_implementations.jl index 49540046..f69da7b1 100644 --- a/src/sequential_implementations.jl +++ b/src/sequential_implementations.jl @@ -1,34 +1,52 @@ -function Base.:*(A::SparseMatrixCSR,B::SparseMatrixCSR) - C = ascsc(B)*ascsc(A) +function matmul(A::SparseMatrixCSC,B::SparseMatrixCSC) + A*B +end + +function matmul(A::Transpose{Tv,<:SparseMatrixCSC} where Tv,B::SparseMatrixCSC) + A*B +end + +function matmul(A::SparseMatrixCSC,B::Transpose{Tv,<:SparseMatrixCSC} where Tv) + A*B +end + +function matmul(A::Transpose{TvA,<:SparseMatrixCSC} where TvA,B::Transpose{TvB,<:SparseMatrixCSC} where TvB) + A*B +end + +function matmul(A::SparseMatrixCSR,B::SparseMatrixCSR) + C = matmul(ascsc(B),ascsc(A)) ascsr(C) end -function Base.:*(At::Transpose{Tv,<:SparseMatrixCSR} where Tv,B::SparseMatrixCSR) - C = ascsc(B)*transpose(ascsc(At.parent)) +function matmul(At::Transpose{Tv,<:SparseMatrixCSR} where Tv,B::SparseMatrixCSR) + C = matmul(ascsc(B),transpose(ascsc(At.parent))) ascsr(C) end -function Base.:*(A::SparseMatrixCSR,Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv) +function matmul(A::SparseMatrixCSR,Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv) C = transpose(ascsc(Bt.parent))*ascsc(A) ascsr(C) end -function Base.:*(At::Transpose{TvA,<:SparseMatrixCSR} where TvA,Bt::Transpose{TvB,<:SparseMatrixCSR} where TvB) +function matmul(At::Transpose{TvA,<:SparseMatrixCSR} where TvA,Bt::Transpose{TvB,<:SparseMatrixCSR} where TvB) C = transpose(ascsc(Bt.parent))*transpose(ascsc(At.parent)) ascsr(C) end -function Base.:*(x::Number,A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} +function mul(x::Number,A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(A.colval), map(a -> x*a, A.nzval)) end -function Base.:*(A::SparseMatrixCSR,x::Number) *(x,A) end -function Base.:/(A::SparseMatrixCSR{Bi,Tv,Ti},x::Number) where {Bi,Tv,Ti} - SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(A.colval), map(a -> a/x, A.nzval)) -end +function 
mul(A::SparseMatrixCSR,x::Number) mul(x,A) end + + +# function quotient(A::SparseMatrixCSR{Bi,Tv,Ti},x::Number) where {Bi,Tv,Ti} +# SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(A.colval), map(a -> a/x, A.nzval)) +# end # Alternative to lazy csr to csc for matrix addition that does not drop structural zeros. -function Base.:+(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TvB,TiA,TiB} +function add(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TvB,TiA,TiB} if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end Ti = promote_type(TiA,TiB) Tv = promote_type(TvA,TvB) @@ -88,7 +106,7 @@ function Base.:+(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) end # Alternative to lazy csr to csc for matrix subtraction that does not drop structural zeros. Subtracts B from A, i.e. A - B. -function Base.:-(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TvB,TiA,TiB} +function subtract(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TvB,TiA,TiB} if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end Ti = promote_type(TiA,TiB) Tv = promote_type(TvA,TvB) @@ -147,12 +165,12 @@ function Base.:-(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) SparseMatrixCSR{Bi}(p,r,IC,JC,VC) # A += B end -function Base.:-(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} +function subtract(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(A.colval), map(a->-a, A.nzval)) end # Alternative to lazy csr to csc for matrix addition that does not drop structural zeros. 
-function Base.:+(A::SparseMatrixCSC{TvA,TiA},B::SparseMatrixCSC{TvB,TiB}) where {TvA,TvB,TiA,TiB} +function add(A::SparseMatrixCSC{TvA,TiA},B::SparseMatrixCSC{TvB,TiB}) where {TvA,TvB,TiA,TiB} if size(A) != size(B) && throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end Ti = promote_type(TiA,TiB) Tv = promote_type(TvA,TvB) @@ -212,7 +230,7 @@ function Base.:+(A::SparseMatrixCSC{TvA,TiA},B::SparseMatrixCSC{TvB,TiB}) where end # Alternative to lazy csr to csc for matrix subtraction that does not drop structural zeros. Subtracts B from A, i.e. A - B. -function Base.:-(A::SparseMatrixCSC{TvA,TiA},B::SparseMatrixCSC{TvB,TiB}) where {TvA,TvB,TiA,TiB} +function subtract(A::SparseMatrixCSC{TvA,TiA},B::SparseMatrixCSC{TvB,TiB}) where {TvA,TvB,TiA,TiB} if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end Ti = promote_type(TiA,TiB) Tv = promote_type(TvA,TvB) @@ -271,33 +289,33 @@ function Base.:-(A::SparseMatrixCSC{TvA,TiA},B::SparseMatrixCSC{TvB,TiB}) where SparseMatrixCSC{Tv,Ti}(p,q,JC,IC,VC) end -function Base.:-(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} +function subtract(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} SparseMatrixCSC{Tv,Ti}(size(A)..., copy(A.colptr), copy(A.rowval), map(a->-a, A.nzval)) end -function LinearAlgebra.mul!(C::SparseMatrixCSC, +function matmul!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC, cache) - mul!(ascsr(C),ascsr(B),ascsr(A),cache) + matmul!(ascsr(C),ascsr(B),ascsr(A),cache) C end -function LinearAlgebra.mul!(C::SparseMatrixCSC, +function matmul!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC, α::Number, β::Number, cache) - mul!(ascsr(C),ascsr(B),ascsr(A),α,β,cache) + matmul!(ascsr(C),ascsr(B),ascsr(A),α,β,cache) C end -function LinearAlgebra.mul!(C::SparseMatrixCSC, - At::Transpose{Tv,<:SparseMatrixCSC} where Tv, - B::SparseMatrixCSC) +function matmul!(C::SparseMatrixCSC, + At::Transpose{Tv,<:SparseMatrixCSC} where Tv, + 
B::SparseMatrixCSC) a,b = size(C) p,q = size(At) r,s = size(B) @@ -342,7 +360,7 @@ function LinearAlgebra.mul!(C::SparseMatrixCSC, C end -function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, +function matmul!(C::SparseMatrixCSC{Tv,Ti}, At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, B::SparseMatrixCSC{Tv,Ti}, α::Number, @@ -389,14 +407,14 @@ function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, C end -function LinearAlgebra.mul!(C::SparseMatrixCSC, +function matmul!(C::SparseMatrixCSC, A::SparseMatrixCSC, Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv) - mul!(ascsr(C),transpose(ascsr(B)),ascsr(A)) + matmul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A)) C end -function LinearAlgebra.mul!(C::SparseMatrixCSR, +function matmul!(C::SparseMatrixCSR, A::SparseMatrixCSR, B::SparseMatrixCSR, cache) @@ -446,15 +464,15 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR, C end -function LinearAlgebra.mul!(C::SparseMatrixCSC, +function matmul!(C::SparseMatrixCSC, A::SparseMatrixCSC, Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv, cache) - mul!(ascsr(C),transpose(ascsr(B)),ascsr(A),cache) + matmul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A),cache) C end -function LinearAlgebra.mul!(C::SparseMatrixCSR, +function matmul!(C::SparseMatrixCSR, A::SparseMatrixCSR, B::SparseMatrixCSR, α::Number, @@ -506,35 +524,35 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR, C end -function LinearAlgebra.mul!(C::SparseMatrixCSC, +function matmul!(C::SparseMatrixCSC, A::SparseMatrixCSC, Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv, α::Number, β::Number, cache) - mul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A),α,β,cache) + matmul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A),α,β,cache) C end -function LinearAlgebra.mul!(C::SparseMatrixCSC, +function matmul!(C::SparseMatrixCSC, At::Transpose{Tv,<:SparseMatrixCSC} where Tv, B::SparseMatrixCSC, cache) - mul!(ascsr(C),ascsr(B),transpose(ascsr(At.parent))) + matmul!(ascsr(C),ascsr(B),transpose(ascsr(At.parent))) C end -function 
LinearAlgebra.mul!(C::SparseMatrixCSC, +function matmul!(C::SparseMatrixCSC, At::Transpose{Tv,<:SparseMatrixCSC} where Tv, B::SparseMatrixCSC, α::Number, β::Number, cache) - mul!(ascsr(C),ascsr(A),transpose(ascsr(At.parent)),α,β) + matmul!(ascsr(C),ascsr(A),transpose(ascsr(At.parent)),α,β) C end -# Workaround to supply in-place mul! with auxiliary array, as these are not returned by multiply function exported by SparseArrays +# Workaround to supply in-place matmul with auxiliary array, as these are not returned by multiply function exported by SparseArrays function construct_spmm_cache(A::SparseMatrixCSR{Bi,Tv,Ti} where Bi) where {Tv,Ti} q = size(A,2) xb = zeros(Ti,q) @@ -556,7 +574,7 @@ function construct_spmtm_cache(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} construct_spmtm_cache(ascsr(A)) end -function LinearAlgebra.mul!(C::SparseMatrixCSR, +function matmul!(C::SparseMatrixCSR, At::Transpose{Tv,<:SparseMatrixCSR} where Tv, B::SparseMatrixCSR, cache) @@ -599,7 +617,7 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR, C end -function LinearAlgebra.mul!(C::SparseMatrixCSR, +function matmul!(C::SparseMatrixCSR, At::Transpose{Tv,<:SparseMatrixCSR} where Tv, B::SparseMatrixCSR, α::Number, @@ -644,19 +662,19 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR, C end -function LinearAlgebra.mul!(C::SparseMatrixCSR, +function matmul!(C::SparseMatrixCSR, A::SparseMatrixCSR, Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv) - mul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A)) + matmul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A)) C end -function LinearAlgebra.mul!(C::SparseMatrixCSR, +function matmul!(C::SparseMatrixCSR, A::SparseMatrixCSR, Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv, α::Number, β::Number) - mul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A), α, β) + matmul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A), α, β) C end diff --git a/test/debug_array/runtests.jl b/test/debug_array/runtests.jl index 88abfdeb..a175b722 100644 --- 
a/test/debug_array/runtests.jl +++ b/test/debug_array/runtests.jl @@ -3,25 +3,25 @@ module DebugArrayRunTests using Test using PartitionedArrays -# @testset "debug_array" begin include("debug_array_tests.jl") end +@testset "debug_array" begin include("debug_array_tests.jl") end -# @testset "primitives" begin include("primitives_tests.jl") end +@testset "primitives" begin include("primitives_tests.jl") end -# @testset "p_range" begin include("p_range_tests.jl") end +@testset "p_range" begin include("p_range_tests.jl") end -# @testset "p_vector" begin include("p_vector_tests.jl") end +@testset "p_vector" begin include("p_vector_tests.jl") end -# @testset "p_sparse_matrix" begin include("p_sparse_matrix_tests.jl") end +@testset "p_sparse_matrix" begin include("p_sparse_matrix_tests.jl") end -# @testset "block_arrays" begin include("block_arrays_tests.jl") end +@testset "block_arrays" begin include("block_arrays_tests.jl") end -# @testset "gallery" begin include("gallery_tests.jl") end +@testset "gallery" begin include("gallery_tests.jl") end -# @testset "p_timer" begin include("p_timer_tests.jl") end +@testset "p_timer" begin include("p_timer_tests.jl") end -# @testset "fdm_example" begin include("fdm_example.jl") end +@testset "fdm_example" begin include("fdm_example.jl") end -# @testset "fem_example" begin include("fem_example.jl") end +@testset "fem_example" begin include("fem_example.jl") end @testset "spmtmm_tests" begin include("spmtmm_tests.jl") end diff --git a/test/debug_array/spmtmm_tests.jl b/test/debug_array/spmtmm_tests.jl index ba3cf431..5969b60f 100644 --- a/test/debug_array/spmtmm_tests.jl +++ b/test/debug_array/spmtmm_tests.jl @@ -7,12 +7,14 @@ include(joinpath("..","spmtmm_tests.jl")) v = 1:5 M = sparse(v,v,v) -@test nnz(M-M) == nnz(M) -display(M-M) +Z = subtract(M,M) +@test nnz(Z) == nnz(M) +display(Z) M = sparsecsr(v,v,v) -@test nnz(M-M) == nnz(M) -display(M-M) +Z = subtract(M,M) +@test nnz(Z) == nnz(M) +display(Z) with_debug(spmtmm_tests) diff 
--git a/test/spmtmm_tests.jl b/test/spmtmm_tests.jl index aacfdcad..bf80328a 100644 --- a/test/spmtmm_tests.jl +++ b/test/spmtmm_tests.jl @@ -51,8 +51,8 @@ function parallel_tests(pA,pB,sparse_func) hp_B = halfperm(B) @test Bt == hp_B - AB0 = A*B - C0 = transpose(B)*AB0 + AB0 = matmul(A,B) + C0 = matmul(transpose(B),AB0) # test basic sequential csr implementations to default csc sequential implementations. pAB,cacheAB = spmm(pA,pB,reuse=true) AB = centralize(sparse_func,pAB) @@ -96,8 +96,8 @@ function parallel_tests(pA,pB,sparse_func) # unequal sizes backward (small to large) if size(pA) != size(pB) - CB0 = C0*Bt - D0 = transpose(Bt)*CB0 + CB0 = matmul(C0,Bt) + D0 = matmul(transpose(Bt),CB0) pCB,cacheCB = spmm(pC,pBt,reuse=true) CB = centralize(sparse_func,pCB) @test approx_equivalent(CB,CB0) diff --git a/times.txt b/times.txt index 835d3320..e4fd4e27 100644 --- a/times.txt +++ b/times.txt @@ -1 +1 @@ -Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2139673, max = 0.2139673, avg = 0.2139673), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4185178, max = 0.4185178, avg = 0.4185178), "Phase 1" => (min = 1.0e-6, max = 1.0e-6, avg = 1.0e-6)) +Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2017, max = 0.2017, avg = 0.2017), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4044642, max = 0.4044642, avg = 0.4044642), "Phase 1" => (min = 2.0e-7, max = 2.0e-7, avg = 2.0e-7)) From 1794482bc23e1875fb1859fbfb1b154b32bf2b03 Mon Sep 17 00:00:00 2001 From: jop611 Date: Thu, 30 Jan 2025 13:39:53 +0100 Subject: [PATCH 33/34] fixed oversight in sarse matrix expansion in consistent --- src/p_sparse_matrix.jl | 134 ++++++++++++++--------------- src/sequential_implementations.jl | 135 ++++++++++++------------------ src/sparse_utils.jl | 33 ++++---- 3 files changed, 137 insertions(+), 165 deletions(-) diff --git 
a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index 196bdb90..29bef80a 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -1964,12 +1964,13 @@ function psparse_consistent_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D}, map_global_to_own!(J_rcv_own,cols_co) map_global_to_ghost!(J_rcv_ghost,cols_co) n_ghost_rows = ghost_length(rows_co) + n_own_rows = own_length(rows_co) n_own_cols = own_length(cols_co) n_ghost_cols = ghost_length(cols_co) TA = typeof(A.blocks.ghost_own) own_own = A.blocks.own_own - # New own_ghost shares as much memory with existing own_ghost block as possible. Extent depends on sparse format in use. - own_ghost = expand_sparse_matrix_columns(A.blocks.own_ghost,n_ghost_cols) + # New own_ghost shares index and value arrays with existing own_ghost block. Pointer arrays are newly allocated (in case of CSC and CSR). + own_ghost = expand_sparse_matrix(A.blocks.own_ghost,n_own_rows,n_ghost_cols) ghost_own = compresscoo(TA,I_rcv_own,J_rcv_own,V_rcv_own,n_ghost_rows,n_own_cols) ghost_ghost = compresscoo(TA,I_rcv_ghost,J_rcv_ghost,V_rcv_ghost,n_ghost_rows,n_ghost_cols) K_own = precompute_nzindex(ghost_own,I_rcv_own,J_rcv_own) @@ -2326,82 +2327,81 @@ function spmm!(C,A,B,state) end ### OLD ### -# function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) -# # TODO latency hiding -# @assert A.assembled -# @assert B.assembled -# col_partition = partition(axes(A,2)) -# C,cacheC = consistent(B,col_partition;reuse=true) |> fetch -# D_partition,cacheD = map((args...)->spmm(args...;reuse=true),partition(A),partition(C)) |> tuple_of_arrays -# assembled = true -# D = PSparseMatrix(D_partition,partition(axes(A,1)),partition(axes(C,2)),assembled) -# if val_parameter(reuse) -# cache = (C,cacheC,cacheD) -# return D,cache -# end -# D -# end - -# function spmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) -# (C,cacheC,cacheD)= cache -# consistent!(C,B,cacheC) |> wait -# map(spmm!,partition(D),partition(A),partition(C),cacheD) -# D 
-# end - -### NEW ### function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) + # TODO latency hiding @assert A.assembled @assert B.assembled - t = consistent(B,partition(axes(A,2)),reuse=true) - A_own_own = own_own_values(A) - A_own_ghost = own_ghost_values(A) + col_partition = partition(axes(A,2)) + C,cacheC = consistent(B,col_partition;reuse=true) |> fetch + D_partition,cacheD = map((args...)->spmm(args...;reuse=true),partition(A),partition(C)) |> tuple_of_arrays + assembled = true + D = PSparseMatrix(D_partition,partition(axes(A,1)),partition(axes(C,2)),assembled) + if val_parameter(reuse) + cache = (C,cacheC,cacheD) + return D,cache + end + D +end - C_own_own_1 = map(matmul,A_own_own,own_own_values(B)) - - # Wait for consistent - B2, cacheB2 = fetch(t) - C_own_ghost_1 = map(matmul,A_own_own,own_ghost_values(B2)) - C_own_own_2 = map(matmul,A_own_ghost,ghost_own_values(B2)) - C_own_ghost_2 = map(matmul,A_own_ghost,ghost_ghost_values(B2)) +function spmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) + (C,cacheC,cacheD)= cache + consistent!(C,B,cacheC) |> wait + map(spmm!,partition(D),partition(A),partition(C),cacheD) + D +end + +### NEW ### +# function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) +# @assert A.assembled +# @assert B.assembled +# t = consistent(B,partition(axes(A,2)),reuse=true) +# A_own_own = own_own_values(A) +# A_own_ghost = own_ghost_values(A) +# C_own_own_1 = map(matmul,A_own_own,own_own_values(B)) + +# # Wait for consistent +# B2, cacheB2 = fetch(t) +# C_own_ghost_1 = map(matmul,A_own_own,own_ghost_values(B2)) +# C_own_own_2 = map(matmul,A_own_ghost,ghost_own_values(B2)) +# C_own_ghost_2 = map(matmul,A_own_ghost,ghost_ghost_values(B2)) - C_own_own = map(add, C_own_own_1, C_own_own_2) - C_own_ghost = map(add, C_own_ghost_1, C_own_ghost_2) +# C_own_own = map(add, C_own_own_1, C_own_own_2) +# C_own_ghost = map(add, C_own_ghost_1, C_own_ghost_2) - Coo_cache = map(construct_spmm_cache, C_own_own) - Cog_cache = 
map(construct_spmm_cache, C_own_ghost) +# Coo_cache = map(construct_spmm_cache, C_own_own) +# Cog_cache = map(construct_spmm_cache, C_own_ghost) - C_values = map(C_own_own,C_own_ghost,partition(A),partition(B2)) do own_own,own_ghost,A_part,B_part - ghost_own = similar(own_own,0,size(own_own,2)) - ghost_ghost = similar(own_own,0,size(own_ghost,2)) - blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) - split_matrix(blocks,A_part.row_permutation,B_part.col_permutation) - end +# C_values = map(C_own_own,C_own_ghost,partition(A),partition(B2)) do own_own,own_ghost,A_part,B_part +# ghost_own = similar(own_own,0,size(own_own,2)) +# ghost_ghost = similar(own_own,0,size(own_ghost,2)) +# blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) +# split_matrix(blocks,A_part.row_permutation,B_part.col_permutation) +# end - C = PSparseMatrix(C_values,partition(axes(A,1)),partition(axes(B2,2)),true) - if val_parameter(reuse) - cache = (B2,cacheB2,(Coo_cache,Cog_cache)) - return C,cache - end - C -end +# C = PSparseMatrix(C_values,partition(axes(A,1)),partition(axes(B2,2)),true) +# if val_parameter(reuse) +# cache = (B2,cacheB2,(Coo_cache,Cog_cache)) +# return C,cache +# end +# C +# end -function spmm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) - (B2,cacheB2,(Coo_cache,Cog_cache)) = cache - t = consistent!(B2,B,cacheB2) - A_own_own = own_own_values(A) - A_own_ghost = own_ghost_values(A) - C_own_own = own_own_values(C) - C_own_ghost = own_ghost_values(C) +# function spmm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) +# (B2,cacheB2,(Coo_cache,Cog_cache)) = cache +# t = consistent!(B2,B,cacheB2) +# A_own_own = own_own_values(A) +# A_own_ghost = own_ghost_values(A) +# C_own_own = own_own_values(C) +# C_own_ghost = own_ghost_values(C) - map(matmul!, C_own_own, A_own_own, own_own_values(B),Coo_cache) - wait(t) - map(matmul!, C_own_ghost, A_own_own, own_ghost_values(B2),Cog_cache) +# 
map(matmul!,C_own_own,A_own_own,own_own_values(B),Coo_cache) +# wait(t) +# map(matmul!,C_own_ghost,A_own_own,own_ghost_values(B2),Cog_cache) - map((C,A,B,cache) -> matmul!(C,A,B,1,1,cache), C_own_own,A_own_ghost,ghost_own_values(B2),Coo_cache) - map((C,A,B,cache) -> matmul!(C,A,B,1,1,cache), C_own_ghost,A_own_ghost,ghost_ghost_values(B2),Cog_cache) - C -end +# map((C,A,B,cache)->matmul!(C,A,B,1,1,cache),C_own_own,A_own_ghost,ghost_own_values(B2),Coo_cache) +# map((C,A,B,cache)->matmul!(C,A,B,1,1,cache),C_own_ghost,A_own_ghost,ghost_ghost_values(B2),Cog_cache) +# C +# end ### End NEW ### function spmtm(A,B;reuse=Val(false)) diff --git a/src/sequential_implementations.jl b/src/sequential_implementations.jl index f69da7b1..ed952606 100644 --- a/src/sequential_implementations.jl +++ b/src/sequential_implementations.jl @@ -1,16 +1,5 @@ -function matmul(A::SparseMatrixCSC,B::SparseMatrixCSC) - A*B -end - -function matmul(A::Transpose{Tv,<:SparseMatrixCSC} where Tv,B::SparseMatrixCSC) - A*B -end - -function matmul(A::SparseMatrixCSC,B::Transpose{Tv,<:SparseMatrixCSC} where Tv) - A*B -end - -function matmul(A::Transpose{TvA,<:SparseMatrixCSC} where TvA,B::Transpose{TvB,<:SparseMatrixCSC} where TvB) +function matmul(A::Union{Transpose{TvA,<:SparseMatrixCSC},<:SparseMatrixCSC} where TvA, + B::Union{Transpose{TvB,<:SparseMatrixCSC},<:SparseMatrixCSC} where TvB) A*B end @@ -40,11 +29,6 @@ end function mul(A::SparseMatrixCSR,x::Number) mul(x,A) end - -# function quotient(A::SparseMatrixCSR{Bi,Tv,Ti},x::Number) where {Bi,Tv,Ti} -# SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(A.colval), map(a -> a/x, A.nzval)) -# end - # Alternative to lazy csr to csc for matrix addition that does not drop structural zeros. 
function add(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TvB,TiA,TiB} if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end @@ -293,22 +277,20 @@ function subtract(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} SparseMatrixCSC{Tv,Ti}(size(A)..., copy(A.colptr), copy(A.rowval), map(a->-a, A.nzval)) end - function matmul!(C::SparseMatrixCSC, - A::SparseMatrixCSC, - B::SparseMatrixCSC, - cache) + A::SparseMatrixCSC, + B::SparseMatrixCSC, + cache) matmul!(ascsr(C),ascsr(B),ascsr(A),cache) C end - function matmul!(C::SparseMatrixCSC, - A::SparseMatrixCSC, - B::SparseMatrixCSC, - α::Number, - β::Number, - cache) + A::SparseMatrixCSC, + B::SparseMatrixCSC, + α::Number, + β::Number, + cache) matmul!(ascsr(C),ascsr(B),ascsr(A),α,β,cache) C end @@ -361,10 +343,10 @@ function matmul!(C::SparseMatrixCSC, end function matmul!(C::SparseMatrixCSC{Tv,Ti}, - At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, - B::SparseMatrixCSC{Tv,Ti}, - α::Number, - β::Number) where {Tv,Ti} + At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, + B::SparseMatrixCSC{Tv,Ti}, + α::Number, + β::Number) where {Tv,Ti} a,b = size(C) p,q = size(At) r,s = size(B) @@ -408,16 +390,16 @@ function matmul!(C::SparseMatrixCSC{Tv,Ti}, end function matmul!(C::SparseMatrixCSC, - A::SparseMatrixCSC, - Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv) + A::SparseMatrixCSC, + Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv) matmul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A)) C end function matmul!(C::SparseMatrixCSR, - A::SparseMatrixCSR, - B::SparseMatrixCSR, - cache) + A::SparseMatrixCSR, + B::SparseMatrixCSR, + cache) a,b = size(C) p,q = size(A) r,s = size(B) @@ -465,19 +447,19 @@ function matmul!(C::SparseMatrixCSR, end function matmul!(C::SparseMatrixCSC, - A::SparseMatrixCSC, - Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv, - cache) + A::SparseMatrixCSC, + Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv, + cache) 
matmul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A),cache) C end function matmul!(C::SparseMatrixCSR, - A::SparseMatrixCSR, - B::SparseMatrixCSR, - α::Number, - β::Number, - cache) + A::SparseMatrixCSR, + B::SparseMatrixCSR, + α::Number, + β::Number, + cache) a,b = size(C) p,q = size(A) r,s = size(B) @@ -525,29 +507,29 @@ function matmul!(C::SparseMatrixCSR, end function matmul!(C::SparseMatrixCSC, - A::SparseMatrixCSC, - Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv, - α::Number, - β::Number, - cache) + A::SparseMatrixCSC, + Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv, + α::Number, + β::Number, + cache) matmul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A),α,β,cache) C end function matmul!(C::SparseMatrixCSC, - At::Transpose{Tv,<:SparseMatrixCSC} where Tv, - B::SparseMatrixCSC, - cache) + At::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC, + cache) matmul!(ascsr(C),ascsr(B),transpose(ascsr(At.parent))) C end function matmul!(C::SparseMatrixCSC, - At::Transpose{Tv,<:SparseMatrixCSC} where Tv, - B::SparseMatrixCSC, - α::Number, - β::Number, - cache) + At::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC, + α::Number, + β::Number, + cache) matmul!(ascsr(C),ascsr(A),transpose(ascsr(At.parent)),α,β) C end @@ -575,9 +557,9 @@ function construct_spmtm_cache(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} end function matmul!(C::SparseMatrixCSR, - At::Transpose{Tv,<:SparseMatrixCSR} where Tv, - B::SparseMatrixCSR, - cache) + At::Transpose{Tv,<:SparseMatrixCSR} where Tv, + B::SparseMatrixCSR, + cache) a,b = size(C) p,q = size(At) r,s = size(B) @@ -618,11 +600,11 @@ function matmul!(C::SparseMatrixCSR, end function matmul!(C::SparseMatrixCSR, - At::Transpose{Tv,<:SparseMatrixCSR} where Tv, - B::SparseMatrixCSR, - α::Number, - β::Number, - cache) + At::Transpose{Tv,<:SparseMatrixCSR} where Tv, + B::SparseMatrixCSR, + α::Number, + β::Number, + cache) a,b = size(C) p,q = size(At) r,s = size(B) @@ -663,17 +645,17 @@ function matmul!(C::SparseMatrixCSR, 
end function matmul!(C::SparseMatrixCSR, - A::SparseMatrixCSR, - Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv) + A::SparseMatrixCSR, + Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv) matmul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A)) C end function matmul!(C::SparseMatrixCSR, - A::SparseMatrixCSR, - Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv, - α::Number, - β::Number) + A::SparseMatrixCSR, + Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv, + α::Number, + β::Number) matmul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A), α, β) C end @@ -1169,15 +1151,6 @@ function rap(R::SparseMatrixCSR{Bi,TvR,TiR}, _rap(R,A,P) end -# Reuses internal arrays of A!!! -function construct_spmmm_cache(C::SparseMatrixCSR,A::SparseMatrixCSR) - cache = JaggedArray(colvals(A), A.rowptr) -end - -function construct_spmmm_cache(C::SparseMatrixCSC,A::SparseMatrixCSC) - cache = JaggedArray(rowvals(A), A.colptr) -end - function reduce_spmtmm_cache(cache,::Type{M} where M <: SparseMatrixCSR) (xb,x,JAP,_) = cache (xb,x,JAP) diff --git a/src/sparse_utils.jl b/src/sparse_utils.jl index ea12f3f9..12f6d248 100644 --- a/src/sparse_utils.jl +++ b/src/sparse_utils.jl @@ -725,39 +725,38 @@ function precompute_nzindex(C::AbstractSparseArray,A::AbstractSparseArray) K end -function expand_sparse_matrix_columns(A::SparseMatrixCSR{Bi,Tv,Ti} where {Tv, Ti}, n) where Bi - p,q = size(A) - @assert n >= q - SparseMatrixCSR{Bi}(p,n,A.rowptr,A.colval,A.nzval) +function expand_sparse_matrix(A::SparseMatrixCSR{Bi,Tv,Ti} where {Tv, Ti},m,n) where Bi + p = size(A,1) + new_rowptr = similar(A.rowptr,m+1) + map!(identity,new_rowptr,A.rowptr) + last_index = A.colptr[end] + for i in p+1:m+1 + new_colptr[i] = last_index + end + SparseMatrixCSR{Bi}(m,n,A.new_rowptr,A.colval,A.nzval) end -function expand_sparse_matrix_columns(A::SparseMatrixCSC{Tv,Ti}, n) where {Tv,Ti} - p,q = size(A) - @assert n >= q +function expand_sparse_matrix(A::SparseMatrixCSC{Tv,Ti},m,n) where {Tv,Ti} + q = size(A,2) new_colptr = similar(A.colptr,n+1) 
map!(identity,new_colptr,A.colptr) last_index = A.colptr[end] - foreach(q+1:n+1) do i - new_colptr[i] = last_index + for j in q+1:n+1 + new_colptr[j] = last_index end - SparseMatrixCSC{Tv,Ti}(p,n,new_colptr,A.rowval,A.nzval) + SparseMatrixCSC{Tv,Ti}(m,n,new_colptr,A.rowval,A.nzval) end -# Currently not implemented by the SparseMatricesCSR module +# Currently not implemented by the SparseMatricesCSR package function Base.similar(A::SparseMatrixCSR{Bi}, m::Integer, n::Integer) where Bi SparseMatrixCSR{1}(m, n, ones(eltype(A.rowptr), m+1), eltype(A.colval)[], eltype(A.nzval)[]) end -# Currently not implemented by the SparseMatricesCSR module +# Currently not implemented by SparseMatricesCSR function Base.similar(A::SparseMatrixCSR{Bi}) where Bi SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(colvals(A)), similar(nonzeros(A))) end -# This method is implemented also by SparseMatricesCSR, but related methods aren't. -# function Base.copy(A::SparseMatrixCSR{Bi}) where Bi -# SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(colvals(A)), copy(nonzeros(A))) -# end - # Currently not implemented by the SparseMatricesCSR module function Base.copy(At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}} where {Bi,Tv,Ti}) Acsc_T = copy(transpose(ascsc(At.parent))) # materialize SparseMatrixCSC transpose From 2378df50f580a6bf1cec3d47cb7731187021da74 Mon Sep 17 00:00:00 2001 From: jop611 Date: Fri, 31 Jan 2025 12:55:52 +0100 Subject: [PATCH 34/34] fixes to expand_sparse_matrix and inclusion of general case. 
--- src/p_sparse_matrix.jl | 128 +++++++++++++++---------------- src/sparse_utils.jl | 53 +++++++++---- test/debug_array/spmtmm_tests.jl | 22 ++++-- 3 files changed, 116 insertions(+), 87 deletions(-) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index 29bef80a..447d96b4 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -2327,81 +2327,81 @@ function spmm!(C,A,B,state) end ### OLD ### +# function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) +# # TODO latency hiding +# @assert A.assembled +# @assert B.assembled +# col_partition = partition(axes(A,2)) +# C,cacheC = consistent(B,col_partition;reuse=true) |> fetch +# D_partition,cacheD = map((args...)->spmm(args...;reuse=true),partition(A),partition(C)) |> tuple_of_arrays +# assembled = true +# D = PSparseMatrix(D_partition,partition(axes(A,1)),partition(axes(C,2)),assembled) +# if val_parameter(reuse) +# cache = (C,cacheC,cacheD) +# return D,cache +# end +# D +# end + +# function spmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) +# (C,cacheC,cacheD)= cache +# consistent!(C,B,cacheC) |> wait +# map(spmm!,partition(D),partition(A),partition(C),cacheD) +# D +# end + +### NEW ### function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) - # TODO latency hiding @assert A.assembled @assert B.assembled - col_partition = partition(axes(A,2)) - C,cacheC = consistent(B,col_partition;reuse=true) |> fetch - D_partition,cacheD = map((args...)->spmm(args...;reuse=true),partition(A),partition(C)) |> tuple_of_arrays - assembled = true - D = PSparseMatrix(D_partition,partition(axes(A,1)),partition(axes(C,2)),assembled) - if val_parameter(reuse) - cache = (C,cacheC,cacheD) - return D,cache - end - D -end - -function spmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) - (C,cacheC,cacheD)= cache - consistent!(C,B,cacheC) |> wait - map(spmm!,partition(D),partition(A),partition(C),cacheD) - D -end + t = consistent(B,partition(axes(A,2)),reuse=true) + A_own_own = 
own_own_values(A) + A_own_ghost = own_ghost_values(A) + C_own_own_1 = map(matmul,A_own_own,own_own_values(B)) -### NEW ### -# function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) -# @assert A.assembled -# @assert B.assembled -# t = consistent(B,partition(axes(A,2)),reuse=true) -# A_own_own = own_own_values(A) -# A_own_ghost = own_ghost_values(A) -# C_own_own_1 = map(matmul,A_own_own,own_own_values(B)) - -# # Wait for consistent -# B2, cacheB2 = fetch(t) -# C_own_ghost_1 = map(matmul,A_own_own,own_ghost_values(B2)) -# C_own_own_2 = map(matmul,A_own_ghost,ghost_own_values(B2)) -# C_own_ghost_2 = map(matmul,A_own_ghost,ghost_ghost_values(B2)) + # Wait for consistent + B2, cacheB2 = fetch(t) + C_own_ghost_1 = map(matmul,A_own_own,own_ghost_values(B2)) + C_own_own_2 = map(matmul,A_own_ghost,ghost_own_values(B2)) + C_own_ghost_2 = map(matmul,A_own_ghost,ghost_ghost_values(B2)) -# C_own_own = map(add, C_own_own_1, C_own_own_2) -# C_own_ghost = map(add, C_own_ghost_1, C_own_ghost_2) + C_own_own = map(add, C_own_own_1, C_own_own_2) + C_own_ghost = map(add, C_own_ghost_1, C_own_ghost_2) -# Coo_cache = map(construct_spmm_cache, C_own_own) -# Cog_cache = map(construct_spmm_cache, C_own_ghost) + Coo_cache = map(construct_spmm_cache, C_own_own) + Cog_cache = map(construct_spmm_cache, C_own_ghost) -# C_values = map(C_own_own,C_own_ghost,partition(A),partition(B2)) do own_own,own_ghost,A_part,B_part -# ghost_own = similar(own_own,0,size(own_own,2)) -# ghost_ghost = similar(own_own,0,size(own_ghost,2)) -# blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) -# split_matrix(blocks,A_part.row_permutation,B_part.col_permutation) -# end + C_values = map(C_own_own,C_own_ghost,partition(A),partition(B2)) do own_own,own_ghost,A_part,B_part + ghost_own = similar(own_own,0,size(own_own,2)) + ghost_ghost = similar(own_own,0,size(own_ghost,2)) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + 
split_matrix(blocks,A_part.row_permutation,B_part.col_permutation) + end -# C = PSparseMatrix(C_values,partition(axes(A,1)),partition(axes(B2,2)),true) -# if val_parameter(reuse) -# cache = (B2,cacheB2,(Coo_cache,Cog_cache)) -# return C,cache -# end -# C -# end + C = PSparseMatrix(C_values,partition(axes(A,1)),partition(axes(B2,2)),true) + if val_parameter(reuse) + cache = (B2,cacheB2,(Coo_cache,Cog_cache)) + return C,cache + end + C +end -# function spmm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) -# (B2,cacheB2,(Coo_cache,Cog_cache)) = cache -# t = consistent!(B2,B,cacheB2) -# A_own_own = own_own_values(A) -# A_own_ghost = own_ghost_values(A) -# C_own_own = own_own_values(C) -# C_own_ghost = own_ghost_values(C) +function spmm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) + (B2,cacheB2,(Coo_cache,Cog_cache)) = cache + t = consistent!(B2,B,cacheB2) + A_own_own = own_own_values(A) + A_own_ghost = own_ghost_values(A) + C_own_own = own_own_values(C) + C_own_ghost = own_ghost_values(C) -# map(matmul!,C_own_own,A_own_own,own_own_values(B),Coo_cache) -# wait(t) -# map(matmul!,C_own_ghost,A_own_own,own_ghost_values(B2),Cog_cache) + map(matmul!,C_own_own,A_own_own,own_own_values(B),Coo_cache) + wait(t) + map(matmul!,C_own_ghost,A_own_own,own_ghost_values(B2),Cog_cache) -# map((C,A,B,cache)->matmul!(C,A,B,1,1,cache),C_own_own,A_own_ghost,ghost_own_values(B2),Coo_cache) -# map((C,A,B,cache)->matmul!(C,A,B,1,1,cache),C_own_ghost,A_own_ghost,ghost_ghost_values(B2),Cog_cache) -# C -# end + map((C,A,B,cache)->matmul!(C,A,B,1,1,cache),C_own_own,A_own_ghost,ghost_own_values(B2),Coo_cache) + map((C,A,B,cache)->matmul!(C,A,B,1,1,cache),C_own_ghost,A_own_ghost,ghost_ghost_values(B2),Cog_cache) + C +end ### End NEW ### function spmtm(A,B;reuse=Val(false)) diff --git a/src/sparse_utils.jl b/src/sparse_utils.jl index 12f6d248..def57040 100644 --- a/src/sparse_utils.jl +++ b/src/sparse_utils.jl @@ -688,8 +688,8 @@ function 
spmv_csc!(b,x,colptr_A,rowval_A,nzval_A) b end -################ NEW ################ +################ NEW ################ # Variants for findnz() that only allocates memory for the conversion of the pointer array to an index array. # Only use for read-only operations. function findnz_minimal(A::SparseMatrixCSC) @@ -725,24 +725,45 @@ function precompute_nzindex(C::AbstractSparseArray,A::AbstractSparseArray) K end +# General matrix expansion to a larger size, allocates new matrix with new size. +function expand_sparse_matrix(A,m,n) + compresscoo(typeof(A),findnz(A)...,m,n) +end + +# Expand matrix to a larger size without changing non-zero entries. +# Might allocate a new pointer array, but shares index and value arrays with A. function expand_sparse_matrix(A::SparseMatrixCSR{Bi,Tv,Ti} where {Tv, Ti},m,n) where Bi - p = size(A,1) - new_rowptr = similar(A.rowptr,m+1) - map!(identity,new_rowptr,A.rowptr) - last_index = A.colptr[end] - for i in p+1:m+1 - new_colptr[i] = last_index + p,q = size(A) + @assert m >= p + @assert n >= q + if m > p + new_rowptr = similar(A.rowptr,m+1) + map!(identity,new_rowptr,A.rowptr) + last_index = A.rowptr[end] + for i in p+1:m+1 + new_rowptr[i] = last_index + end + else + new_rowptr = A.rowptr end - SparseMatrixCSR{Bi}(m,n,A.new_rowptr,A.colval,A.nzval) + SparseMatrixCSR{Bi}(m,n,new_rowptr,A.colval,A.nzval) end +# Expand matrix to a larger size without changing non-zero entries. +# Might allocate a new pointer array, but shares index and value arrays with A. 
function expand_sparse_matrix(A::SparseMatrixCSC{Tv,Ti},m,n) where {Tv,Ti} - q = size(A,2) - new_colptr = similar(A.colptr,n+1) - map!(identity,new_colptr,A.colptr) - last_index = A.colptr[end] - for j in q+1:n+1 - new_colptr[j] = last_index + p,q = size(A) + @assert m >= p + @assert n >= q + if n > q + new_colptr = similar(A.colptr,n+1) + map!(identity,new_colptr,A.colptr) + last_index = A.colptr[end] + for j in q+1:n+1 + new_colptr[j] = last_index + end + else + new_colptr = A.colptr end SparseMatrixCSC{Tv,Ti}(m,n,new_colptr,A.rowval,A.nzval) end @@ -971,6 +992,4 @@ end function symbolic_halfperm!(A::SparseMatrixCSC,At::JaggedArray) symbolic_halfperm!(ascsr(A),At) A -end - - +end \ No newline at end of file diff --git a/test/debug_array/spmtmm_tests.jl b/test/debug_array/spmtmm_tests.jl index 5969b60f..1b154b59 100644 --- a/test/debug_array/spmtmm_tests.jl +++ b/test/debug_array/spmtmm_tests.jl @@ -6,14 +6,24 @@ using Test include(joinpath("..","spmtmm_tests.jl")) v = 1:5 -M = sparse(v,v,v) -Z = subtract(M,M) -@test nnz(Z) == nnz(M) +A = sparse(v,v,v) +Z = subtract(A,A) +@test nnz(Z) == nnz(A) display(Z) -M = sparsecsr(v,v,v) -Z = subtract(M,M) -@test nnz(Z) == nnz(M) +B = sparse(v,v,-v) +Z = add(A,B) +@test nnz(Z) == nnz(A) +display(Z) + +A = sparsecsr(v,v,v) +Z = subtract(A,A) +@test nnz(Z) == nnz(A) +display(Z) + +B = sparsecsr(v,v,-v) +Z = add(A,B) +@test nnz(Z) == nnz(A) display(Z) with_debug(spmtmm_tests)